model_evaluation.py
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from data_extractor import DataExtractor
from performance_visualizer import RocAucVisualizer
from datetime import date


class MetricStrategy:
    """Base class for the metric strategy pattern: subclasses implement compute_metric."""

    def compute_metric(self, y_true, y_pred):
        raise NotImplementedError


class Precision(MetricStrategy):
    def compute_metric(self, y_true, y_pred):
        return precision_score(y_true, y_pred, average='weighted')


class Recall(MetricStrategy):
    def compute_metric(self, y_true, y_pred):
        return recall_score(y_true, y_pred, average='weighted')


class F1Score(MetricStrategy):
    def compute_metric(self, y_true, y_pred):
        return f1_score(y_true, y_pred, average='weighted')


class Accuracy(MetricStrategy):
    def compute_metric(self, y_true, y_pred):
        return accuracy_score(y_true, y_pred)


class TopKAccuracy(MetricStrategy):
    def __init__(self, k):
        self.k = k

    def compute_metric(self, y_true, y_pred_proba):
        # Column indices of the k highest-probability classes per sample
        top_k_preds = np.argsort(y_pred_proba, axis=1)[:, -self.k:]
        # Encode y_true into class indices so it can be compared against the
        # column indices above. Note: LabelEncoder sorts the labels it sees,
        # so this matches the classifier's column order only if every class
        # appears in y_true (true here for stratified folds with enough
        # samples per class).
        le = LabelEncoder()
        y_true_encoded = le.fit_transform(y_true)
        # A sample counts as correct if its true class is among its top k
        mask = np.any(top_k_preds == y_true_encoded[:, None], axis=1)
        return np.mean(mask)
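
# A worked example of the top-k mask above (values are illustrative, not from
# the pipeline): with k=2 and
#   y_pred_proba = [[0.1, 0.7, 0.2],
#                   [0.5, 0.3, 0.2]]
# np.argsort(y_pred_proba, axis=1)[:, -2:] gives [[2, 1], [1, 0]], the two
# most probable class indices per row. For encoded y_true = [1, 2], row 0
# hits (1 is in [2, 1]) and row 1 misses (2 is not in [1, 0]), so the
# metric is 0.5.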


class AUC(MetricStrategy):
    def compute_metric(self, y_true, y_pred_proba):
        # One-hot encode the labels; roc_auc_score then scores each class
        # one-vs-rest. Assumes three or more classes: with exactly two,
        # LabelBinarizer returns a single column, which would not match the
        # (n_samples, 2) probability matrix.
        lb = LabelBinarizer()
        lb.fit(y_true)
        binarized_labels = lb.transform(y_true)
        return roc_auc_score(binarized_labels, y_pred_proba, multi_class='ovr')
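
# How LabelBinarizer behaves in the method above, on toy labels (illustrative
# only): lb.fit(['a', 'b', 'c']); lb.transform(['a', 'c']) yields
#   [[1, 0, 0],
#    [0, 0, 1]]
# one row per sample, one column per class, aligned with y_pred_proba.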


class DistanceCalculator:
    def __init__(self, model_name, validation_type, distance_metrics):
        self.model_name = model_name
        self.validation_type = validation_type
        self.distance_metrics = distance_metrics
        self.de = DataExtractor()
        self.data = self.de.data
        self.metric_strategies = {
            'Precision': Precision(),
            'Recall': Recall(),
            'F1-Score': F1Score(),
            'Accuracy': Accuracy(),
            'Top-3 Accuracy': TopKAccuracy(k=3),
            'AUC': AUC()
        }
        self.performance_metrics = pd.DataFrame(columns=['Metric', 'Cosine Distance', 'Euclidean Distance'])

    def _prepare_data(self, data):
        embeddings = []
        labels = []
        for syndrome_id, syndrome_value in data.items():
            for subject_id, subject_value in syndrome_value.items():
                for image_id, image_embedding in subject_value.items():
                    # Append the full 320-dimensional embedding as one list element
                    embeddings.append(np.array(image_embedding))
                    # Use the syndrome_id as the label
                    labels.append(syndrome_id)
        return embeddings, labels
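
    # The nested mapping _prepare_data expects (keys are hypothetical, shown
    # only to illustrate the shape; real values come from DataExtractor):
    #   {'syndrome_1': {'subject_a': {'image_001': [0.12, -0.03, ...]},  # 320 floats
    #                   'subject_b': {'image_002': [...]}},
    #    'syndrome_2': {...}}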

    def perform_cross_validation(self):
        skf = StratifiedKFold(n_splits=10)
        data, labels = self._prepare_data(self.data)
        data = np.array(data)
        labels = np.array(labels)
        for fold_num, (train_index, test_index) in enumerate(skf.split(data, labels), start=1):
            train_data, test_data = data[train_index], data[test_index]
            train_labels, test_labels = labels[train_index], labels[test_index]
            self.calculate_distance(train_data, test_data)
            self.classify(train_data, test_data, train_labels, test_labels, fold_num)

    def calculate_distance(self, train_data, test_data):
        # Pairwise distance matrices of shape (n_train, n_test); currently
        # only printed for inspection, not consumed by the classifiers.
        cosine_distance = cosine_distances(train_data, test_data)
        euclidean_distance = euclidean_distances(train_data, test_data)
        print(f"Cosine Distance: {cosine_distance}")
        print(f"Euclidean Distance: {euclidean_distance}")

    def compute_performance_metrics(self, y_true, y_pred_cosine, y_pred_euclidean, y_pred_cosine_proba, y_pred_euclidean_proba, fold_num):
        for metric_name, strategy in self.metric_strategies.items():
            # Label-based metrics consume hard predictions; Top-3 Accuracy
            # and AUC consume the predicted probabilities instead.
            if metric_name in ['Precision', 'Recall', 'F1-Score', 'Accuracy']:
                result_cosine = strategy.compute_metric(y_true, y_pred_cosine)
                result_euclidean = strategy.compute_metric(y_true, y_pred_euclidean)
            else:
                result_cosine = strategy.compute_metric(y_true, y_pred_cosine_proba)
                result_euclidean = strategy.compute_metric(y_true, y_pred_euclidean_proba)
            # Round the float results to four decimal places
            d_ = {'Metric': metric_name, 'Cosine Distance': result_cosine, 'Euclidean Distance': result_euclidean}
            rounded_values = {k: round(v, 4) if isinstance(v, (float, np.floating)) else v for k, v in d_.items()}
            # Append the rounded values to the metrics DataFrame as a new row
            new_row = pd.DataFrame([rounded_values])
            self.performance_metrics = pd.concat([self.performance_metrics, new_row], ignore_index=True)
        # Save per-fold ROC AUC plots for both distance metrics
        n_classes = len(np.unique(y_true))
        roc_visualizer_cosine = RocAucVisualizer(y_true, y_pred_cosine_proba, n_classes)
        print("ROC AUC for Cosine Distance:")
        roc_visualizer_cosine.save(f"./data/cosine/fold_{fold_num}")
        roc_visualizer_euclidean = RocAucVisualizer(y_true, y_pred_euclidean_proba, n_classes)
        print("ROC AUC for Euclidean Distance:")
        roc_visualizer_euclidean.save(f"./data/euclidean/fold_{fold_num}")
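
    # After each fold this method appends one row per metric, e.g. (values
    # purely illustrative):
    #       Metric  Cosine Distance  Euclidean Distance
    #    Precision           0.8123              0.7991
    #          AUC           0.9012              0.8876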

    def classify(self, train_data, test_data, train_labels, test_labels, fold_num):
        # Define one k-NN classifier per distance metric
        knn_cosine = KNeighborsClassifier(n_neighbors=3, metric='cosine')
        knn_euclidean = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
        # Fit the models
        knn_cosine.fit(train_data, train_labels)
        knn_euclidean.fit(train_data, train_labels)
        # Predict the test set labels
        prediction_cosine = knn_cosine.predict(test_data)
        prediction_euclidean = knn_euclidean.predict(test_data)
        # Predict the test set class probabilities
        prediction_cosine_proba = knn_cosine.predict_proba(test_data)
        prediction_euclidean_proba = knn_euclidean.predict_proba(test_data)
        # Compute performance metrics and append them to the DataFrame
        self.compute_performance_metrics(test_labels, prediction_cosine, prediction_euclidean, prediction_cosine_proba, prediction_euclidean_proba, fold_num)
        # Export the accumulated metrics and their per-metric averages as CSV and TXT
        for file_type in ['csv', 'txt']:
            self.performance_metrics.to_csv(f'./data/{self.generate_filename(file_type)}')
            self.performance_metrics.groupby('Metric').mean().round(4).to_csv(f'./data/Average_{self.generate_filename(file_type)}')

    def generate_filename(self, file_type=None):
        # Current date formatted as 'yyyymmdd'
        current_date = date.today().strftime('%Y%m%d')
        filename = f'{self.model_name}_Metrics_{self.validation_type}_{self.distance_metrics}_{current_date}.{file_type}'
        return filename
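
    # Example output (date shown is illustrative):
    #   'KNN_Metrics_StratifiedKFold_CosineEuclidean_20240101.csv'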


if __name__ == '__main__':
    model_name = 'KNN'
    validation_type = 'StratifiedKFold'
    distance_metrics = 'CosineEuclidean'
    dc = DistanceCalculator(model_name, validation_type, distance_metrics)
    dc.perform_cross_validation()