"""
Evaluate a given model on a given dataset.
"""
import click
from comet_ml import Experiment
from config import DEFAULT_OUT_DIR
from dataset import get_dataset_by_name, AVAIL_DATASETS, MLMA_RAW_DATASETS
import metrics
import pandas as pd
import logging
import numpy as np
import os
from tqdm import tqdm
from joblib import Parallel, delayed
import torch
import glob

logging.basicConfig(
    format="%(levelname)s:%(asctime)s:%(module)s:%(message)s", level=logging.INFO
)
logger = logging.getLogger(__name__)


@click.command()
@click.option("--dataset", type=click.Choice(AVAIL_DATASETS), required=True)
@click.option("--model_path", type=str)
@click.option(
    "--subgroups_path", type=str, help="Path to the subgroups file.", default=None
)
@click.option(
    "--n_jobs",
    type=int,
    help="Used to parallelize the evaluation of bias metrics",
    default=4,
)
@click.option("--cpu_only", is_flag=True)
@click.option("--no_bias_metrics", is_flag=True)
@click.option("--model_suffix", type=str)
@click.option("--out_folder", type=str, default=DEFAULT_OUT_DIR)
@click.option("--log_comet", is_flag=True)
@click.option("--ckpt_pattern", type=str, default=None)
@click.option("--src_tokenizer", type=str, default=None)
@click.option("--src_model", type=str, default=None)
def evaluate(
    dataset,
    model_path,
    subgroups_path,
    n_jobs,
    cpu_only,
    no_bias_metrics,
    model_suffix,
    out_folder,
    log_comet,
    ckpt_pattern,
    src_tokenizer,
    src_model,
):
    os.makedirs(out_folder, exist_ok=True)
    hparams = locals()

    if src_model:
        logger.info(f"Using model {src_model}")
        model_name = src_model.split("/")[1]
        model_path = src_model
    else:
        model_name = os.path.basename(model_path)

        # lm_ws is created by Kennedy's simulation but shouldn't be used
        if model_name.startswith("lm_ws"):
            logger.info(f"Skipping the model {model_name}...")
            return

    if model_suffix:
        model_name = f"{model_name}-{model_suffix}"
    hparams["model"] = model_name

    if log_comet:
        experiment = Experiment(
            api_key=os.environ["COMET_API_KEY"],
            project_name="unbias-text-classifiers",
            log_code=False,
            log_graph=False,
        )
        experiment.set_name(f"evaluate_{dataset}")
        experiment.log_parameters(hparams)
        experiment.add_tag("evaluation")

    logger.info(f"BEGIN evaluating {model_name} on {dataset}")

    # Get dataset splits. Discard train and dev
    _, _, test = get_dataset_by_name(dataset)
    if log_comet:
        experiment.log_other("test_size", len(test))

    y_true = test.get_labels()
    # Cache scores on disk: reuse them if the file already exists
    scores_file = os.path.join(out_folder, f"scores_{model_name}_{dataset}.pt")
    if os.path.exists(scores_file):
        logger.info(
            "Scores already exist. Loading them and continuing the evaluation..."
        )
        scores = torch.load(scores_file)
    else:
        scores = evaluate_bert(test, model_path, cpu_only, ckpt_pattern, src_tokenizer)
    # Compute classification metrics based on scores
    logger.info("Evaluating standard performance metrics...")
    perf, y_pred = metrics.evaluate_metrics(y_true=y_true, y_score=scores, th=0.5)
    if log_comet:
        experiment.log_metrics(perf)

    # Save scores and classification metrics locally and on Comet
    torch.save(scores, scores_file)
    pd.Series(perf).to_frame().to_csv(
        os.path.join(out_folder, f"class_metrics_{model_name}_{dataset}.csv")
    )
    if log_comet:
        experiment.log_asset(scores_file)
        experiment.log_metrics(perf)
        experiment.log_confusion_matrix(
            y_true=y_true, y_predicted=y_pred.astype(int).tolist()
        )

    # run the evaluation on MLMA
    if dataset in MLMA_RAW_DATASETS:
        logger.info("Processing MLMA per-target performance")
        mlma_results = compute_metrics_on_mlma(test, y_true, scores)
        mlma_df = pd.DataFrame(
            [r[3] for r in mlma_results], index=[r[0] for r in mlma_results]
        )
        mlma_df.to_csv(
            os.path.join(
                out_folder, f"class_metrics_by_target_{model_name}_{dataset}.csv"
            )
        )

    if no_bias_metrics:
        if log_comet:
            experiment.add_tag("no_bias_metrics")
        logger.info(f"END {model_name} (skipped bias metrics)")
        return
    # --- Evaluation of bias metrics ---
    # Read the subgroup terms and add a dummy column indicating each term's presence in the text
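    # The subgroups file is expected to be tab-separated, one subgroup per line,
    # with the subgroup term in the first column (any remaining columns are ignored).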
    with open(subgroups_path) as fp:
        subgroups = [line.strip().split("\t")[0] for line in fp.readlines()]
    logger.info(f"Found subgroups: {subgroups}")
    if log_comet:
        experiment.log_other("subgroups", subgroups)
        experiment.log_other("subgroups_count", len(subgroups))
    # this df is required by Jigsaw's code for bias metrics
    data_df = pd.DataFrame(
        {"text": test.get_texts(), "label": y_true, model_name: scores}
    )
    data_df = metrics.add_subgroup_columns_from_text(data_df, "text", subgroups)

    logger.info("Evaluating bias metrics (parallel)...")
    bias_records = Parallel(n_jobs=n_jobs)(
        delayed(metrics.compute_bias_metrics_for_subgroup_and_model)(
            dataset=data_df,
            subgroup=subg,
            model=model_name,
            label_col="label",
            include_asegs=True,
        )
        for subg in tqdm(subgroups)
    )

    bias_terms_file = os.path.join(
        out_folder, f"bias_terms_{model_name}_{dataset}.csv"
    )
    per_term_df = pd.DataFrame(bias_records)
    per_term_df.to_csv(bias_terms_file, index=False)
    if log_comet:
        experiment.log_table(bias_terms_file)

    # Average bias metrics across subgroups
    records_df = per_term_df.drop(columns=["test_size", "subgroup"])
    # TODO: ignore nans?
    # Compute the mean value of each bias metric across subgroups. We use:
    # 1. the power mean (Jigsaw's Kaggle competition), which gives more weight
    #    to subgroups where the metric is low;
    # 2. the arithmetic mean.
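    # Power mean with exponent p: M_p(x_1..x_n) = (1/n * sum_i x_i^p)^(1/p).
    # With p = -5 (below), subgroups with low metric values dominate the average;
    # p = 1 recovers the plain arithmetic mean.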
    power_mean_values = metrics.power_mean(records_df.values, -5, ignore_nans=True)
    mean_values = metrics.power_mean(records_df.values, 1, ignore_nans=True)
    power_mean_dict = {
        f"{name}_power_mean": v
        for name, v in zip(records_df.columns, power_mean_values)
    }
    mean_dict = {f"{name}_mean": v for name, v in zip(records_df.columns, mean_values)}
    # The final summary metric is the average of:
    # overall AUC, subgroup_auc, bpsn_auc, bnsp_auc
    summary_metric_pm = np.nanmean(
        np.array(
            [
                perf["AUC"],
                power_mean_dict["subgroup_auc_power_mean"],
                power_mean_dict["bpsn_auc_power_mean"],
                power_mean_dict["bnsp_auc_power_mean"],
            ]
        )
    )
    summary_metric = np.nanmean(
        np.array(
            [
                perf["AUC"],
                mean_dict["subgroup_auc_mean"],
                mean_dict["bpsn_auc_mean"],
                mean_dict["bnsp_auc_mean"],
            ]
        )
    )
    bias_metrics = {
        **power_mean_dict,
        **mean_dict,
        "summary_power_mean": summary_metric_pm,
        "summary_mean": summary_metric,
    }

    # Add False Positive and False Negative Equality Difference (Equality of Odds)
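    # FPED/FNED aggregate the absolute per-subgroup FPR/FNR gap columns
    # (metrics.FPR_GAP / metrics.FNR_GAP) as a sum, mean, and standard deviation;
    # lower values indicate more equal error rates across subgroups.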
bias_metrics["fped"] = per_term_df[metrics.FPR_GAP].abs().sum()
bias_metrics["fped_mean"] = per_term_df[metrics.FPR_GAP].abs().mean()
bias_metrics["fped_std"] = per_term_df[metrics.FPR_GAP].abs().std()
bias_metrics["fned"] = per_term_df[metrics.FNR_GAP].abs().sum()
bias_metrics["fned_mean"] = per_term_df[metrics.FNR_GAP].abs().mean()
bias_metrics["fned_std"] = per_term_df[metrics.FNR_GAP].abs().std()
if log_comet:
experiment.log_metrics(bias_metrics)
pd.Series(bias_metrics).to_frame().to_csv(
os.path.join(out_folder, f"bias_metrics_{model_name}_{dataset}.csv")
)
logger.info(f"END {model_name}")


def evaluate_bert(
    dataset,
    model_dir,
    cpu_only: bool,
    ckpt_pattern,
    src_tokenizer,
    batch_size=64,
    max_sequence_length=120,
):
    """Run evaluation on Kennedy's BERT."""
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from torch.utils.data import DataLoader

    device = "cuda:0" if (torch.cuda.is_available() and not cpu_only) else "cpu"
    logger.info(f"Device: {device}")

    if ckpt_pattern:
        from train_bert import LMForSequenceClassification

        ckpt_path = glob.glob(os.path.join(model_dir, f"*{ckpt_pattern}*"))[0]
        logger.info(f"Loading ckpt {ckpt_path}")
        model = LMForSequenceClassification.load_from_checkpoint(ckpt_path).to(device)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(
            device
        )
    model.eval()

    if src_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(src_tokenizer)
    else:
        logger.info(f"Src tokenizer not specified, using {model_dir}")
        tokenizer = AutoTokenizer.from_pretrained(model_dir)

    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    test_loader = DataLoader(dataset, batch_size=batch_size, num_workers=0)

    probs = list()
    with torch.no_grad():
        for batch in tqdm(test_loader):
            encodings = tokenizer(
                batch["text"],
                add_special_tokens=True,  # they use BERT's special tokens
                padding=True,
                truncation=True,
                max_length=max_sequence_length,
                return_tensors="pt",
            ).to(device)
            output = model(**encodings)
            batch_probs = output["logits"].softmax(-1)  # batch_size x 2
            probs.append(batch_probs)
    probs = torch.cat(probs, dim=0)

    # return probabilities for the positive label only
    return probs[:, 1].cpu()


def compute_metrics_on_mlma(mlma_data, y_true, scores):
    targets = mlma_data.data.target.unique()
    logger.info(f"Targets found {targets}")
    target_mask = pd.get_dummies(mlma_data.data["target"]).astype(bool)

    # y_true is a list, y_pred a np.array, scores a torch.tensor
    y_true = np.array(y_true)
    results = list()
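    # Each appended entry is a tuple (target, y_true subset, scores subset, metrics
    # dict); the caller in evaluate() reads elements 0 and 3 to build the
    # per-target CSV.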
    for target in targets:
        mask = target_mask[target].values
        perf, y_pred = metrics.evaluate_metrics(
            y_true=y_true[mask], y_score=scores[mask], th=0.5
        )
        perf["size"] = y_true[mask].size
        results.append((target, y_true[mask], scores[mask], perf))
    return results


if __name__ == "__main__":
    evaluate()