forked from lemonhu/NER-BERT-pytorch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
evaluate.py
129 lines (96 loc) · 4.67 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""Evaluate the model"""
import argparse
import random
import logging
import os
import numpy as np
import torch
from pytorch_pretrained_bert import BertForTokenClassification, BertConfig
from metrics import f1_score
from metrics import classification_report
from data_loader import DataLoader
import utils
# Command-line options for standalone evaluation.  Only `parser` is consumed
# elsewhere (parsed under `if __name__ == '__main__'`).
parser = argparse.ArgumentParser()
_CLI_OPTIONS = (
    ('--data_dir', dict(default='data/msra/', help="Directory containing the dataset")),
    ('--bert_model_dir', dict(default='bert-base-chinese-pytorch', help="Directory containing the BERT model in PyTorch")),
    ('--model_dir', dict(default='experiments/base_model', help="Directory containing params.json")),
    ('--seed', dict(type=int, default=23, help="random seed for initialization")),
    ('--restore_file', dict(default='best', help="name of the file in `model_dir` containing weights to load")),
    ('--multi_gpu', dict(default=False, action='store_true', help="Whether to use multiple GPUs if available")),
    ('--fp16', dict(default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit")),
)
for _flag, _kwargs in _CLI_OPTIONS:
    parser.add_argument(_flag, **_kwargs)
def evaluate(model, data_iterator, params, mark='Eval', verbose=False):
    """Evaluate the model on `params.eval_steps` batches.

    Args:
        model: a token-classification model (possibly a DataParallel wrapper);
            called once with `labels=` to obtain the loss and once without to
            obtain per-token logits.
        data_iterator: iterator yielding `(batch_data, batch_tags)` tensor pairs.
        params: configuration object; reads `idx2tag`, `eval_steps`, `n_gpu`,
            `multi_gpu`.
        mark: label used in the logged metrics line (e.g. 'Eval' or 'Test').
        verbose: if True, additionally log a per-tag classification report.

    Returns:
        dict with keys 'loss' (running average over batches) and 'f1'.
    """
    # set model to evaluation mode (disables dropout etc.)
    model.eval()

    idx2tag = params.idx2tag

    true_tags = []
    pred_tags = []

    # a running average object for loss
    loss_avg = utils.RunningAverage()

    # FIX: run the whole evaluation loop under no_grad so autograd graphs are
    # never built for the two forward passes per batch — saves memory and
    # time; the computed values are unchanged.
    with torch.no_grad():
        for _ in range(params.eval_steps):
            # fetch the next evaluation batch
            batch_data, batch_tags = next(data_iterator)

            # non-pad positions (token id 0 is the pad index used by the loader)
            batch_masks = batch_data.gt(0)

            loss = model(batch_data, token_type_ids=None, attention_mask=batch_masks, labels=batch_tags)
            if params.n_gpu > 1 and params.multi_gpu:
                loss = loss.mean()  # DataParallel returns one loss per GPU
            loss_avg.update(loss.item())

            batch_output = model(batch_data, token_type_ids=None, attention_mask=batch_masks)  # shape: (batch_size, max_len, num_labels)

            batch_output = batch_output.detach().cpu().numpy()
            batch_tags = batch_tags.to('cpu').numpy()

            # flatten (batch, seq) -> list of tag strings; padded positions are
            # included for both pred and true, so the two lists stay aligned
            pred_tags.extend([idx2tag.get(idx) for indices in np.argmax(batch_output, axis=2) for idx in indices])
            true_tags.extend([idx2tag.get(idx) for indices in batch_tags for idx in indices])
    assert len(pred_tags) == len(true_tags)

    # logging loss, f1 and report
    metrics = {}
    f1 = f1_score(true_tags, pred_tags)
    metrics['loss'] = loss_avg()
    metrics['f1'] = f1
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v) for k, v in metrics.items())
    logging.info("- {} metrics: ".format(mark) + metrics_str)

    if verbose:
        report = classification_report(true_tags, pred_tags)
        logging.info(report)
    return metrics
if __name__ == '__main__':
    args = parser.parse_args()

    # Load the hyperparameters from the experiment's params.json
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # Use GPUs if available
    params.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    params.n_gpu = torch.cuda.device_count()
    params.multi_gpu = args.multi_gpu

    # Set the random seed for reproducible experiments (before any data
    # loading or model construction that might consume random state)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if params.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)  # set random seed for all GPUs
    params.seed = args.seed

    # Set the logger to write evaluate.log inside the model directory
    utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Loading the dataset...")

    # Initialize the DataLoader; token id 0 is reserved for padding, which
    # evaluate() relies on when building the attention mask via gt(0)
    data_loader = DataLoader(args.data_dir, args.bert_model_dir, params, token_pad_idx=0)

    # Load data
    test_data = data_loader.load_data('test')

    # Specify the test set size.  NOTE(review): floor division drops the final
    # partial batch, so up to batch_size - 1 examples are never evaluated.
    params.test_size = test_data['size']
    params.eval_steps = params.test_size // params.batch_size
    test_data_iterator = data_loader.data_iterator(test_data, shuffle=False)
    logging.info("- done.")

    # Define the model from the BERT config shipped alongside the pretrained
    # weights; output size matches the tag vocabulary
    config_path = os.path.join(args.bert_model_dir, 'bert_config.json')
    config = BertConfig.from_json_file(config_path)
    model = BertForTokenClassification(config, num_labels=len(params.tag2idx))
    model.to(params.device)

    # Reload weights from the saved checkpoint (e.g. 'best.pth.tar')
    utils.load_checkpoint(os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)
    if args.fp16:
        model.half()  # half precision: smaller/faster on supporting GPUs
    if params.n_gpu > 1 and args.multi_gpu:
        model = torch.nn.DataParallel(model)

    logging.info("Starting evaluation...")
    test_metrics = evaluate(model, test_data_iterator, params, mark='Test', verbose=True)