-
Notifications
You must be signed in to change notification settings - Fork 1
/
transform_diachronic.py
executable file
·89 lines (69 loc) · 3.41 KB
/
transform_diachronic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#! python3
# coding: utf-8
from helpers import *
from argparse import ArgumentParser
def parse_bool_flag(value):
    """Convert a command-line token into a real boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, and every
    non-empty string (including ``'False'`` and ``'0'``) is truthy, so such
    a flag can never be switched off explicitly. This parser recognises the
    usual spellings instead.

    :param value: raw token from the command line (or an actual bool).
    :return: ``True`` or ``False``.
    :raises ValueError: for an unrecognised token; argparse turns this into
        an "invalid value" usage error.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    # The empty string stays False, as it was with plain bool().
    if token in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise ValueError('Boolean value expected, got %r' % (value,))


def build_arg_parser():
    """Create the command-line parser for this script.

    Options: ``--modelfile`` and ``--reference`` (both required),
    ``--skip`` (boolean, default ``False``) and ``--lmbd`` (float,
    default ``0.0``).
    """
    parser = ArgumentParser()
    parser.add_argument('--modelfile', required=True, action='store')
    parser.add_argument('--reference', required=True, action='store')
    # Accepted for command-line compatibility; nothing in this script reads
    # it. ``--skip True`` keeps working, ``--skip False`` now really means
    # False, and a bare ``--skip`` is treated as True.
    parser.add_argument('--skip', action='store', type=parse_bool_flag,
                        nargs='?', const=True, default=False)
    parser.add_argument('--lmbd', action='store', type=float, default=0.0)
    return parser


def main():
    """Fit a projection on one year and evaluate it on the following year.

    The training year is read from the reference path, which must look like
    ``<dir>/<year>_<rest>``: the second '/'-separated component, up to its
    first underscore, is parsed as an integer. The model and reference files
    for the test year are located by replacing the year string in both paths.

    Progress messages go to stderr; a single tab-separated result line goes
    to stdout.
    """
    args = build_arg_parser().parse_args()

    modelfile = args.modelfile
    referencefile = args.reference
    currentyear = int(referencefile.split('/')[1].split('_')[0])
    print('Current training year:', currentyear, file=sys.stderr)

    model = load_embeddings(modelfile)
    df, _ = load_dataset(referencefile, embedding=model)
    # A set makes the "seen during training" test below O(1) per pair
    # instead of scanning a list for every test row.
    train_pairs = set(zip(df['Location'], df['Insurgent']))
    df['LocVec'] = df['Location'].apply(get_vector, emb=model)
    df['InsVec'] = df['Insurgent'].apply(get_vector, emb=model)
    print('Whole dataset shape:', df.shape, file=sys.stderr)

    transforms = learn_projection(df, model, lmbd=args.lmbd)
    print('Tranformation matrix created', file=sys.stderr)
    print('Testing on the next years', file=sys.stderr)

    next_year = currentyear + 1
    print('Now testing on year:', next_year, file=sys.stderr)
    testmodelfile = modelfile.replace(str(currentyear), str(next_year))
    testmodel = load_embeddings(testmodelfile)
    testfile = referencefile.replace(str(currentyear), str(next_year))
    df, test_unknown = load_dataset(testfile, embedding=testmodel, evaluation=True)
    size = df.shape[0]
    print('Whole test dataset shape:', df.shape, file=sys.stderr)
    if size == 0:
        # The ratios printed at the end divide by the number of test rows;
        # stop with a clear message instead of a ZeroDivisionError.
        sys.exit('Test dataset is empty, nothing to evaluate')

    # Scores over every test pair.
    accuracies1 = []
    accuracies5 = []
    accuracies10 = []
    for loc, ins in zip(df['Location'], df['Insurgent']):
        candidates = predict(loc, testmodel, transforms)
        accuracy1, accuracy5, accuracy10 = calc_accuracies(candidates, ins)
        accuracies1.append(accuracy1)
        accuracies5.append(accuracy5)
        accuracies10.append(accuracy10)

    # Scores restricted to pairs that were absent from the training data.
    accuracies1_new = []
    accuracies5_new = []
    accuracies10_new = []
    print('Predictions for the unknown pairs:', file=sys.stderr)
    for loc, ins in zip(df['Location'], df['Insurgent']):
        if (loc, ins) in train_pairs:
            # Was in the training set
            continue
        candidates = predict(loc, testmodel, transforms)
        print((loc, ins), candidates[:5], file=sys.stderr)
        accuracy1_new, accuracy5_new, accuracy10_new = calc_accuracies(candidates, ins)
        accuracies1_new.append(accuracy1_new)
        accuracies5_new.append(accuracy5_new)
        accuracies10_new.append(accuracy10_new)

    # Output columns: Year, Accuracy@1, Accuracy@5, Accuracy@10,
    # Accuracy@1_new, Accuracy@5_new, Accuracy@10_new, OOV, New pairs
    print(next_year, '\t', np.average(accuracies1), '\t', np.average(accuracies5), '\t',
          np.average(accuracies10), '\t', np.average(accuracies1_new), '\t',
          np.average(accuracies5_new), '\t', np.average(accuracies10_new), '\t',
          test_unknown / size, '\t', len(accuracies1_new) / len(accuracies1))


if __name__ == "__main__":
    main()