# -*- coding: utf-8 -*-
import codecs
import pickle
from collections import Counter
import matplotlib.pyplot as plt
import spacy
import numpy as np
import sqlite3
from geopy.distance import great_circle
from matplotlib import pyplot, colors
# -------- GLOBAL CONSTANTS AND VARIABLES -------- #
BATCH_SIZE = 64
CONTEXT_LENGTH = 200 # each side of target entity
UNKNOWN = u"<unknown>"
EMBEDDING_DIMENSION = 50
TARGET_LENGTH = 15
ENCODING_MAP_1x1 = pickle.load(open(u"data/1x1_encode_map.pkl",'rb')) # We need these maps
ENCODING_MAP_2x2 = pickle.load(open(u"data/2x2_encode_map.pkl",'rb')) # and the reverse ones
REVERSE_MAP_1x1 = pickle.load(open(u"data/1x1_reverse_map.pkl",'rb')) # to handle the used and
REVERSE_MAP_2x2 = pickle.load(open(u"data/2x2_reverse_map.pkl",'rb')) # unused map_vector polygons.
OUTLIERS_MAP_1x1 = pickle.load(open(u"data/1x1_outliers_map.pkl",'rb')) # Outliers are redundant polygons that
OUTLIERS_MAP_2x2 = pickle.load(open(u"data/2x2_outliers_map.pkl",'rb')) # have been removed but must also be handled.
# -------- GLOBAL CONSTANTS AND VARIABLES -------- #
def print_stats(accuracy):
"""
Prints Mean, Median, AUC and acc@161km for the list.
:param accuracy: a list of geocoding errors
"""
print("==============================================================================================")
print(u"Median error:", np.median(sorted(accuracy)))
print(u"Mean error:", np.mean(accuracy))
accuracy = np.log(np.array(accuracy) + 1)
k = np.log(161)
print (u"Accuracy to 161 km: ", sum([1.0 for dist in accuracy if dist < k]) / len(accuracy))
print (u"AUC = ", np.trapz(accuracy) / (np.log(20039) * (len(accuracy) - 1))) # Trapezoidal rule.
print("==============================================================================================")
def pad_list(size, a_list, from_left, padding):
"""
Utility function that pads a list with any given padding.
:param size: the final length of the list i.e. pad up to size
:param a_list: the list to pad
:param from_left: True to pad from the left, False to pad from the right
:param padding: whatever you want to use for padding, example "0"
:return: the padded list
"""
while len(a_list) < size:
if from_left:
a_list = [padding] + a_list
else:
a_list += [padding]
return a_list
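# Quick examples of the padding behaviour:
# pad_list(5, [1, 2, 3], True, 0)   ->  [0, 0, 1, 2, 3]
# pad_list(5, [1, 2, 3], False, 0)  ->  [1, 2, 3, 0, 0]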
def coord_to_index(coordinates, polygon_size):
"""
Convert coordinates into an array (world representation) index. Use that to modify map_vector polygon value.
:param coordinates: (latitude, longitude) to convert to the map vector index
:param polygon_size: integer size of each polygon in degrees i.e. the resolution of the world
:return: index pointing into map_vector array
"""
latitude = float(coordinates[0]) - 90 if float(coordinates[0]) != -90 else -179.99 # The two edge cases must
longitude = float(coordinates[1]) + 180 if float(coordinates[1]) != 180 else 359.99 # get handled differently!
if longitude < 0:
longitude = -longitude
if latitude < 0:
latitude = -latitude
x = int(360 / polygon_size) * int(latitude / polygon_size)
y = int(longitude / polygon_size)
index = x + y
if not 0 <= index <= int(360 / polygon_size) * int(180 / polygon_size):
    raise Exception(u"Shock horror!!")  # the coordinates fall outside the world grid
return index
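# Worked example for a 2x2 degree grid: coord_to_index((51.5, -0.1), 2)
#   latitude  -> 51.5 - 90 = -38.5, sign flipped -> 38.5
#   longitude -> -0.1 + 180 = 179.9
#   index = 180 * int(38.5 / 2) + int(179.9 / 2) = 3420 + 89 = 3509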
def index_to_coord(index, polygon_size):
"""
Convert index (output of the prediction model) back to coordinates.
:param index: of the polygon/tile in map_vector array (given by model prediction)
:param polygon_size: size of each polygon/tile i.e. resolution of the world
:return: pair of (latitude, longitude)
"""
x = int(index / (360 / polygon_size))
y = index % int(360 / polygon_size)
if x > int(90 / polygon_size):
x = -int((x - (90 / polygon_size)) * polygon_size)
else:
x = int(((90 / polygon_size) - x) * polygon_size)
if y < int(180 / polygon_size):
y = -int(((180 / polygon_size) - y) * polygon_size)
else:
y = int((y - (180 / polygon_size)) * polygon_size)
return x, y
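# Worked example (reversing the one above): index_to_coord(3509, 2) returns (52, -2),
# i.e. a corner of the 2x2 degree tile containing (51.5, -0.1); the round trip is lossy by design.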
def get_coordinates(con, loc_name):
"""
Access the database to retrieve coordinates and other metadata for a place name.
:param con: sqlite3 database cursor i.e. DB connection
:param loc_name: name of the place
:return: a list of tuples [(latitude, longitude, population, feature_code), ...]
"""
result = con.execute(u"SELECT METADATA FROM GEO WHERE NAME = ?", (loc_name.lower(),)).fetchone()
if result:
result = eval(result[0]) # Do not remove the sorting, the function below assumes sorted results!
return sorted(result, key=lambda a: a[2], reverse=True)
else:
return []
def construct_map_vector(a_list, polygon_size, mapping, outliers):
"""
Build the map_vector vector representation from a_list of location data.
:param a_list: of tuples [(latitude, longitude, population, feature_code), ...]
:param polygon_size: the resolution i.e. the size of each polygon in degrees.
:param mapping: one of the transformation maps 1x1 or 2x2
:param outliers: the outlier map, 1x1 or 2x2 (must match resolution or mapping above)
:return: map_vector representation
"""
map_vector = np.zeros(len(mapping), )
if len(a_list) == 0:
return map_vector
max_pop = a_list[0][2] if a_list[0][2] > 0 else 1
for s in a_list:
index = coord_to_index((s[0], s[1]), polygon_size)
if index in mapping:
index = mapping[index]
else:
index = mapping[outliers[index]]
map_vector[index] += float(max(s[2], 1)) / max_pop
return map_vector / map_vector.max() if map_vector.max() > 0.0 else map_vector
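# Usage sketch, assuming geonames.db and the pickled 1x1 maps are available:
# c = sqlite3.connect(u'../data/geonames.db').cursor()
# vec = construct_map_vector(get_coordinates(c, u"melbourne"), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1)
# vec has length len(ENCODING_MAP_1x1) and holds population-weighted, max-normalised tile activations.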
def construct_map_vector_full_scale(a_list, polygon_size):
"""
This function is similar to the above BUT it builds map_vector WITHOUT removing redundant polygons.
:param a_list: of tuples [(latitude, longitude, population, feature_code), ...]
:param polygon_size: size of each polygon in degrees i.e. 1x1 or 2x2
:return: map_vector (full scale) i.e. without removing redundant polygons, used for visualisation in 2D
"""
map_vector = np.zeros(int(360 / polygon_size) * int(180 / polygon_size))
if len(a_list) == 0:
return map_vector
max_pop = a_list[0][2] if a_list[0][2] > 0 else 1
for s in a_list:
index = coord_to_index((s[0], s[1]), polygon_size)
map_vector[index] += float(max(s[2], 1)) / max_pop
return map_vector / map_vector.max() if map_vector.max() > 0.0 else map_vector
def merge_lists(lists):
"""
Utility function to merge multiple lists.
:param lists: a list of lists to be merged
:return: one single list with all items from above list of lists
"""
out = []
for l in lists:
out.extend(l)
return out
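# Example: merge_lists([[1, 2], [3], []]) returns [1, 2, 3].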
def populate_sql():
"""
Create and populate the sqlite3 database with GeoNames data. Requires Geonames dump.
No need to run this function, I share the database as a separate dump on GitHub (see link).
"""
geo_names = {}
p_map = {"PPLC": 100000, "PCLI": 100000, "PCL": 100000, "PCLS": 10000, "PCLF": 10000, "CONT": 100000, "RGN": 100000}
for line in codecs.open(u"../data/allCountries.txt", u"r", encoding=u"utf-8"):
line = line.split("\t")
feat_code = line[7]
class_code = line[6]
pop = int(line[14])
for name in [line[1], line[2]] + line[3].split(","):
name = name.lower()
if len(name) != 0:
if name in geo_names:
already_have_entry = False
for item in geo_names[name]:
if great_circle((float(line[4]), float(line[5])), (item[0], item[1])).km < 100:
if item[2] >= pop:
already_have_entry = True
if not already_have_entry:
pop = get_population(class_code, feat_code, p_map, pop)
geo_names[name].add((float(line[4]), float(line[5]), pop, feat_code))
else:
pop = get_population(class_code, feat_code, p_map, pop)
geo_names[name] = {(float(line[4]), float(line[5]), pop, feat_code)}
conn = sqlite3.connect(u'../data/geonames.db')
c = conn.cursor()
# c.execute("CREATE TABLE GEO (NAME VARCHAR(100) PRIMARY KEY NOT NULL, METADATA VARCHAR(5000) NOT NULL);")
c.execute(u"DELETE FROM GEO") # alternatively, delete the database file.
conn.commit()
for gn in geo_names:
c.execute(u"INSERT INTO GEO VALUES (?, ?)", (gn, str(list(geo_names[gn]))))
print(u"Entries saved:", len(geo_names))
conn.commit()
conn.close()
def get_population(class_code, feat_code, p_map, pop):
"""
Utility function to eliminate code duplication. Nothing of much interest, methinks.
:param class_code: Geonames code for the class of location
:param feat_code: Geonames code for the feature type of a database entry
:param p_map: dictionary mapping feature codes to estimated population
:param pop: population count
:return: population (modified if the class code is one of A, P or L)
"""
if pop == 0 and class_code in ["A", "P", "L"]:
pop = p_map.get(feat_code, 0)
return pop
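# Example: with the p_map defined in populate_sql(), get_population("P", "PPLC", p_map, 0)
# returns the estimated 100000, while any non-zero population is returned unchanged.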
def generate_training_data():
"""
Prepare Wikipedia training data. Please download the required files from GitHub.
Files: geonames.db and geowiki.txt both inside the data folder (see README)
Alternatively, create your own with http://medialab.di.unipi.it/wiki/Wikipedia_Extractor
"""
conn = sqlite3.connect(u'../data/geonames.db')
c = conn.cursor()
nlp = spacy.load(u'en') # or spacy.load(u'en_core_web_lg') depending on your Spacy Download (simple, full)
padding = nlp(u"0")[0]
inp = codecs.open(u"../data/geowiki.txt", u"r", encoding=u"utf-8")
o = codecs.open(u"../data/train_wiki.txt", u"w", encoding=u"utf-8")
lat, lon = u"", u""
target, string = u"", u""
skipped = 0
for line in inp:
if len(line.strip()) == 0:
continue
limit = 0
if line.startswith(u"NEW ARTICLE::"):
if len(string.strip()) > 0 and len(target) != 0:
locations_near, locations_far = [], []
doc = nlp(string)
for d in doc:
if d.text == target[0]:
if u" ".join(target) == u" ".join([t.text for t in doc[d.i:d.i + len(target)]]):
near_inp = pad_list(CONTEXT_LENGTH // 2, [x for x in doc[max(0, d.i - CONTEXT_LENGTH // 2):d.i]], True, padding) \
+ pad_list(CONTEXT_LENGTH // 2, [x for x in doc[d.i + len(target): d.i + len(target) + CONTEXT_LENGTH // 2]], False, padding)
far_inp = pad_list(CONTEXT_LENGTH // 2, [x for x in doc[max(0, d.i - CONTEXT_LENGTH):max(0, d.i - CONTEXT_LENGTH // 2)]], True, padding) \
+ pad_list(CONTEXT_LENGTH // 2, [x for x in doc[d.i + len(target) + CONTEXT_LENGTH // 2: d.i + len(target) + CONTEXT_LENGTH]], False, padding)
near_out, far_out = [], []
location = u""
for (out_list, in_list, is_near) in [(near_out, near_inp, True), (far_out, far_inp, False)]:
for index, item in enumerate(in_list):
if item.ent_type_ in [u"GPE", u"FACILITY", u"LOC", u"FAC", u"LOCATION"]:
if item.ent_iob_ == u"B" and item.text.lower() == u"the":
out_list.append(item.text.lower())
else:
location += item.text + u" "
out_list.append(u"**LOC**" + item.text.lower())
elif item.ent_type_ in [u"PERSON", u"DATE", u"TIME", u"PERCENT", u"MONEY",
u"QUANTITY", u"CARDINAL", u"ORDINAL"]:
out_list.append(u'0')
elif item.is_punct:
out_list.append(u'0')
elif item.is_digit or item.like_num:
out_list.append(u'0')
elif item.like_email:
out_list.append(u'0')
elif item.like_url:
out_list.append(u'0')
elif item.is_stop:
out_list.append(u'0')
else:
out_list.append(item.lemma_)
if location.strip() != u"" and (item.ent_type == 0 or index == len(in_list) - 1):
location = location.strip()
coords = get_coordinates(c, location)
if len(coords) > 0 and location != u" ".join(target):
if is_near:
locations_near.append(coords)
else:
locations_far.append(coords)
else:
offset = 1 if index == len(in_list) - 1 else 0
for i in range(index - len(location.split()), index):
out_list[i + offset] = in_list[i + offset].lemma_ \
if in_list[i + offset].is_alpha and location != u" ".join(target) else u'0'
location = u""
target_grid = get_coordinates(c, u" ".join(target))
if len(target_grid) == 0:
skipped += 1
break
entities_near = merge_lists(locations_near)
entities_far = merge_lists(locations_far)
locations_near, locations_far = [], []
o.write(lat + u"\t" + lon + u"\t" + str(near_out) + u"\t" + str(far_out) + u"\t")
o.write(str(target_grid) + u"\t" + str([t.lower() for t in target][:TARGET_LENGTH]))
o.write(u"\t" + str(entities_near) + u"\t" + str(entities_far) + u"\n")
limit += 1
if limit > 29:
break
line = line.strip().split("\t")
if u"(" in line[1]:
line[1] = line[1].split(u"(")[0].strip()
if line[1].strip().startswith(u"Geography of "):
target = line[1].replace(u"Geography of ", u"").split()
elif u"," in line[1]:
target = line[1].split(u",")[0].strip().split()
else:
target = line[1].split()
lat = line[2]
lon = line[3]
string = ""
print(u"Processed", limit, u"Skipped:", skipped, u"Name:", u" ".join(target))
else:
string += line
o.close()
def generate_evaluation_data(corpus, file_name):
"""
Create evaluation data from text files. See README for formatting and download instructions.
:param corpus: name of the dataset such as LGL, GEOVIRUS or WIKTOR
:param file_name: an affix, in case you're creating several versions of the same dataset
"""
conn = sqlite3.connect(u'../data/geonames.db')
c = conn.cursor()
nlp = spacy.load(u'en') # or spacy.load(u'en_core_web_lg'), it depends on your choice of model
padding = nlp(u"0")[0]
directory = u"../data/" + corpus + u"/"
o = codecs.open(u"data/eval_" + corpus + file_name + u".txt", u"w", encoding=u"utf-8")
line_no = 0 if corpus == u"lgl" else -1
for line in codecs.open(u"data/" + corpus + file_name + u".txt", u"r", encoding=u"utf-8"):
line_no += 1
if len(line.strip()) == 0:
continue
for toponym in line.split(u"||")[:-1]:
captured = False
doc = nlp(codecs.open(directory + str(line_no), u"r", encoding=u"utf-8").read())
locations_near, locations_far = [], []
toponym = toponym.split(u",,")
target = [t.text for t in nlp(toponym[1])]
ent_length = len(u" ".join(target))
lat, lon = toponym[2], toponym[3]
start, end = int(toponym[4]), int(toponym[5])
for d in doc:
if d.text == target[0]:
if u" ".join(target) == u" ".join([t.text for t in doc[d.i:d.i + len(target)]]):
if abs(d.idx - start) > 4 or abs(d.idx + ent_length - end) > 4:
continue
captured = True
near_inp = pad_list(CONTEXT_LENGTH // 2, [x for x in doc[max(0, d.i - CONTEXT_LENGTH // 2):d.i]], True, padding) \
+ pad_list(CONTEXT_LENGTH // 2, [x for x in doc[d.i + len(target): d.i + len(target) + CONTEXT_LENGTH // 2]], False, padding)
far_inp = pad_list(CONTEXT_LENGTH // 2, [x for x in doc[max(0, d.i - CONTEXT_LENGTH):max(0, d.i - CONTEXT_LENGTH // 2)]], True, padding) \
+ pad_list(CONTEXT_LENGTH // 2, [x for x in doc[d.i + len(target) + CONTEXT_LENGTH // 2: d.i + len(target) + CONTEXT_LENGTH]], False, padding)
near_out, far_out = [], []
location = u""
for (out_list, in_list, is_near) in [(near_out, near_inp, True), (far_out, far_inp, False)]:
for index, item in enumerate(in_list):
if item.ent_type_ in [u"GPE", u"FACILITY", u"LOC", u"FAC", u"LOCATION"]:
if item.ent_iob_ == u"B" and item.text.lower() == u"the":
out_list.append(item.text.lower())
else:
location += item.text + u" "
out_list.append(u"**LOC**" + item.text.lower())
elif item.ent_type_ in [u"PERSON", u"DATE", u"TIME", u"PERCENT", u"MONEY",
u"QUANTITY", u"CARDINAL", u"ORDINAL"]:
out_list.append(u'0')
elif item.is_punct:
out_list.append(u'0')
elif item.is_digit or item.like_num:
out_list.append(u'0')
elif item.like_email:
out_list.append(u'0')
elif item.like_url:
out_list.append(u'0')
elif item.is_stop:
out_list.append(u'0')
else:
out_list.append(item.lemma_)
if location.strip() != u"" and (item.ent_type == 0 or index == len(in_list) - 1):
location = location.strip()
coords = get_coordinates(c, location)
if len(coords) > 0 and location != u" ".join(target):
if is_near:
locations_near.append(coords)
else:
locations_far.append(coords)
else:
offset = 1 if index == len(in_list) - 1 else 0
for i in range(index - len(location.split()), index):
out_list[i + offset] = in_list[i + offset].lemma_ \
if in_list[i + offset].is_alpha and location != u" ".join(target) else u'0'
location = u""
lookup = toponym[0] if corpus != u"wiki" else toponym[1]
target_grid = get_coordinates(c, lookup)
if len(target_grid) == 0:
raise Exception(u"No entry in the database!", lookup)
entities_near = merge_lists(locations_near)
entities_far = merge_lists(locations_far)
locations_near, locations_far = [], []
o.write(lat + u"\t" + lon + u"\t" + str(near_out) + u"\t" + str(far_out) + u"\t")
o.write(str(target_grid) + u"\t" + str([t.lower() for t in lookup.split()][:TARGET_LENGTH]))
o.write(u"\t" + str(entities_near) + u"\t" + str(entities_far) + u"\n")
if not captured:
print(line_no, line, target, start, end)
o.close()
def visualise_2D_grid(x, title, log=False):
"""
Display 2D array data with a title. Optional: log for better visualisation of small values.
:param x: 2D numpy array you want to visualise
:param title: of the chart because it's nice to have one :-)
:param log: True in order to log the values and make for better visualisation, False for raw numbers
"""
if log:
x = np.log10(x)
cmap = colors.LinearSegmentedColormap.from_list('my_colormap', ['lightgrey', 'darkgrey', 'dimgrey', 'black'])
cmap.set_bad(color='white')
img = pyplot.imshow(x, cmap=cmap, interpolation='nearest')
pyplot.colorbar(img, cmap=cmap)
plt.title(title)
# plt.savefig(title + u".png", dpi=200, transparent=True) # Uncomment to save to file
plt.show()
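# Usage sketch (mirrors the commented visualisation examples at the bottom of this file,
# assuming geonames.db is available):
# coord = get_coordinates(sqlite3.connect(u'../data/geonames.db').cursor(), u"melbourne")
# grid = np.reshape(construct_map_vector_full_scale(coord, polygon_size=2), newshape=(90, 180))
# visualise_2D_grid(grid, u"Melbourne candidates", log=True)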
def generate_vocabulary(path, min_words, min_entities):
"""
Prepare the vocabulary for training/testing. This function is to be called on generated data only, not plain text.
:param path: to the file from which to build
:param min_words: a word must occur more than this many times to be included in the vocabulary
:param min_entities: a location entity must occur more than this many times to be included in the vocabulary
"""
vocab_words, vocab_locations = {UNKNOWN, u'0'}, {UNKNOWN, u'0'}
words, locations = [], []
for f in [path]: # You can also build the vocabulary from several files, just add to the list.
training_file = codecs.open(f, u"r", encoding=u"utf-8")
for line in training_file:
line = line.strip().split("\t")
words.extend([w for w in eval(line[2]) if u"**LOC**" not in w]) # NEAR WORDS
words.extend([w for w in eval(line[3]) if u"**LOC**" not in w]) # FAR WORDS
locations.extend([w for w in eval(line[2]) if u"**LOC**" in w]) # NEAR ENTITIES
locations.extend([w for w in eval(line[3]) if u"**LOC**" in w]) # FAR ENTITIES
words = Counter(words)
for word in words:
if words[word] > min_words:
vocab_words.add(word)
print(u"Words saved:", len(vocab_words))
locations = Counter(locations)
for location in locations:
if locations[location] > min_entities:
vocab_locations.add(location.replace(u"**LOC**", u""))
print(u"Locations saved:", len(vocab_locations))
vocabulary = vocab_words.union(vocab_locations)
word_to_index = dict([(w, i) for i, w in enumerate(vocabulary)])
pickle.dump(word_to_index, open(u"data/words2index.pkl", "wb"))
def generate_arrays_from_file(path, words_to_index, train=True):
"""
Generator function for the FULL (SOTA) CNN + map_vector model in the paper. Uses all available data inputs.
:param path: to the training file (see training data generation functions)
:param words_to_index: the vocabulary set
:param train: True if generating training data, False for test data
"""
while True:
training_file = codecs.open(path, "r", encoding="utf-8")
counter = 0
context_words, entities_strings, labels = [], [], []
map_vector, target_string = [], []
for line in training_file:
counter += 1
line = line.strip().split("\t")
labels.append(construct_map_vector([(float(line[0]), float(line[1]), 0)], 2, ENCODING_MAP_2x2, OUTLIERS_MAP_2x2))
near = [w if u"**LOC**" not in w else u'0' for w in eval(line[2])]
far = [w if u"**LOC**" not in w else u'0' for w in eval(line[3])]
context_words.append(far[:CONTEXT_LENGTH // 2] + near + far[CONTEXT_LENGTH // 2:])
near = [w.replace(u"**LOC**", u"") if u"**LOC**" in w else u'0' for w in eval(line[2])]
far = [w.replace(u"**LOC**", u"") if u"**LOC**" in w else u'0' for w in eval(line[3])]
entities_strings.append(far[:CONTEXT_LENGTH // 2] + near + far[CONTEXT_LENGTH // 2:])
# map_vector.append(construct_map_vector(sorted(eval(line[4]) + eval(line[6]) + eval(line[7]),
# key=lambda entry: entry[2], reverse=True), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1))
# paper version above versus small experimental setup below, map_vector is fully modular, remember? Try both!
map_vector.append(construct_map_vector(eval(line[4]) + eval(line[6]) + eval(line[7]), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1))
target_string.append(pad_list(TARGET_LENGTH, eval(line[5]), True, u'0'))
if counter % BATCH_SIZE == 0:
for collection in [context_words, entities_strings, target_string]:
for x in collection:
for i, w in enumerate(x):
if w in words_to_index:
x[i] = words_to_index[w]
else:
x[i] = words_to_index[UNKNOWN]
if train:
yield ([np.asarray(context_words), np.asarray(context_words), np.asarray(entities_strings),
np.asarray(entities_strings), np.asarray(map_vector), np.asarray(target_string)], np.asarray(labels))
else:
yield ([np.asarray(context_words), np.asarray(context_words), np.asarray(entities_strings),
np.asarray(entities_strings), np.asarray(map_vector), np.asarray(target_string)])
context_words, entities_strings, labels = [], [], []
map_vector, target_string = [], []
if len(labels) > 0: # This block is only ever entered at the end to yield the final few samples. (< BATCH_SIZE)
for collection in [context_words, entities_strings, target_string]:
for x in collection:
for i, w in enumerate(x):
if w in words_to_index:
x[i] = words_to_index[w]
else:
x[i] = words_to_index[UNKNOWN]
if train:
yield ([np.asarray(context_words), np.asarray(context_words), np.asarray(entities_strings),
np.asarray(entities_strings), np.asarray(map_vector), np.asarray(target_string)], np.asarray(labels))
else:
yield ([np.asarray(context_words), np.asarray(context_words), np.asarray(entities_strings),
np.asarray(entities_strings), np.asarray(map_vector), np.asarray(target_string)])
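# Usage sketch (not part of this file): the generator plugs into a Keras-style training loop,
# assuming "model" is a compiled model whose six inputs match the yielded arrays and
# "num_samples" is the number of lines in the training file (both names are placeholders).
# word_to_index can be loaded with pickle from data/words2index.pkl (see generate_vocabulary).
# model.fit_generator(generate_arrays_from_file(u"../data/train_wiki.txt", word_to_index),
#                     steps_per_epoch=int(num_samples / BATCH_SIZE), epochs=5)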
def generate_arrays_from_file_lstm(path, words_to_index, train=True):
"""
Generator for the context2vec model. Uses only lexical features.
To replicate the map_vector + CONTEXT2VEC model from the paper, uncomment a few sections below
and in the context2vec.py file. I hope it's clear enough :-) Email me if it isn't!
:param path: to the training file (see training data generation functions)
:param words_to_index: the vocabulary set
:param train: True for training stage, False for testing stage
"""
while True:
training_file = codecs.open(path, "r", encoding="utf-8")
counter = 0
left, right, map_vector = [], [], []
target_string, labels = [], []
for line in training_file:
counter += 1
line = line.strip().split("\t")
labels.append(construct_map_vector([(float(line[0]), float(line[1]), 0)], 2, ENCODING_MAP_2x2, OUTLIERS_MAP_2x2))
near = [w.replace(u"**LOC**", u"") for w in eval(line[2])]
far = [w.replace(u"**LOC**", u"") for w in eval(line[3])]
left.append(far[:CONTEXT_LENGTH // 2] + near[:CONTEXT_LENGTH // 2])
right.append(near[CONTEXT_LENGTH // 2:] + far[CONTEXT_LENGTH // 2:])
target_string.append(pad_list(TARGET_LENGTH, eval(line[5]), True, u'0'))
# map_vector.append(construct_map_vector(eval(line[4]) + eval(line[6]) + eval(line[7]), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1))
if counter % BATCH_SIZE == 0:
for collection in [left, right, target_string]:
for x in collection:
for i, w in enumerate(x):
if w in words_to_index:
x[i] = words_to_index[w]
else:
x[i] = words_to_index[UNKNOWN]
if train:
yield ([np.asarray(left), np.asarray(right), np.asarray(target_string)], np.asarray(labels))
# yield ([np.asarray(left), np.asarray(right), np.asarray(map_vector), np.asarray(target_string)], np.asarray(labels))
else:
yield ([np.asarray(left), np.asarray(right), np.asarray(target_string)])
# yield ([np.asarray(left), np.asarray(right), np.asarray(map_vector), np.asarray(target_string)])
left, right, map_vector = [], [], []
target_string, labels = [], []
if len(labels) > 0: # This block is only ever entered at the end to yield the final few samples. (< BATCH_SIZE)
for collection in [left, right, target_string]:
for x in collection:
for i, w in enumerate(x):
if w in words_to_index:
x[i] = words_to_index[w]
else:
x[i] = words_to_index[UNKNOWN]
if train:
yield ([np.asarray(left), np.asarray(right), np.asarray(target_string)], np.asarray(labels))
# yield ([np.asarray(left), np.asarray(right), np.asarray(map_vector), np.asarray(target_string)], np.asarray(labels))
else:
yield ([np.asarray(left), np.asarray(right), np.asarray(target_string)])
# yield ([np.asarray(left), np.asarray(right), np.asarray(map_vector), np.asarray(target_string)])
def generate_strings_from_file(path):
"""
Generator of labels, location names and context. Used for training and testing.
:param path: to the training file (see training data generation functions)
:return: Yields a list of tuples [(label, location name, context), ...]
"""
while True:
for line in codecs.open(path, "r", encoding="utf-8"):
line = line.strip().split("\t")
context = u" ".join(eval(line[2])) + u"*E*" + u" ".join(eval(line[5])) + u"*E*" + u" ".join(eval(line[3]))
yield ((float(line[0]), float(line[1])), u" ".join(eval(line[5])).strip(), context)
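# Usage sketch: peek at a single sample (the path is a placeholder for any generated data file).
# (lat, lon), name, context = next(generate_strings_from_file(u"../data/train_wiki.txt"))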
def generate_arrays_from_file_map_vector(path, train=True, looping=True):
"""
Generator for the plain map_vector model, works for MLP, Naive Bayes or Random Forest. Table 2 in the paper.
:param path: to the training file (see training data generation functions)
:param train: True for training phase, False for testing phase
:param looping: True for continuous generation, False for one iteration.
"""
while True:
training_file = codecs.open(path, "r", encoding="utf-8")
counter = 0
labels, target_coord = [], []
for line in training_file:
counter += 1
line = line.strip().split("\t")
labels.append(construct_map_vector([(float(line[0]), float(line[1]), 0, u'')], 2, ENCODING_MAP_2x2, OUTLIERS_MAP_2x2))
target_coord.append(construct_map_vector(eval(line[4]) + eval(line[6]) + eval(line[7]), 1, ENCODING_MAP_1x1, OUTLIERS_MAP_1x1))
if counter % BATCH_SIZE == 0:
if train:
yield ([np.asarray(target_coord)], np.asarray(labels))
else:
yield ([np.asarray(target_coord)])
labels = []
target_coord = []
if len(labels) > 0:
# This block is only ever entered at the end to yield the final few samples. (< BATCH_SIZE)
if train:
yield ([np.asarray(target_coord)], np.asarray(labels))
else:
yield ([np.asarray(target_coord)])
if not looping:
break
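# Usage sketch: one non-looping pass to collect inputs and labels, e.g. for a scikit-learn
# style model, assuming ../data/train_wiki.txt has been generated:
# X, Y = [], []
# for inputs, labels in generate_arrays_from_file_map_vector(u"../data/train_wiki.txt", looping=False):
#     X.extend(inputs[0])
#     Y.extend(labels)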
def shrink_map_vector(polygon_size):
"""
Remove polygons that only cover oceans. Dumps a dictionary of DB entries.
:param polygon_size: the size of each polygon such as 1x1 or 2x2 or 3x3 degrees (integer)
"""
map_vector = np.zeros(int(180 / polygon_size) * int(360 / polygon_size), )
for line in codecs.open(u"../data/allCountries.txt", u"r", encoding=u"utf-8"):
line = line.split("\t")
lat, lon = float(line[4]), float(line[5])
index = coord_to_index((lat, lon), polygon_size=polygon_size)
map_vector[index] += 1.0
pickle.dump(map_vector, open(u"mapvec_shrink.pkl", "wb"))
def oracle(path):
"""
Calculate the Oracle (best possible given your database) performance for a given dataset.
Prints the Oracle scores including mean, median, AUC and acc@161.
:param path: file path to evaluate
"""
final_errors = []
conn = sqlite3.connect(u'../data/geonames.db')
for line in codecs.open(path, "r", encoding="utf-8"):
line = line.strip().split("\t")
coordinates = (float(line[0]), float(line[1]))
best_candidate = []
for candidate in get_coordinates(conn.cursor(), u" ".join(eval(line[5])).strip()):
best_candidate.append(great_circle(coordinates, (float(candidate[0]), float(candidate[1]))).km)
final_errors.append(sorted(best_candidate)[0])
print_stats(final_errors)
# --------------------------------------------- INVOKE FUNCTIONS ---------------------------------------------------
# prepare_geocorpora()
# print(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"dublin"))
# generate_training_data()
# generate_evaluation_data(corpus="geovirus", file_name="")
# generate_vocabulary(path=u"../data/train_wiki.txt", min_words=9, min_entities=1)
# shrink_map_vector(2)
# oracle(u"data/eval_geovirus_gold.txt")
# conn = sqlite3.connect('../data/geonames.db')
# c = conn.cursor()
# c.execute("INSERT INTO GEO VALUES (?, ?)", (u"darfur", u"[(13.5, 23.5, 0), (44.05135, -94.83804, 106)]"))
# c.execute("DELETE FROM GEO WHERE name = 'darfur'")
# conn.commit()
# print(index_to_coord(8177, 2))
# populate_sql()
# -------- CREATE MAPS (mapping from 64,800/16,200 polygons to 23,002/7,821) ------------
# map_vector = list(pickle.load(open(u"data/1x1_geonames.pkl", "rb")))
# zeros = dict([(i, v) for i, v in enumerate(map_vector) if v > 0]) # isolate the non zero values
# zeros = dict([(i, v) for i, v in enumerate(zeros)]) # replace counts with indices
# zeros = dict([(v, i) for (i, v) in zeros.items()]) # reverse keys and values
# pickle.dump(zeros, open(u"data/1x1_encode_map.pkl", "wb"))
# ------- VISUALISE THE WHOLE DATABASE ----------
# map_vector = np.reshape(map_vector, newshape=((180 / 1), (360 / 1)))
# visualise_2D_grid(map_vector, "Geonames Database", True)
# -------- CREATE OUTLIERS (polygons outside of map_vector) MAP --------
# filtered = [i for i, v in enumerate(map_vector) if v > 0]
# the_rest = [i for i, v in enumerate(map_vector) if v == 0]
# poly_size = 2
# dict_rest = dict()
#
# for poly_rest in the_rest:
# best_index = 100000
# best_dist = 100000
# for poly_filtered in filtered:
# dist = great_circle(index_to_coord(poly_rest, poly_size), index_to_coord(poly_filtered, poly_size)).km
# if dist < best_dist:
# best_index = poly_filtered
# best_dist = dist
# dict_rest[poly_rest] = best_index
#
# pickle.dump(dict_rest, open(u"data/2x2_outliers_map.pkl", "wb"))
# ------ PROFILING SETUP -----------
# import cProfile, pstats, io
# pr = cProfile.Profile()
# pr.enable()
# CODE HERE
# pr.disable()
# s = io.StringIO()
# sortby = 'cumulative'
# ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
# ps.print_stats()
# print(s.getvalue())
# ----------- VISUALISATION OF DIFFERENT LOCATIONS -------------
# print(len(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Melbourne")))
# coord = get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Giza")
# print(coord)
# coord.extend(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Giza Plateau"))
# coord.extend(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Cairo"))
# coord.extend(get_coordinates(sqlite3.connect('../data/geonames.db').cursor(), u"Egypt"))
# coord = sorted(coord, key=lambda entry: entry[2], reverse=True)
# x = construct_map_vector_full_scale(coord, polygon_size=2)
# x = np.reshape(x, newshape=((180 / 2), (360 / 2)))
# visualise_2D_grid(x, "Giza, Giza Plateau, Egypt, Cairo", True)
# ---------- DUMP DATABASE ------
# import sqlite3
#
# con = sqlite3.connect('../data/geonames.db')
# with codecs.open('dump.sql', 'w', 'utf-8') as f:
# for line in con.iterdump():
# f.write('%s\n' % line)
# -------------------------------