-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathactor_replacement.py
executable file
·107 lines (73 loc) · 2.51 KB
/
actor_replacement.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
from jsonlite import jsonlite2json
from replacement_factory import replacement_factory
import json
import sys
import re
import os
# Enables the extra per-entry diagnostic printing in the processing loop.
DEBUG = False

# Exactly three positional arguments are required. The unpacking below raises
# ValueError on any other count, so both too few and too many arguments get
# the usage text instead of a traceback.
if len(sys.argv) != 4:
    print("Usage: actor_replacement.py parsed_text.jsonlite words_to_actor.csv words_to_words.csv")
    print()
    print("Actor dict can be generated from CSV using actor_parser.py")
    sys.exit(1)

parsed_jsonl, words_to_actor, words_to_words = sys.argv[1:]

# Load the parsed records. `jsonlite2json` is a project helper; its result is
# later iterated, passed to len(), and its items are treated as dicts.
with open(parsed_jsonl) as f:
    data = jsonlite2json(f)
#
# --- Handle words to entity replacements ---
#
import csv

# Header cells 1-4 are the replacement targets; the same columns of every
# following row hold the words that map to the target of their column.
# The file is opened in a `with` block so the handle is closed once the rows
# are read, and with newline='' as the csv module documentation requires.
with open(words_to_actor, newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    replacement_values = header[1:5]
    replacements = [row[1:5] for row in reader]

# These classes will be parsed out later. For now, dump them.
# (Everything from the first ':' onwards in a cell is discarded.)
replacements = [[item.split(':')[0].strip() for item in row] for row in replacements]

# Lower-cased word -> lower-cased header label of the column it appeared in.
repl_dict_actors = {}
for row in replacements:
    for i, item in enumerate(row):
        # Cells are already stripped above, so an empty string means "no word".
        if item:
            repl_dict_actors[item.lower()] = replacement_values[i].lower()
#
# --- Handle words to words replacements ---
#
print(words_to_words)

# Lower-cased source word (column 0) -> lower-cased replacement (column 1).
repl_dict_verbs = {}
with open(words_to_words, newline='') as f:
    reader = csv.reader(f)
    # An empty file yields an empty header, which the check below rejects
    # with a clear message.
    header = next(reader, [])
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not header or 'Lemmas' not in header[0]:
        raise ValueError(
            "%s: first header cell must contain 'Lemmas'" % words_to_words
        )
    for row in reader:
        words = [w.strip() for w in row[:2]]
        # csv.reader yields [] for blank lines and short lists for rows with
        # a single column; those rows carry no usable pair and are skipped.
        if len(words) == 2 and words[0] and words[1]:
            repl_dict_verbs[words[0].lower()] = words[1].lower()
# Build one replacer callable per table. `replacement_factory` is a project
# helper; each returned callable is applied to a string field below.
multiple_replace_actors = replacement_factory(repl_dict_actors)
multiple_replace_verbs = replacement_factory(repl_dict_verbs)

# Both tables are keyed by lower-cased words, so lower-case every field of
# every entry before matching.
for entry in data:
    for key in entry:
        entry[key] = entry[key].lower()

# Progress is reported relative to the last index. max() keeps a single-entry
# input from dividing by zero.
last_index = max(len(data) - 1, 1)

# Here we specify where the dictionary has to be applied
for i, entry in enumerate(data):
    if DEBUG:
        # NOTE(review): the original diagnostic referenced the undefined names
        # `keys` and `multiple_replace`. This version checks both replacement
        # tables and applies both replacers - confirm that is the intent.
        text = entry.get('text', '')
        if (any(k in text for k in repl_dict_actors)
                or any(k in text for k in repl_dict_verbs)):
            print()
            print('<<', text)
            print('>>', multiple_replace_verbs(multiple_replace_actors(text)))

    # Actor replacements first, then word replacements; 'tua' receives both,
    # in that order.
    for field in ('S', 'O', 'tua'):
        if field in entry:
            entry[field] = multiple_replace_actors(entry[field])
    for field in ('Lemma', 'tua'):
        if field in entry:
            entry[field] = multiple_replace_verbs(entry[field])

    # Refresh the in-place progress line every 1000 entries.
    if i % 1000 == 0:
        print('Progress: %.1f%%\r' % (i / last_index * 100), end='')
print("Progress: 100%")
# The result goes to the current working directory, named after the input
# file with an 'output_' prefix, one JSON document per line.
fn = 'output_' + os.path.basename(parsed_jsonl)
with open(fn, 'w') as f:
    print('Saving ouput to', fn)
    for record in data:
        f.write(json.dumps(record) + '\n')