extract_features.py
# coding: utf-8
import csv
import os

import lxml.etree as ET

from features import get_features

# Check the disambiguated corpus and extract all the features for the verb pairs.
PATH_TO_CORPUS = os.path.join(os.getcwd(), 'source/post1950/anecdota')
# PATH_TO_CORPUS = os.path.join(os.getcwd(), 'texts')
PATH_TO_VERBS = ['verbs_prefixes.csv', 'verbs_zal.txt']

errors = open('Parse_errors.txt', 'w')
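# A minimal sketch of the RNC-style markup that load_file() expects (the attribute
# names are taken from the code below; the example token itself is illustrative):
#   <w>сде`лать<ana lex="сделать" gr="V,pf,tran"/></w>
# Each <w> wraps the token text plus one or more <ana> elements carrying the
# lemma ("lex") and a comma-separated grammatical tag string ("gr").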


class Corpus():
    def __init__(self):
        self.verbs = set()
        self.partcp = set()
        self.gerund = set()

    def load_file(self, path, verbs):
        """
        Open RNC XML and get all unique tokens
        """
        print(path)
        tree = ET.parse(path)
        for elem in tree.iter('w'):
            word = ''.join(elem.itertext()).lower().replace('`', '')  # remove stress marks
            for item in elem.iter('ana'):
                info = item
                # print(info)
                # Analysis of the preceding element, passed to get_features() as
                # left context; None if there is no usable left neighbour.
                try:
                    info_prev = [t for t in info.getparent().getprevious() if t.tag == 'ana'][0]
                except (TypeError, IndexError):
                    info_prev = None
                # print(ET.tostring(info.getparent().getprevious(), encoding='utf-8'))
                break  # only the first analysis is used
            # lemma = [item.get("lex") for item in elem.iter('ana')]  # todo: deal with homonymy?
            lemma = info.get('lex')
            # get the POS tag (first field of the comma-separated "gr" string)
            tag = info.get("gr").split('=')[0].split(',')[0]
            if lemma in verbs and tag == 'V':
                features = get_features(info, info_prev)
                verb = Verb(lemma, word, *features)
                if verb.form == 'partcp':
                    self.partcp.add(verb)
                elif verb.form == 'ger':
                    self.gerund.add(verb)
                self.verbs.add(verb)
    def load_dir(self, path, verbs):
        """
        Traverse a given directory and load all corpus files
        :param path: path to corpus folder
        :param verbs: set of verb lemmas to look for
        """
        for root, dirs, files in os.walk(path):
            for name in files:
                if name.endswith('ml'):  # todo: open all files, but warn if they are not corpus files
                    try:
                        self.load_file(os.path.join(root, name), verbs)
                    except Exception:
                        # log the file that could not be parsed
                        errors.write(os.path.join(root, name) + '\n')
    def to_csv(self):
        """
        Write featurized verbs to csv file
        """
        HEADER = ('token', 'lemma', 'aspect', 'form', 'transitivity',
                  'number', 'tense', 'mood', 'person', 'voice')
        with open('feature_matrix_big.csv', 'w') as out:
            writer = csv.writer(out, delimiter=',', quotechar='"')
            writer.writerow(HEADER)
            for verb in self.verbs:
                row = (
                    verb.wf, verb.lemma, verb.aspect, verb.form, verb.transitivity,
                    verb.number, verb.tense, verb.mood, verb.person, verb.voice
                )
                writer.writerow(row)

    def participles(self):
        """
        Write featurized participles to csv file for inspection
        """
        HEADER = ('token', 'lemma', 'aspect', 'form', 'transitivity',
                  'number', 'tense', 'mood', 'person', 'voice')
        with open('participles_big.csv', 'w') as out:
            writer = csv.writer(out, delimiter=',', quotechar='"')
            writer.writerow(HEADER)
            for verb in self.partcp:
                row = (
                    verb.wf, verb.lemma, verb.aspect, verb.form, verb.transitivity,
                    verb.number, verb.tense, verb.mood, verb.person, verb.voice
                )
                writer.writerow(row)

    def gerunds(self):
        """
        Write featurized gerunds to csv file for inspection
        """
        HEADER = ('token', 'lemma', 'aspect', 'form', 'transitivity',
                  'number', 'tense', 'mood', 'person', 'voice')
        with open('gerunds_big.csv', 'w') as out:
            writer = csv.writer(out, delimiter=',', quotechar='"')
            writer.writerow(HEADER)
            for verb in self.gerund:
                row = (
                    verb.wf, verb.lemma, verb.aspect, verb.form, verb.transitivity,
                    verb.number, verb.tense, verb.mood, verb.person, verb.voice
                )
                writer.writerow(row)


class Verb():
    def __init__(self, lemma, wf, aspect, tense, person, number, trans, voice, form, mood):
        self.lemma = lemma
        self.wf = wf
        self.form = form
        self.aspect = aspect
        self.transitivity = trans
        self.number = number
        self.tense = tense
        self.mood = mood
        self.person = person
        self.voice = voice


def load_verbs(path):
    verbs = set()
    with open(path[0]) as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            verbs.add(row[0])
            verbs.add(row[1])
    with open(path[1]) as f:
        for line in f:
            verbs.add(line.strip())
    return verbs
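

# Assumed layout of the verb lists read by load_verbs() above (inferred from the
# parsing code; the example pair is illustrative):
#   verbs_prefixes.csv -- at least two comma-separated columns per row,
#                         e.g. "делать,сделать" (simplex verb, prefixed perfective)
#   verbs_zal.txt      -- one verb lemma per line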


def run():
    verbs = load_verbs(PATH_TO_VERBS)
    corpus = Corpus()
    corpus.load_dir(PATH_TO_CORPUS, verbs)
    corpus.to_csv()
    corpus.participles()
    corpus.gerunds()


if __name__ == '__main__':
    # test()
    run()
    errors.close()
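
# Usage sketch (assumes the corpus directory and verb lists configured above exist,
# and that the local features module providing get_features is importable):
#   python extract_features.py
# Outputs: feature_matrix_big.csv (all matched verbs), participles_big.csv,
# gerunds_big.csv, and Parse_errors.txt listing files that failed to parse.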