-
Notifications
You must be signed in to change notification settings - Fork 1
/
JoshSearch.py
158 lines (147 loc) · 9.83 KB
/
JoshSearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import xml.etree.ElementTree as ET
import os
from utility import deaccent
def _perseus_postag_char(word, position):
    """Return the postag character at *position*, or '' when the tag is missing or too short."""
    postag = word.get('postag') or ''
    # Slicing never raises, unlike indexing an empty or truncated tag.
    return postag[position:position + 1]


def perseuscount(froot, i, j, inffile, fn):
    """Prints every instance of this articular infinitive construction for Perseus treebanks.

    A match consists of:
      * an article (deaccented lemma 'ο', relation 'ATR') whose head is an infinitive
        (postag position 4 == 'n'),
      * the infinitive's own head being a verb (postag position 0 == 'v'),
      * an accusative (postag position 7 == 'a') that is either the 'SBJ' of the
        infinitive or an 'OBJ' of the main verb, and
      * an accusative 'OBJ' of the infinitive.

    Every (article, subject candidate, object) combination is printed, recorded in
    *inffile* and counted once.

    The tree is walked as root -> body -> sentence -> word. Lookup tables are built per
    sentence, so word ids that restart in every sentence cannot collide.
    NOTE(review): this assumes a word's head always lives in the same sentence element;
    confirm against the treebank format in use.

    Args:
        froot: root element of the parsed treebank.
        i: running count of matches found so far.
        j: running count of matches whose infinitive object has a lower id than the article.
        inffile: writable text file that receives one "<fn> <subdoc>" record per match.
        fn: name of the treebank file, used as the prefix of each record.

    Returns:
        The tuple (i, j, inffile) with both counters updated.
    """
    for body in froot:
        for sentence in body:
            words = [node for node in sentence if node.tag == 'word']
            # Sets give O(1) membership tests; a missing/short postag simply never matches.
            infinitive_ids = {w.get('id') for w in words if _perseus_postag_char(w, 4) == 'n'}
            if not infinitive_ids:
                continue
            verb_ids = {w.get('id') for w in words if _perseus_postag_char(w, 0) == 'v'}
            head_of = {w.get('id'): w.get('head') for w in words}
            form_of = {w.get('id'): w.get('form') for w in words}
            accusatives = [w for w in words if _perseus_postag_char(w, 7) == 'a']

            for article in words:
                infinitive_id = article.get('head')
                # Cheap attribute checks first; deaccent() only runs for real candidates.
                if article.get('relation') != 'ATR' or infinitive_id not in infinitive_ids:
                    continue
                if deaccent(article.get('lemma')) != 'ο':
                    continue
                main_verb_id = head_of[infinitive_id]
                if main_verb_id not in verb_ids:
                    continue

                objects = [w for w in accusatives
                           if w.get('head') == infinitive_id and w.get('relation') == 'OBJ']
                if not objects:
                    continue

                for candidate in accusatives:
                    # A candidate may qualify under both readings; each one is reported.
                    labels = []
                    if candidate.get('head') == infinitive_id and candidate.get('relation') == 'SBJ':
                        labels.append('(SUBJ)')
                    if candidate.get('head') == main_verb_id and candidate.get('relation') == 'OBJ':
                        labels.append('(OBJ)')
                    for label in labels:
                        for infobj in objects:
                            print(sentence.get('subdoc'), form_of[main_verb_id],
                                  candidate.get('form'), label, article.get('form'),
                                  form_of[infinitive_id], infobj.get('form'))
                            # The caller's header line is written without a trailing newline,
                            # so each record starts with one; file name and citation are
                            # separated by a space instead of being run together.
                            inffile.write('\n{} {}'.format(fn, sentence.get('subdoc')))
                            # A higher article id means the object precedes the article.
                            if int(article.get('id')) > int(infobj.get('id')):
                                print('^^Backwards^^')
                                j += 1
                            i += 1
    return i, j, inffile
def _proiel_morph_char(token, position):
    """Return the morphology character at *position*, or '' when the tag is missing or too short."""
    morphology = token.get('morphology') or ''
    # Slicing never raises, unlike indexing an empty or truncated tag.
    return morphology[position:position + 1]


def proielcount(froot, i, j, inffile, fn):
    """Prints every instance of this articular infinitive construction for PROIEL treebanks.

    A match consists of:
      * an article (deaccented lemma 'ο', relation 'aux') whose head is an infinitive
        (morphology position 3 == 'n'),
      * the infinitive's own head being a token with part-of-speech 'V-',
      * an accusative (morphology position 6 == 'a') that is either an 'obj' of the
        main verb or the 'sub' of the infinitive, and
      * an accusative 'obj' of the infinitive.

    Every (article, subject candidate, object) combination is printed, recorded in
    *inffile* and counted once. Tokens carrying an 'empty-token-sort' attribute are
    ignored throughout.

    The tree is walked as root -> source -> division -> sentence -> token, and lookup
    tables are built per sentence.
    NOTE(review): this assumes a token's head always lives in the same sentence element;
    confirm against the treebank format in use.

    Args:
        froot: root element of the parsed treebank.
        i: running count of matches found so far.
        j: running count of matches whose infinitive object has a lower id than the article.
        inffile: writable text file that receives one "<fn> <citation-part>" record per match.
        fn: name of the treebank file, used as the prefix of each record.

    Returns:
        The tuple (i, j, inffile) with both counters updated.
    """
    for source in froot:
        for division in source:
            for sentence in division:
                if sentence.tag != 'sentence':
                    continue
                tokens = [node for node in sentence
                          if node.tag == 'token' and node.get('empty-token-sort') is None]
                # Sets give O(1) membership tests; a missing/short tag simply never matches.
                infinitive_ids = {t.get('id') for t in tokens if _proiel_morph_char(t, 3) == 'n'}
                if not infinitive_ids:
                    continue
                verb_ids = {t.get('id') for t in tokens if t.get('part-of-speech') == 'V-'}
                head_of = {t.get('id'): t.get('head-id') for t in tokens}
                form_of = {t.get('id'): t.get('form') for t in tokens}
                accusatives = [t for t in tokens if _proiel_morph_char(t, 6) == 'a']

                for article in tokens:
                    infinitive_id = article.get('head-id')
                    # Cheap attribute checks first; deaccent() only runs for real candidates.
                    if article.get('relation') != 'aux' or infinitive_id not in infinitive_ids:
                        continue
                    if deaccent(article.get('lemma')) != 'ο':
                        continue
                    main_verb_id = head_of[infinitive_id]
                    if main_verb_id not in verb_ids:
                        continue

                    objects = [t for t in accusatives
                               if t.get('head-id') == infinitive_id and t.get('relation') == 'obj']
                    if not objects:
                        continue

                    for candidate in accusatives:
                        # A candidate may qualify under both readings; each one is reported.
                        labels = []
                        if candidate.get('head-id') == main_verb_id and candidate.get('relation') == 'obj':
                            labels.append('OBJ')
                        if candidate.get('head-id') == infinitive_id and candidate.get('relation') == 'sub':
                            labels.append('SUB')
                        for label in labels:
                            for infobj in objects:
                                print(article.get('citation-part'), form_of[main_verb_id],
                                      candidate.get('form'), label, article.get('form'),
                                      form_of[infinitive_id], infobj.get('form'))
                                # The caller's header line is written without a trailing
                                # newline, so each record starts with one; file name and
                                # citation are separated by a space instead of run together.
                                inffile.write('\n{} {}'.format(fn, article.get('citation-part')))
                                # A higher article id means the object precedes the article.
                                if int(article.get('id')) > int(infobj.get('id')):
                                    print('^^Backwards!')
                                    j += 1
                                i += 1
    return i, j, inffile
# Driver: scan every treebank file in the corpus directory, dispatch on the XML root tag,
# and record each match in the report file while keeping two running totals.
os.chdir('/home/chris/Desktop/CustomTB')
indir = os.listdir('/home/chris/Desktop/CustomTB')
infCount = 0
backwardCount = 0
# The context manager guarantees the report is flushed and closed even if a file fails to parse.
with open('/home/chris/Desktop/articularinf.txt', 'w', encoding='utf-8') as infFile:
    infFile.write('Every time an articular infinitive occurs with a head that is an explicit verb and that articular '
                  'infinitive also has an explicit accusative subject and accusative object.')
    for file_name in indir:
        # The README is the only non-treebank entry that is skipped explicitly.
        if file_name == 'README.md':
            continue
        tb = ET.parse(file_name)
        tbroot = tb.getroot()
        print(file_name)
        # Both counters return the same file object they were given, so only the
        # two updated totals are kept.
        if tbroot.tag == 'proiel':
            infCount, backwardCount = proielcount(tbroot, infCount, backwardCount, infFile, file_name)[:2]
        elif tbroot.tag == 'treebank':
            infCount, backwardCount = perseuscount(tbroot, infCount, backwardCount, infFile, file_name)[:2]
print('This construction occurs', infCount, 'times.')
print('The object of the infinitive occurs before the article of the infinitive', backwardCount, 'times.')