-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample.py
67 lines (53 loc) · 2.08 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
try:
    import xml.etree.cElementTree as ET
except ImportError:  # cElementTree was removed in Python 3.9
    import xml.etree.ElementTree as ET

from uralicNLP import uralicApi
from uralicNLP.cg3 import Cg3
# Path of the XML file that is parsed and whose p/sentence elements are annotated.
input_xml = "some_file.xml"
# Path the annotated XML tree is written to at the end of the script.
output_xml = "new_file.xml"
# This function takes from uralicNLP's output the lemma
# that all analyses agree on. If there is no agreement,
# an underscore is returned to mark the empty spot.
def get_lonely_lemmas(ambiguities):
    """Return the lemma shared by every analysis, or "_" when there is none.

    Args:
        ambiguities: Iterable of analyses. Only element ``[0]`` of each item
            is read; it must be a string whose part before the first "+" is
            the lemma (presumably the ``(analysis, weight)`` pairs produced
            by ``uralicApi.analyze`` -- confirm against the caller).

    Returns:
        The single lemma if all analyses yield the same one; otherwise "_"
        (this includes the case where ``ambiguities`` is empty).
    """
    # Only the text before the first "+" matters, so split once.
    lemmas = {analysis[0].split("+", 1)[0] for analysis in ambiguities}
    if len(lemmas) == 1:
        return next(iter(lemmas))
    return "_"
# This function returns all those tags that the different
# analyses agree on. There are multiple ways to resolve this question,
# but this could be one way to deal with it. Another solution would
# be to add into ... (TODO: this sentence was left unfinished)
def get_agreed_tags(ambiguities):
    """Return the tags common to every analysis as one space-separated string.

    Args:
        ambiguities: Iterable of analyses. Only element ``[0]`` of each item
            is read; it must be a "+"-separated string whose first component
            is the lemma and whose remaining components are tags (presumably
            the ``(analysis, weight)`` pairs produced by
            ``uralicApi.analyze`` -- confirm against the caller).

    Returns:
        The shared tags joined with single spaces, listed in the order in
        which they occur in the first analysis, or "_" when there are no
        analyses or no tag is shared by all of them.
    """
    # Drop the leading lemma from every analysis and keep only its tags.
    tag_lists = [analysis[0].split("+")[1:] for analysis in ambiguities]
    if not tag_lists:
        return "_"

    shared = set.intersection(*map(set, tag_lists))

    # Joining the set directly would give an order that depends on string
    # hashing and can differ between runs; follow the first analysis instead.
    ordered = []
    for tag in tag_lists[0]:
        if tag in shared and tag not in ordered:
            ordered.append(tag)

    return " ".join(ordered) or "_"
# Read the XML file.
tree = ET.parse(input_xml)
root = tree.getroot()

# Rewrite the text of every p/sentence element. Each non-empty line is split
# on tabs; its first three columns are kept and two columns are appended: the
# agreed lemma and the agreed tags for the word form in the second column.
# A non-empty line with fewer than three columns raises IndexError.
for sentence in root.findall('p/sentence'):
    annotated_lines = []
    # sentence.text is None for an element without text; treat it as empty.
    for line in (sentence.text or "").splitlines():
        if not line:
            continue
        columns = line.split("\t")
        # print(columns)  # Uncommenting this is useful in checking where the script goes
        analysis = uralicApi.analyze(columns[1], "kpv")
        annotated_lines.append("\t".join([
            columns[0],
            columns[1],
            columns[2],
            get_lonely_lemmas(analysis),
            get_agreed_tags(analysis),
        ]))
    # The text starts with a newline and every annotated line ends with one.
    sentence.text = "\n" + "".join(row + "\n" for row in annotated_lines)

# In the end, write the new XML file.
tree.write(output_xml, encoding="UTF-8")