-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathelan_app.py
114 lines (85 loc) · 3.82 KB
/
elan_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from flask import Flask
from flask import request
import re
from collections import OrderedDict
import xml.etree.ElementTree as ET
from uralicNLP import uralicApi
from uralicNLP.cg3 import Cg3
import itertools
# Flask application object; the POST route defined below attaches to it.
app = Flask(__name__)
@app.route("/", methods=['POST'])
def elan():
    """Handle a POST from ELAN carrying a TCF (TextCorpus) XML payload.

    The request body is parsed, its tokens are analysed and disambiguated
    with the Komi (kpv) CG3 pipeline, and the tree is returned (as bytes)
    with POS-tag and lemma elements appended to the TextCorpus node.

    Returns:
        bytes: the serialized, augmented TCF XML document.
    """
    # Save the raw request body so the ELAN-side payload can be inspected later.
    with open("examples/input_from_elan.xml", "wb") as fo:
        fo.write(request.data)
    # The language attribute apparently comes from ELAN too somehow.
    # This should be picked automatically, and the analyser should be
    # selected based on that. This works very well when each speaker
    # speaks a different language (one Komi, another Russian etc.).
    # Q: what should be done if the analyser for one language is not found?
    cg = Cg3("kpv")
    tree = ET.fromstring(request.data)
    xmlns = {'corpus': '{http://www.dspin.de/data/textcorpus}'}
    # Collect token texts and IDs in a single pass over the tree
    # (the original walked the tree twice with identical findall calls).
    # The sentences are somehow tokenized upstream, but this should be
    # done better; here we only strip sentence punctuation.
    tokens = []
    token_ids = []
    for token in tree.findall('.//{corpus}token'.format(**xmlns)):
        # Raw-string character class replaces the non-raw "(,|\.|\?|!)",
        # which emitted invalid-escape warnings; same characters removed.
        tokens.append(re.sub(r"[,.?!]", '', token.text))
        token_ids.append(token.attrib['ID'])
    # This is kind of a fake approach that just keeps all readings,
    # joined with '|', since a single pick isn't available through the
    # pipeline right now.
    disambiguations = cg.disambiguate(tokens)
    print(disambiguations)
    tags = []
    lemmas = []
    # One pass builds both tag and lemma strings (the original looped
    # over `disambiguations` twice). OrderedDict.fromkeys deduplicates
    # while preserving reading order — plain set() gave a nondeterministic
    # '|' ordering across runs.
    for disambiguation in disambiguations:
        possible_words = disambiguation[1]
        morphology_lists = []
        lemma_candidates = []
        for possible_word in possible_words:
            # Drop the trailing morphology element (presumably the CG3
            # weight entry — TODO confirm) before flattening; this matches
            # the original .pop() behaviour.
            possible_word.morphology.pop()
            morphology_lists.append(possible_word.morphology)
            lemma_candidates.append(possible_word.lemma)
        flat_tags = itertools.chain(*morphology_lists)
        tags.append('|'.join(OrderedDict.fromkeys(flat_tags)))
        lemmas.append('|'.join(OrderedDict.fromkeys(lemma_candidates)))
    # Derive tag/lemma element IDs from the token IDs. The substitution is
    # anchored to the leading 't': the original re.sub('t', ...) replaced
    # *every* 't' in the ID, which would mangle any ID containing 't'
    # beyond the prefix (TCF token IDs are conventionally 't<number>').
    tag_ids = [re.sub(r'^t', 'pt', token_id) for token_id in token_ids]
    lemma_ids = [re.sub(r'^t', 'le', token_id) for token_id in token_ids]
    # This constructs the XML; the namespaces were a bit tricky, but
    # everything seems to work now. First create the POStags node and
    # put it in the right place, then append the individual tag elements.
    textcorpus = tree.find('.//{corpus}TextCorpus'.format(**xmlns))
    pos_tag = ET.Element("ns2:POStags", tagset="stts")
    textcorpus.append(pos_tag)
    for token_id, tag_id, tag in zip(token_ids, tag_ids, tags):
        current_tag = ET.Element("tag", tokenIDs=token_id, ID=tag_id)
        current_tag.text = tag
        textcorpus.append(current_tag)
    # Same structure for the lemmas section.
    lemma_tag = ET.Element("ns2:lemmas", tagset="stts")
    textcorpus.append(lemma_tag)
    for token_id, lemma_id, lemma in zip(token_ids, lemma_ids, lemmas):
        current_lemma = ET.Element("lemma", tokenIDs=token_id, ID=lemma_id)
        current_lemma.text = lemma
        textcorpus.append(current_lemma)
        print(lemma)  # debug output, mirrors the original's print
    # This writes the output into a file for examination.
    with open("output.txt", "wb") as fo:
        fo.write(ET.tostring(tree))
    return ET.tostring(tree)
# Run the Flask development server only when executed as a script
# (not when imported by a WSGI host).
if __name__ == "__main__":
    app.run()