-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepareForCRF1.py
55 lines (47 loc) · 904 Bytes
/
prepareForCRF1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import xml.etree.ElementTree as ET
import pprint
def prepareDictionary(f):
    """Build a word -> BIO label lookup from the ENAMEX elements of an XML file.

    The leading text of every ENAMEX element is split on whitespace. The
    first word is mapped to ``'B-' + suffix`` and each following word to
    ``'I-' + suffix``, where ``suffix`` is the concatenation of all of the
    element's attribute values. A word that appears in several elements
    keeps the label of the last element visited.

    NOTE(review): presumably each ENAMEX carries a single attribute (such
    as TYPE), so the suffix is just that value -- confirm against the data.

    Args:
        f: A filename or file object accepted by ``ElementTree.parse``.

    Returns:
        A dict mapping each annotated word to its label string.
    """
    dic = {}
    for elem in ET.parse(f).iter(tag='ENAMEX'):
        text = elem.text
        if not text:
            # An empty element, or one that begins with a child element,
            # has no leading text (ElementTree reports None); there is
            # nothing to label, and calling split() on None would raise.
            continue
        suffix = ''.join(elem.attrib.values())
        for position, word in enumerate(text.split()):
            prefix = 'B' if position == 0 else 'I'
            dic[word] = prefix + '-' + suffix
    return dic
# Driver: label every whitespace-separated token of 7.xml using the lookup
# built from the ENAMEX annotations in 1.xml, and write one "token label"
# record per line to final2.txt.  Tokens missing from the lookup get 'O'.
# Sentence-final punctuation is left attached to its token.

# Build the lookup first so the annotation file is closed as soon as it has
# been parsed.
with open('1.xml', 'rb') as g:
    dic = prepareDictionary(g)

# Context managers guarantee both handles are closed (and the output is
# flushed) even if decoding fails part-way through.
with open('7.xml', 'rb') as f, open('final2.txt', 'wb') as h:
    for line in f:
        for word in line.split():
            # The lookup is keyed by parsed XML text, so decode the raw
            # bytes read from the file before searching it.
            token = word.decode('utf-8')
            label = dic.get(token, 'O')
            # Assemble the record as text and encode once on output;
            # concatenating the raw bytes with a text label breaks on
            # non-ASCII input.
            h.write((token + u' ' + label + u'\n').encode('utf-8'))
        # Debug trace carried over from the original script: echoes the
        # last element of the line just processed.
        # NOTE(review): the original nesting of this statement could not be
        # recovered from the flattened source; per-line placement assumed.
        print(line[-1])