# -*- coding: utf-8 -*-
# canon.py
# Routines for canonicalizing English phrases using dictionary
# lemmatization (based on the 55k-entry lexicon.tcl).
# For unknown nouns: the inflect.py package (debugged by PGA).
# The original version came from https://pypi.python.org/pypi/inflect;
# PGA made some corrections.
# For unknown verbs: the NLTK Porter stemmer.

import sys
import re

import inflect
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
ie = inflect.engine()

lexicon_file = "lexicon.tcl"


class Canon():

    def __init__(self):
        self.d_lex = {}
        # Keep a dict mapping head nouns to their canonical form.
        # This is needed to regenerate surface forms for canonicalized
        # noun phrases.
        self.d_n2canon = {}
        self.process_lexicon(lexicon_file)

    def process_lexicon(self, lexicon_file):
        # the lexicon should be loaded before creating chart instances
        s_lex = open(lexicon_file, "r")
        for line in s_lex:
            line = line.strip("\n").strip(" ")
            (surface, lemma, pos, pos_detailed, paradigm_info) = line.split(" ")
            # the key is pos <underscore> surface form;
            # pos: verb, noun, conjunction, preposition, adjective, adverb
            key = pos + "_" + surface
            self.d_lex[key] = lemma
        s_lex.close()
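
    # Each lexicon line is assumed to hold five space-separated fields:
    #   surface lemma pos pos_detailed paradigm_info
    # Hypothetical entry (illustration only, not actual lexicon.tcl content):
    #   men man noun NNS irregular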

    # Return the lemma for nouns and verbs found in the lexicon;
    # return None for everything else (other pos are not looked up).
    def get_lemma(self, surface, stanford_pos):
        # create the dict key from the surface form and the Stanford POS tag
        first_char = stanford_pos[0]
        pos = ""
        surface = surface.lower()
        if first_char == "V":
            pos = "verb"
        elif first_char == "N":
            pos = "noun"
        key = pos + "_" + surface
        # dict.get() returns None when the key is absent
        lemma = self.d_lex.get(key)
        return lemma
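
    # Illustrative calls (assuming the relevant lexicon entries exist):
    #   get_lemma("men", "NNS")  -> "man"
    #   get_lemma("runs", "VBZ") -> "run"
    #   get_lemma("red", "JJ")   -> None  (adjectives are never looked up)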

    # reduce a noun to its singular form by stemming only
    def stem_noun(self, word):
        cword = ie.singular_noun(word)
        if cword:
            return cword
        else:
            return word
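
    # e.g. stem_noun("analyses") should yield "analysis"; when inflect
    # already considers a word singular, singular_noun() returns False
    # and the word is returned unchanged.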

    # Canonicalize the head word of an NP.
    # Assumes the NP is already split into a list of words, e.g.
    #   get_canon_l_np(["my", "running", "men"])
    def get_canon_l_np(self, l_np):
        head = l_np[-1]
        lemma = self.get_lemma(head, "N")
        if lemma is not None:
            canon_head = lemma
        else:
            canon_head = head
        """
        Removed the defaulting to stemming.
        If the head word is unknown, do not canonicalize at all, since
        it could be a proper noun or part of a name.
        else:
            # use the inflect.py stemmer
            canon_head = self.stem_noun(head)
        """
        if canon_head != head:
            l_np[-1] = canon_head
        # add the canonical entry to the dictionary mapping surface
        # nouns to canonical nouns
        if head not in self.d_n2canon:
            self.d_n2canon[head] = canon_head
        return l_np
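
    # e.g. get_canon_l_np(["my", "running", "men"]) -> ["my", "running", "man"],
    # assuming "men" has a lexicon entry; an unknown head is left unchanged.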

    def get_canon_v(self, head):
        lemma = self.get_lemma(head, "V")
        canon_head = head
        if lemma is not None:
            canon_head = lemma
        else:
            # do stemming for unknown verbs using the NLTK Porter stemmer
            canon_head = porter_stemmer.stem(head)
        return canon_head
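
    # For a verb missing from the lexicon the Porter stemmer is the fallback,
    # e.g. porter_stemmer.stem("running") -> "run"; note that Porter output
    # is a stem, not necessarily a real word form.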

    # defaults to stemming if the noun is not in the lexicon
    def get_canon_n(self, head):
        lemma = self.get_lemma(head, "N")
        if lemma is not None:
            canon_head = lemma
        else:
            # use the inflect.py stemmer
            canon_head = self.stem_noun(head)
        return canon_head

    # Noun as object of a preposition, e.g.
    #   get_canon_l_npr(["men", "at"])
    #   get_canon_l_npr(["dialysis", "in"])
    def get_canon_l_npr(self, l_npr):
        head = l_npr[0]
        canon_head = self.get_canon_n(head)
        if canon_head != head:
            l_npr[0] = canon_head
        return l_npr

    # Verb followed by a preposition, e.g.
    #   get_canon_l_vp(["running", "at"])
    def get_canon_l_vp(self, l_vp):
        head = l_vp[0]
        canon_head = self.get_canon_v(head)
        # replace the head in the phrase if the canonical form differs
        if canon_head != head:
            l_vp[0] = canon_head
        return l_vp

    # noun or noun phrase
    def get_canon_np(self, np):
        l_np = np.split(" ")
        return " ".join(self.get_canon_l_np(l_np))

    # verb, or verb followed by a preposition
    def get_canon_vp(self, vp):
        l_vp = vp.split(" ")
        return " ".join(self.get_canon_l_vp(l_vp))

    # noun as object of a preposition
    def get_canon_npr(self, npr):
        l_npr = npr.split(" ")
        return " ".join(self.get_canon_l_npr(l_npr))
"""
canonicalize the head term of features of the form
prev_VNP=disclosed_in|application|by
prev_V=uniting_without
prev_Npr=controllers_for
last_word=programs
last_word
prev_J
prev_Jpr
prev_Npr
prev_V
prev_VNP
"""
# features contain type=value.
def get_canon_feature(self, feat):
(feat_type, value) = feat.split("=")
#pdb.set_trace()
# sometimes the porter stemmer generates an error:
# UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 9: ordinal not in range(128)
try:
if feat_type[0:6] == "prev_V":
# handle the optional separator "_"
l_value = value.split("_")
verb = self.get_canon_v(l_value[0])
l_value[0] = verb
return(feat_type + "=" + "_".join(l_value))
elif feat_type in ( "prev_N", "prev_Npr", "last_word"):
# handle the optional separator "_"
l_value = value.split("_")
noun = self.get_canon_n(l_value[0])
l_value[0] = noun
return(feat_type + "=" + "_".join(l_value))
else:
return(feat)
except:
# return feat unchanged
return(feat)
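
    # e.g. get_canon_feature("prev_Npr=controllers_for") -> "prev_Npr=controller_for"
    # and get_canon_feature("prev_V=uniting_without") -> "prev_V=unit_without"
    # (Porter stem, assuming "uniting" has no lexicon entry)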


# noise detection
# note the need for the ur'' literal and re.UNICODE for the regex
# to match unicode characters!
re_noise_phrase = re.compile(ur'[,+=.:\\′®±%═≅><≡″→°]', flags=re.UNICODE)
# bibliography names, e.g. "nestle f o"
re_bib_name = re.compile('[a-z]+ [a-z] [a-z]')
# journal names, e.g. "j biol chem"
re_journal_name = re.compile('j[ .][a-z.]+')

# If any of these words appears in a phrase, we reject the phrase as
# incomplete or inappropriate for bracketing analysis.
# u'\u2212' is a type of dash found in doc US20040248097A1
# (year 2000 biomed patents).
illegal_words = set([u'\u2212', u'\u2032', u'\u2550', "−", "-", "'s", "'",
                     "′", "%", "*", "co", "et", "much", "example",
                     "millimeter", "milliliter", "centimeter", "milligram",
                     "liter", "kcal", "mm", "ml", "mg", "kg", "kl", "km",
                     "hr", "μl", "μg", "ng", "μm", "ul", "tm", "am", "pm",
                     "μmole", "mcg/liter", "m2/g", "ml/g", "p/p", "lu/ml",
                     "moles/l", "g/l", "w/cm2", "units/ml", "w/v", "w/w",
                     "g/ml", "u/ml", "mol/kg", "regions/cm2", "cm2/cm2",
                     "mmoles/l", "g/liter", "mol/l", "micromole/gram",
                     "mg/g", "mg/l", "mg/2", "g/kg", "kg/h", "pmole/cm2",
                     "weight/weight", "iu/g", "iu/ml", "grams/day"])

max_legal_word_len = 30


def illegal_word_len_p(phr, max_len=max_legal_word_len):
    min_len = 2
    for word in phr.split(" "):
        if len(word) > max_len or len(word) < min_len:
            return True
    return False


# return True if phr contains illegal punctuation or a word
# matching an illegal word
def illegal_phrase_p(phr):
    # the first character of the phrase should be alphabetic
    if not phr[0].isalpha():
        return True
    if illegal_word_len_p(phr):
        return True
    illegal_punc_p = (bool(re_noise_phrase.search(phr))
                      or bool(re_bib_name.search(phr))
                      or bool(re_journal_name.search(phr)))
    if illegal_punc_p:
        return True
    l_words = phr.split(" ")
    # reject the phrase if it shares any word with the illegal set
    return bool(illegal_words & set(l_words))
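

# Illustrative results given the rules above:
#   illegal_phrase_p("dialysis membrane") -> False
#   illegal_phrase_p("5 mg dose")         -> True  (leading non-alpha character)
#   illegal_phrase_p("dose in mg")        -> True  ("mg" is an illegal word)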


# A feature should have the form name=value.
# Check that one and only one equal sign occurs in the string,
# i.e. "=" should split a feature into exactly two parts.
def illegal_feature_p(feature):
    return len(feature.split("=")) != 2
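
# e.g. illegal_feature_p("prev_V=run") -> False, illegal_feature_p("prev_V") -> True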


if __name__ == "__main__":
    try:
        # which (1-based) tab-separated field is to be filtered?
        filtered_field_idx = int(sys.argv[1]) - 1
    except (IndexError, ValueError):
        # if not specified as an argument, select the first field
        filtered_field_idx = 0
    for line in sys.stdin:
        line = line.strip()
        l_line = line.split("\t")
        term = l_line[filtered_field_idx]
        if not illegal_phrase_p(term):
            print line
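
# Example invocation (hypothetical file names; input is tab-separated with
# the candidate term in the first field):
#   cat candidate_terms.tsv | python canon.py 1 > filtered_terms.tsv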