-
Notifications
You must be signed in to change notification settings - Fork 0
/
references.py
105 lines (92 loc) · 4.43 KB
/
references.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import utils
# --------------------------------------------------------------------------------------------
# The code below resolves pronoun references using a simple "last mentioned person" memory,
# kept separately for male and female entities:
# 1. Scan every tagged sentence word by word, looking for person names and PRP* tags.
# 2. When a name is found, store that person as the last one mentioned for his/her sex
#    (a person whose sex is unknown is kept in a third slot until a gendered pronoun follows).
# 3. When a PRP* word is found (skipping words longer than 5 characters and it/its, we/us/our,
#    they/their), assign it the name of the last memorized person of the matching sex.
# --------------------------------------------------------------------------------------------
class References:
    """Resolve personal pronouns to the most recently mentioned person.

    A tiny "memory" holds the last mentioned male, the last mentioned female
    and the last mentioned person of unknown sex. Each pronoun is resolved
    against that memory; a person of unknown sex gets the sex of the first
    gendered pronoun that follows the mention.
    """

    # PRP-tagged words that never refer to a single person and are skipped.
    _IGNORED_PRONOUNS = ("it", "our", "their", "us", "its", "we", "they")
    # PRP-tagged words longer than this are skipped as well.
    _MAX_PRONOUN_LENGTH = 5

    def get_gender(self, word):
        """Return "female", "male" or "?" for a pronoun (case-insensitive)."""
        lowered = word.lower()
        if lowered in ("she", "her"):
            return "female"
        if lowered in ("he", "him", "his"):
            return "male"
        return "?"

    def find(self, people, sentences, tagged_sentences):
        """Collect pronoun references found in ``tagged_sentences``.

        Args:
            people: passed straight to ``utils.get_names_dict``; the result is
                used as a mapping from a lowercase word to a person record
                whose 'sex' and 'fullname' keys are read here.
                NOTE(review): the exact shape of ``people`` is defined by
                ``utils`` and is not visible from this class.
            sentences: not used by the resolver; kept so existing callers
                keep working.
            tagged_sentences: iterable of sentences, each an iterable of
                ``(word, tag)`` pairs.

        Returns:
            A list of ``[pronoun, fullname, sentence_index]`` lists in text
            order, where ``pronoun`` is lowercased and ``fullname`` is "?"
            when no person could be matched.

        Side effects:
            Overwrites the 'sex' key of a person record whose sex was "?" as
            soon as a pronoun follows the mention, and prints one line to
            stdout each time that happens.
        """
        names = utils.get_names_dict(people)

        refs = []
        # gender -> [person_record, index of the word that last mentioned it]
        memory = {"male": [None, 0], "female": [None, 0]}
        # last person whose sex is still "?" (same [record, index] layout)
        unknown = [None, 0]
        word_index = 0

        for index, sentence in enumerate(tagged_sentences):
            for raw_word, tag in sentence:
                word = raw_word.lower()
                word_index += 1

                if self._is_reference(word, tag):
                    unknown = self._settle_unknown(word, memory, unknown)
                    matched = self._match(word, memory)
                    if matched:
                        # this person has just been mentioned again
                        matched[1] = word_index
                        refs.append([word, matched[0]['fullname'], index])
                    else:
                        refs.append([word, "?", index])
                elif word in names:
                    # a name: remember the person in the slot for his/her sex
                    data = names[word]
                    sex = data["sex"]
                    if sex in memory:
                        memory[sex] = [data, word_index]
                    elif sex == "?":
                        unknown = [data, word_index]

        return refs

    def _is_reference(self, word, tag):
        # True when a lowercased word is a pronoun this resolver handles.
        return (
            tag.startswith("PRP")
            and len(word) <= self._MAX_PRONOUN_LENGTH
            and word not in self._IGNORED_PRONOUNS
        )

    def _settle_unknown(self, word, memory, unknown):
        # Give the pending unknown-sex person the pronoun's gender and
        # return the (possibly cleared) unknown slot.
        if not unknown[0]:
            return unknown

        gender = self.get_gender(word)
        # Single pre-formatted argument: same output on Python 2 and 3.
        print("Last unknown person - %s - was assigned sex: %s"
              % (unknown[0]['fullname'], gender))
        unknown[0]['sex'] = gender

        if gender not in memory:
            # pronoun carries no gender ("i", "me", ...): keep waiting
            return unknown

        current = memory[gender]
        # take over the slot unless a newer person of that sex is remembered
        if not current[0] or current[1] < unknown[1]:
            memory[gender] = unknown
        return [None, 0]

    def _match(self, word, memory):
        # Return the memory slot the pronoun refers to, or None.
        gender = self.get_gender(word)
        if gender in memory:
            slot = memory[gender]
            return slot if slot[0] else None

        if word in ("i", "me"):
            # no gender hint: pick whoever was mentioned most recently
            he, she = memory["male"], memory["female"]
            if he[0] and she[0]:
                return he if he[1] > she[1] else she
            if he[0]:
                return he
            if she[0]:
                return she
        return None