# build_mentions.py

from typing import Optional, Union

from spacy.language import Language
from spacy.tokens import Span, Token, Doc


def build_mention(
    heads: Union[Token, list[Token]],
    nlp: Language,
    extra_detached_dep: Optional[list[str]] = None,
) -> Span:
    '''Build a mention span from the head token(s) of a mention.

    The span is expanded and then trimmed using spaCy's French parse tree.
    extra_detached_dep: additional dependency labels of the head's children
    whose subtrees are excluded from the span, on top of the default
    detached dependencies.
    '''
    rules_analyzer = nlp.get_pipe("coreferee").annotator.rules_analyzer
    if isinstance(heads, Token):
        heads = [heads]
    doc = heads[0].doc
    # POS tags allowed to the left / right of the head tokens inside a mention
    mention_pos_before = ("PROPN", "NOUN", "ADJ", "DET", "NUM")
    mention_pos_after = ("PROPN", "NOUN", "ADJ", "NUM", "PRON", "PART", "ADV", "VERB", "AUX")
    # Dependencies of the head whose subtrees are never part of the mention
    detached_dep = ["appos", "dislocated", "advmod", "obl:mod",
                    "obl:arg", "obl:agent", "obl", "orphan", "parataxis"]
    if extra_detached_dep is not None:
        detached_dep.extend(extra_detached_dep)
    # Widest possible mention: from the left edge of the first head's subtree
    # to the right edge of the last head's subtree
    start = heads[0].left_edge.i
    end = heads[-1].right_edge.i
    # Coordinated siblings that are not themselves heads of this mention are excluded
    siblings = rules_analyzer.get_dependent_siblings(heads[0])
    unincluded_siblings_tokens = set()
    for sibling in siblings:
        if sibling not in heads:
            unincluded_siblings_tokens.update(sibling.subtree)
            unincluded_siblings_tokens.update(doc[sibling.i:end + 1])
    # Copular ("be") clauses are parsed differently: the attribute is the head,
    # so the copula and the subject subtrees have to be trimmed away
    subj_attr_subtree = set()
    cops = [cop for cop in heads[0].children if cop.dep_ == "cop"]
    if cops:
        # Trim all the tokens that belong to the copula or to the subject
        subjs = [s for s in heads[0].children if s.dep_ == "nsubj"]
        cop_subtree = set(cops[0].subtree)
        subj_subtree = set(subjs[0].subtree) if subjs else set()
        subj_attr_subtree = cop_subtree | subj_subtree
        if cops[0].i < heads[0].i:
            subj_attr_subtree.update(doc[start:cops[0].i])
        if cops[0].i > heads[-1].i:
            subj_attr_subtree.update(doc[cops[0].i:end + 1])
        if subjs and subjs[0].i < heads[0].i:
            subj_attr_subtree.update(doc[start:subjs[0].i])
        if subjs and subjs[0].i > heads[-1].i:
            subj_attr_subtree.update(doc[subjs[0].i:end + 1])
    # Tokens attached to the head through a detached dependency (or through an
    # infinitival clause) are excluded, together with everything beyond them
    detached_tokens = set()
    for c in heads[0].children:
        if (c.dep_ in detached_dep or
                (c.dep_ == "acl" and "VerbForm=Inf" in c.morph)):
            detached_tokens.update(c.subtree)
            if c.i < heads[0].i:
                detached_tokens.update(doc[start:c.i])
            if c.i > heads[-1].i:
                detached_tokens.update(doc[c.i:end + 1])
    excluded_tokens = subj_attr_subtree | detached_tokens | unincluded_siblings_tokens
    # Trim the mention
    # left side: shrink while the token is excluded or has a disallowed POS
    # (tokens whose lemma is "-" are kept)
    for i in range(start, heads[0].i, 1):
        if (
            (doc[i].pos_ not in mention_pos_before and doc[i].lemma_ != "-")
            or doc[i] in excluded_tokens
        ):
            start = i + 1
        else:
            break
    # right side
    for j in range(end, heads[-1].i, -1):
        if (
            doc[j].pos_ not in mention_pos_after
            or doc[j] in excluded_tokens
        ):
            end = j - 1
        else:
            break
    return doc[start:end + 1]
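

# --- Usage sketch (not part of the original module) ---------------------------
# A hedged, minimal example of calling build_mention on a single head token.
# The pipeline name "fr_core_news_md" and the sample sentence are assumptions;
# any French spaCy model supported by coreferee should behave the same way.
def _demo_build_mention() -> None:
    import spacy
    import coreferee  # noqa: F401  (registers the "coreferee" pipe factory)

    nlp_fr = spacy.load("fr_core_news_md")
    nlp_fr.add_pipe("coreferee")
    doc = nlp_fr("Le chat noir de la voisine dort sur le canapé.")
    head = next(t for t in doc if t.lemma_ == "chat")
    # Expected to print the noun phrase built around "chat",
    # e.g. "Le chat noir de la voisine" (the exact span depends on the parse)
    print(build_mention(head, nlp_fr))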


def create_mentions(
    doc: Doc,
    nlp: Language,
    add_singletons: bool = False,
    add_coordinated_singletons: bool = False,
) -> dict[Span, int]:
    '''
    Return a dict mapping:
    key: each mention phrase found in the document
    value: the index of the coreference chain it belongs to
    By default only coreferring mentions are included;
    set add_singletons=True to include singleton mentions as well,
    and add_coordinated_singletons=True to also include coordinations of singleton heads.
    '''
    rules_analyzer = nlp.get_pipe("coreferee").annotator.rules_analyzer

    def is_mention_head(token):
        return (rules_analyzer.is_independent_noun(token) or
                rules_analyzer.is_potential_anaphor(token))

    indexed_mentions = {}
    # Track the highest chain index seen so that singletons get fresh indexes
    last_chain_index = 0
    for chain in doc._.coref_chains:
        last_chain_index = chain.index
        for mention in chain:
            mention_heads = [doc[i] for i in mention.token_indexes]
            mention_phrase = build_mention(mention_heads, nlp)
            indexed_mentions[mention_phrase] = chain.index
    if add_singletons:
        for token in doc:
            if not is_mention_head(token):
                continue
            mention_phrase = build_mention([token], nlp)
            if mention_phrase not in indexed_mentions:
                last_chain_index += 1
                indexed_mentions[mention_phrase] = last_chain_index
            if not add_coordinated_singletons:
                continue
            # Also add the coordination of the token with its siblings as a mention
            siblings = rules_analyzer.get_dependent_siblings(token)
            if not siblings or not all(is_mention_head(s) for s in siblings):
                continue
            mention_phrase = build_mention([token] + siblings, nlp)
            if mention_phrase not in indexed_mentions:
                last_chain_index += 1
                indexed_mentions[mention_phrase] = last_chain_index
    return indexed_mentions


def make_new_chains(new_mentions: dict[Span, int]) -> dict[int, list[Span]]:
    '''Group mentions by chain index: chain index -> list of mention spans.'''
    new_chains = {}
    for new_mention, chain_index in new_mentions.items():
        if chain_index in new_chains:
            new_chains[chain_index].append(new_mention)
        else:
            new_chains[chain_index] = [new_mention]
    return new_chains
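

# --- Usage sketch (not part of the original module) ---------------------------
# A hedged, minimal example of the full pipeline: create_mentions collects the
# mention phrases with their chain indexes, and make_new_chains inverts that
# mapping into chain index -> list of mention spans. The pipeline name
# "fr_core_news_md" and the sample sentences are assumptions; any French spaCy
# model supported by coreferee should work the same way.
def _demo_create_and_group_mentions() -> None:
    import spacy
    import coreferee  # noqa: F401  (registers the "coreferee" pipe factory)

    nlp_fr = spacy.load("fr_core_news_md")
    nlp_fr.add_pipe("coreferee")
    doc = nlp_fr("Marie a vu son frère. Elle lui a parlé.")
    mentions = create_mentions(doc, nlp_fr, add_singletons=True)
    for chain_index, spans in make_new_chains(mentions).items():
        print(chain_index, [span.text for span in spans])


if __name__ == "__main__":
    _demo_build_mention()
    _demo_create_and_group_mentions()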