-
Notifications
You must be signed in to change notification settings - Fork 0
/
rte_features.py
53 lines (45 loc) · 2.11 KB
/
rte_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from string_features import jaccard_sim
def lemmatize(word):
    """Return the Portuguese Snowball stem of *word*.

    A fresh ``PortugueseStemmer`` is created on every call. If the stemmer
    hands back ``None`` the input word is returned unchanged.
    """
    stemmed = nltk.stem.snowball.PortugueseStemmer().stem(word)
    return word if stemmed is None else stemmed
def ne(token):
    """Return True when *token* looks like a named entity.

    This is only a capitalisation heuristic: a token counts as a named
    entity if it is title-cased or written entirely in upper case.
    """
    return bool(token.istitle() or token.isupper())
class RTE_features(object):
    """Lexical features for a text/hypothesis pair.

    The constructor reduces both inputs to sets of tokens and precomputes a
    handful of features from them. Most features are produced by
    ``jaccard_sim`` (imported from ``string_features``; presumably a Jaccard
    similarity score -- confirm against that module).
    """

    def __init__(self, text, hypothesis, lemmatize=False):
        """Build the token sets and the precomputed features.

        Args:
            text: iterable of tokens for the text; it is passed to ``set()``
                (assumed to be a list of token strings -- TODO confirm).
            hypothesis: iterable of tokens for the hypothesis, handled the
                same way as ``text``.
            lemmatize: if falsy, tokens are used as given. If truthy, every
                token is replaced by its lemma first. A callable is used as
                the lemmatizer itself; any other truthy value selects the
                module-level ``lemmatize`` function.
        """
        self.negwords = {'não', 'nao', 'nunca', 'jamais', 'nada', 'nenhum', 'ninguém'}  # 'not', 'no', 'never', 'nothing', 'none', 'nobody'
        # 'can', 'could', 'may', 'might', 'will', 'would', 'must', 'shall' 'should', 'possible', 'possibly'
        self.modalwords = {'podia', 'poderia', 'dever', 'deve', 'devia', 'deverá', 'deveria', 'faria', 'possivel', 'possibilidade', 'possa'}

        self.text_words = set(text)
        self.hypothesis_words = set(hypothesis)

        if lemmatize:
            # The ``lemmatize`` parameter shadows the module-level function of
            # the same name, so calling ``lemmatize(token)`` here with a
            # boolean flag would raise TypeError. A callable argument is used
            # directly; otherwise fall back to the module-level function via
            # a helper whose scope is not shadowed.
            to_lemma = lemmatize if callable(lemmatize) else self._default_lemmatize
            self.text_words = {to_lemma(token) for token in self.text_words}
            self.hypothesis_words = {to_lemma(token) for token in self.hypothesis_words}

        # Tokens present on both sides / only in the hypothesis.
        self._overlap = self.text_words & self.hypothesis_words
        self._hypothesis_extra = self.hypothesis_words - self.text_words

        # NOTE(review): _text_extra and _non_common are computed by the very
        # same call; confirm whether one of them was meant to be something
        # else (e.g. a set difference like _hypothesis_extra).
        self._text_extra = jaccard_sim(self.text_words, self.hypothesis_words)
        self._non_common = jaccard_sim(self.text_words, self.hypothesis_words)

        # Same measure restricted to the negation words / modal words that
        # occur on each side.
        self._racio_neg = jaccard_sim(self.negwords & self.text_words,
                                      self.negwords & self.hypothesis_words)
        self._racio_modal = jaccard_sim(self.modalwords & self.text_words,
                                        self.modalwords & self.hypothesis_words)

    @staticmethod
    def _default_lemmatize(token):
        """Lemmatize *token* with the module-level ``lemmatize`` function."""
        # No local name shadows ``lemmatize`` here, so this resolves to the
        # module-level function at call time.
        return lemmatize(token)

    def overlap(self, token_type, debug=False):
        """Return the overlap feature for the requested kind of token.

        Args:
            token_type: ``'ne'`` for named entities (as judged by ``ne``) or
                ``'word'`` for the remaining overlap.
            debug: when true, print the value before returning it.

        Returns:
            For ``'ne'``: ``jaccard_sim`` of the named-entity tokens of the
            text and of the hypothesis. For ``'word'``: ``self._overlap``
            minus that value.

        Raises:
            ValueError: if ``token_type`` is neither ``'ne'`` nor ``'word'``.
        """
        # Validate first so a bad token_type always yields ValueError and no
        # feature computation is attempted for it.
        if token_type not in ('ne', 'word'):
            raise ValueError("Type not recognized:'%s'" % token_type)

        ne_in_text = {token for token in self.text_words if ne(token)}
        ne_in_hypo = {token for token in self.hypothesis_words if ne(token)}
        ne_overlap = jaccard_sim(ne_in_text, ne_in_hypo)

        if token_type == 'ne':
            if debug:
                print("ne_overlap", ne_overlap)
            return ne_overlap

        # token_type == 'word'
        # NOTE(review): self._overlap is a set, while ne_overlap is whatever
        # jaccard_sim returns. If that is a number this subtraction raises
        # TypeError -- confirm the intended definition of the 'word' feature.
        word_overlap = self._overlap - ne_overlap
        if debug:
            print("word_overlap", word_overlap)
        return word_overlap