#coding=utf-8
from harvesttext import HarvestText
ht = HarvestText()
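
# Each function below demonstrates one HarvestText feature; the __main__ block at the
# bottom runs them all in sequence.

# New word discovery: collect candidate new words from raw text, together with quality
# statistics that allow the selection to be refined manually.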
def new_word_discover():
    para = "上港的武磊和恒大的郜林,谁是中国最好的前锋?那当然是武磊武球王了,他是射手榜第一,原来是弱点的单刀也有了进步"
    # Returns a pd.DataFrame with a set of quality statistics for the candidate new words,
    # so the selection can still be refined by hand.
    new_words_info = ht.word_discover(para)
    # new_words_info = ht.word_discover(para, threshold_seeds=["武磊"])
    new_words = new_words_info.index.tolist()
    print(new_words)
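
# Registering user-defined words and entities so that segmentation and POS tagging
# treat them as single tokens with the given types.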
def new_word_register():
    new_words = ["落叶球", "666"]
    ht.add_new_words(new_words)   # register them as "new words" in the broad sense
    ht.add_new_entity("落叶球", mention0="落叶球", type0="术语")   # register as an entity of a specific type
    print(ht.seg("这个落叶球踢得真是666", return_sent=True))
    for word, flag in ht.posseg("这个落叶球踢得真是666"):
        print("%s:%s" % (word, flag), end=" ")
def entity_segmentation():
    para = "上港的武磊和恒大的郜林,谁是中国最好的前锋?那当然是武磊武球王了,他是射手榜第一,原来是弱点的单刀也有了进步"
    print("\nadd entity info(mention, type)")
    entity_mention_dict = {'武磊': ['武磊', '武球王'], '郜林': ['郜林', '郜飞机'], '前锋': ['前锋'],
                           '上海上港': ['上港'], '广州恒大': ['恒大'], '单刀球': ['单刀']}
    entity_type_dict = {'武磊': '球员', '郜林': '球员', '前锋': '位置', '上海上港': '球队', '广州恒大': '球队', '单刀球': '术语'}
    ht.add_entities(entity_mention_dict, entity_type_dict)
    print("\nWord segmentation")
    print(ht.seg(para, return_sent=True))   # with return_sent=False, a list of tokens is returned instead
    print("\nPOS tagging with entity types")
    for word, flag in ht.posseg(para):
        print("%s:%s" % (word, flag), end=" ")
    print("\n\nentity_linking")
    for span, entity in ht.entity_linking(para):
        print(span, entity)
    print("Sentence segmentation")
    print(ht.cut_sentences(para))
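
# Sentiment analysis: expand a sentiment dictionary from a handful of seed words
# (pos_seeds / neg_seeds) and use it to score whole sentences.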
def sentiment_dict():
print("\nsentiment dictionary")
sents = ["武磊威武,中超第一射手!",
"武磊强,中超最第一本土球员!",
"郜林不行,只会抱怨的球员注定上限了",
"郜林看来不行,已经到上限了"]
sent_dict = ht.build_sent_dict(sents,min_times=1,pos_seeds=["第一"],neg_seeds=["不行"])
print("%s:%f" % ("威武",sent_dict["威武"]))
print("%s:%f" % ("球员",sent_dict["球员"]))
print("%s:%f" % ("上限",sent_dict["上限"]))
print("\nsentence sentiment")
sent = "武球王威武,中超最强球员!"
print("%f:%s" % (ht.analyse_sent(sent), sent))
def entity_search():
print("\nentity search")
docs = ["武磊威武,中超第一射手!",
"郜林看来不行,已经到上限了。",
"武球王威武,中超最强前锋!",
"武磊和郜林,谁是中国最好的前锋?"]
inv_index = ht.build_index(docs)
print(ht.get_entity_counts(docs, inv_index)) # 获得文档中所有实体的出现次数
print(ht.search_entity("武磊", docs, inv_index)) # 单实体查找
print(ht.search_entity("武磊 郜林", docs, inv_index)) # 多实体共现
# 谁是最被人们热议的前锋?用这里的接口可以很简便地回答这个问题
subdocs = ht.search_entity("#球员# 前锋", docs, inv_index)
print(subdocs) # 实体、实体类型混合查找
inv_index2 = ht.build_index(subdocs)
print(ht.get_entity_counts(subdocs, inv_index2, used_type=["球员"])) # 可以限定类型
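
# Extractive summarization: pick the topK sentences from the documents as a summary.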
def text_summarization():
    # text summarization
    print("\nText summarization")
    docs = ["武磊威武,中超第一射手!",
            "郜林看来不行,已经到上限了。",
            "武球王威武,中超最强前锋!",
            "武磊和郜林,谁是中国最好的前锋?"]
    for doc in ht.get_summary(docs, topK=2):
        print(doc)
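
# Entity co-occurrence network: build a graph whose edges connect entities that
# co-occur in the documents, optionally restricted to certain entity types.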
def entity_network():
print("\nentity network")
# 在现有实体库的基础上随时新增,比如从新词发现中得到的漏网之鱼
ht.add_new_entity("颜骏凌", "颜骏凌", "球员")
docs = ["武磊和颜骏凌是队友",
"武磊和郜林都是国内顶尖前锋"]
G = ht.build_entity_graph(docs)
print(dict(G.edges.items()))
G = ht.build_entity_graph(docs, used_types=["球员"])
print(dict(G.edges.items()))
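
# Persistence: save the current HarvestText object to disk, load it back, and clear it.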
def save_load_clear():
    from harvesttext import loadHT, saveHT
    para = "上港的武磊和恒大的郜林,谁是中国最好的前锋?那当然是武磊武球王了,他是射手榜第一,原来是弱点的单刀也有了进步"
    saveHT(ht, "ht_model1")
    ht2 = loadHT("ht_model1")
    print("cut with loaded model")
    print(ht2.seg(para))
    ht2.clear()
    print("cut with cleared model")
    print(ht2.seg(para))
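
# Bundled resources: a sentiment lexicon, Baidu stopwords, the text of 三国演义
# (Romance of the Three Kingdoms) and an entity dictionary for it.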
def load_resources():
    from harvesttext.resources import get_qh_sent_dict, get_baidu_stopwords, get_sanguo, get_sanguo_entity_dict
    sdict = get_qh_sent_dict()   # {"pos": [positive words...], "neg": [negative words...]}
    print("pos_words:", list(sdict["pos"])[10:15])
    print("neg_words:", list(sdict["neg"])[5:10])
    stopwords = get_baidu_stopwords()
    print("stopwords:", list(stopwords)[5:10])
    docs = get_sanguo()   # list of texts, one element per chapter
    print("三国演义最后一章末16字:\n", docs[-1][-16:])
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    print("刘备 指称:", entity_mention_dict["刘备"])
    print("刘备 类别:", entity_type_dict["刘备"])
    print("蜀 类别:", entity_type_dict["蜀"])
    print("益州 类别:", entity_type_dict["益州"])
def linking_strategy():
    ht0 = HarvestText()

    def test_case(text0, entity_mention_dict, strategy, entity_type_dict=None, **kwargs):
        ht0.add_entities(entity_mention_dict, entity_type_dict)
        ht0.set_linking_strategy(strategy, **kwargs)
        print(ht0.entity_linking(text0))
        ht0.clear()

    # "latest" strategy: an ambiguous mention links to the entity mentioned most recently
    test_case('X老师您好。请问老师这题怎么做?',
              entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
              strategy="latest"
              )
    test_case('谢谢老师',
              entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
              strategy="latest",
              lastest_mention={"老师": "X老师"})
    # "freq" strategy: one surface form shared by several entities, resolved by entity frequency
    test_case('市长',
              entity_mention_dict={"A市长": ["市长"], "B市长": ["市长"]},
              strategy="freq",
              entity_freq={"A市长": 5, "B市长": 3})
    # "freq" strategy: overlapping surface forms
    test_case('xx市长江yy',
              entity_mention_dict={"xx市长": ["xx市长"], "长江yy": ["长江yy"]},
              strategy="freq",
              entity_freq={"xx市长": 3, "长江yy": 5})
    test_case('我叫小沈阳',
              entity_mention_dict={"沈阳": ["沈阳"], "小沈阳": ["小沈阳"]},
              strategy="freq",
              entity_type_dict={"沈阳": "地名", "小沈阳": "人名"},
              type_freq={"地名": -1})
def find_with_rules():
    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith
    # some more patterns are provided in harvesttext.match_patterns
    text0 = "我喜欢Python,因为requests库很适合爬虫"
    ht0 = HarvestText()
    found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish()], type0="英文名")
    print(found_entities)
    print(ht0.posseg(text0))
    print(ht0.mention2entity("Python"))

    # Satisfying any one of the rules in the list
    ht0.clear()
    found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish(), Contains("爬")], type0="技术")
    print(found_entities)
    print(ht0.posseg(text0))

    # Satisfying several rules at once [combined in a tuple]
    ht0.clear()
    found_entities = ht0.find_entity_with_rule(text0, rulesets=[(AllEnglish(), UpperFirst())], type0="专有英文词")
    print(found_entities)
    print(ht0.posseg(text0))
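
# Ego graphs: build and plot the word / entity neighborhood of "刘备" in the first
# chapter of 三国演义, using networkx and matplotlib.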
def build_word_ego_graph():
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']   # step 1: use a sans-serif font that can render Chinese
    plt.rcParams['axes.unicode_minus'] = False     # step 2: fix rendering of the minus sign on the axes
    from harvesttext.resources import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords
    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    G = ht0.build_word_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2, stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
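
# Typed domain words: load the THUOCL domain lexicon and Baidu stopwords, then POS-tag
# a sentence so that domain words carry their domain type and stopwords are filtered out.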
def using_typed_words():
    from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords
    ht0 = HarvestText()
    typed_words, stopwords = get_qh_typed_words(), get_baidu_stopwords()
    ht0.add_typed_words(typed_words)
    print("加载清华领域词典,并使用停用词")
    print("全部类型", typed_words.keys())
    sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
    print(sentence)
    print(ht0.posseg(sentence, stopwords=stopwords))
    print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
if __name__ == "__main__":
    new_word_discover()
    new_word_register()
    entity_segmentation()
    sentiment_dict()
    entity_search()
    text_summarization()
    entity_network()
    save_load_clear()
    load_resources()
    linking_strategy()
    find_with_rules()
    using_typed_words()
    build_word_ego_graph()