forked from kwsy/FictionAnalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hibiscusMain.py
101 lines (83 loc) · 3.44 KB
/
hibiscusMain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#coding=utf-8
'''
Created on 2016-1-23
@author: kwsy
'''
import os
import hibiscusTools
import codecs
from xlwt import Workbook
import sys
class Hibiscus():
    '''
    Collect candidate words from a GBK-encoded text file and score them.

    After analyseNovel() has run, the instance exposes:
      novelInfo -- dict mapping each candidate word to a record with the keys
                   'word', 'count', 'leftLst', 'rightLst', 'wordindexLst',
                   'solidification' and 'freedom'.
      charCount -- summed length of every text run returned by
                   hibiscusTools.getAllChineseCharacters().
    '''

    # A word is exported by outExcel() only when it is strictly above each of
    # these thresholds (and is longer than one character).
    MIN_COUNT = 30
    MIN_SOLIDIFICATION = 50
    MIN_FREEDOM = 3

    # Starting value for the minimum search in getSolidification(); it is also
    # the value returned when splitWord() yields no splits, and therefore an
    # upper bound on every solidification score.
    MAX_SOLIDIFICATION = 10000000

    def analyseNovel(self, filename):
        '''
        Read `filename` (decoded as GBK), gather per-word statistics into
        self.novelInfo, set self.charCount and then score every word.

        A missing or unreadable file is reported by codecs.open() itself; the
        previous explicit existence check was a no-op and has been dropped.
        '''
        with codecs.open(filename, encoding='GBK') as novel:
            content = novel.read()
        txtlist = hibiscusTools.getAllChineseCharacters(content)
        self.novelInfo = {}
        index = 0
        for txt in txtlist:
            # `index` is the running offset handed to getLatentword(); it is
            # advanced by the length of each run before the items are consumed.
            itemlst = hibiscusTools.getLatentword(txt, index)
            index += len(txt)
            for item in itemlst:
                word = item['word']
                info = self.novelInfo.get(word)
                if info is None:
                    info = {'leftLst': [], 'rightLst': [], 'wordindexLst': [],
                            'count': 0, 'word': word}
                    self.novelInfo[word] = info
                # Neighbours are optional; only record the ones that exist.
                if item['left'] is not None:
                    info['leftLst'].append(item['left'])
                if item['right'] is not None:
                    info['rightLst'].append(item['right'])
                info['wordindexLst'].append(item['wordindex'])
                info['count'] += 1
        self.charCount = index
        self.calculte()

    def outExcel(self, filename):
        '''
        Write the words that pass the class thresholds to an .xls workbook.

        The workbook is saved in the current directory and named after the
        base name of `filename` with its extension replaced by '.xls'.
        Rows are ordered by descending occurrence count.
        '''
        wb = Workbook()
        table = wb.add_sheet('新词')
        # Header row: word, occurrence count, solidification, freedom.
        for col, title in enumerate(('单词', '出现次数', '凝结度', '自由度')):
            table.write(0, col, title)

        selected = [
            info for word, info in self.novelInfo.items()
            if info['count'] > self.MIN_COUNT
            and len(word) > 1
            and info['solidification'] > self.MIN_SOLIDIFICATION
            and info['freedom'] > self.MIN_FREEDOM
        ]
        selected.sort(key=lambda x: x['count'], reverse=True)

        # Data rows start at 1 because row 0 holds the header.
        for row, item in enumerate(selected, 1):
            table.write(row, 0, item['word'])
            table.write(row, 1, item['count'])
            table.write(row, 2, item['solidification'])
            table.write(row, 3, item['freedom'])
        wb.save('./' + os.path.splitext(os.path.basename(filename))[0] + '.xls')

    def calculte(self):
        '''
        Fill in the 'solidification' and 'freedom' fields of every record in
        self.novelInfo. Requires self.charCount to be set.
        '''
        # Only the nested records are mutated, so iterating items() is safe.
        for word, info in self.novelInfo.items():
            info['solidification'] = self.getSolidification(word)
            info['freedom'] = self.getFreedom(info)

    def getFreedom(self, wordinfo):
        '''
        Return the smaller of the two values hibiscusTools.calculateFreedom()
        gives for the word's left-neighbour and right-neighbour lists.
        '''
        leftfreedom = hibiscusTools.calculateFreedom(wordinfo['leftLst'])
        rightfreedom = hibiscusTools.calculateFreedom(wordinfo['rightLst'])
        return leftfreedom if leftfreedom < rightfreedom else rightfreedom

    def getSolidification(self, word):
        '''
        Return the lowest ratio P(word) / (P(left) * P(right)) over every
        (left, right) pair produced by hibiscusTools.splitWord(word), where
        P(x) is count(x) / self.charCount.

        The result never exceeds MAX_SOLIDIFICATION, which is also returned
        when splitWord() yields nothing. Raises KeyError if a split part has
        no record in self.novelInfo.
        '''
        total = float(self.charCount)
        probability = float(self.novelInfo[word]['count']) / total
        lowest = self.MAX_SOLIDIFICATION
        for item in hibiscusTools.splitWord(word):
            left, right = item[0], item[1]
            leftcount = self.novelInfo[left]['count']
            rightcount = self.novelInfo[right]['count']
            together = probability / (
                (float(rightcount) / total) * (float(leftcount) / total))
            if together < lowest:
                lowest = together
        return lowest
def excute(name):
    '''
    Analyse one novel and export its word table.

    `name` may be a file name or an argv-style list/tuple whose first element
    is the file name (the form passed by the __main__ guard). When `name`
    carries no file name, sys.argv[1] is used, as this function always did
    before it started honouring its argument.
    '''
    if isinstance(name, (list, tuple)):
        filename = name[0] if name else None
    else:
        filename = name
    if not filename:
        # Historical fallback; raises IndexError when no argument was given.
        filename = sys.argv[1]
    hibi = Hibiscus()
    hibi.analyseNovel(filename)
    hibi.outExcel(filename)


if __name__ == '__main__':
    # Pass the command-line arguments without the script name.
    excute(sys.argv[1:])