forked from ZiwenZhuang/Web-Text_mining_Project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Prasers.py
253 lines (220 loc) · 10.5 KB
/
Prasers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import nltk
from nltk.util import ngrams
from collections import defaultdict
import math
import re
class MainRanker():
    '''
    This class encapsulates all methods that rank the topics of a set of documents.

    The documents are the files filename_main + str(i) + '.txt' for every i in
    range(num_files).  Besides __init__() and extract_words(), every public
    method name ends with "rank"; each of them returns a list of
    (keyword, score) tuples sorted in descending order of score, i.e. starting
    with the hottest topic.  The score is an occurrence count (int) for every
    ranker except tfidf_rank(), where it is a tf-idf weight (float).
    You can choose whether to plot the distribution when initializing the
    object or, per call, when running a ranking.
    '''

    def __init__(self, filename_main, num_files, will_plot= False, hottest= 10):
        '''
        filename_main: common prefix of the document paths; document i is read
            from filename_main + str(i) + '.txt'
        num_files: number of documents (indexed 0 .. num_files - 1)
        will_plot: default for whether the rankers plot their distribution
        hottest: default number of hottest topics returned by the rankers
        '''
        self.filename_main = filename_main
        self.num_files = num_files
        self.will_plot = will_plot
        self.hottest = hottest

    # ------------------------------------------------------------------
    # private helpers shared by the rankers
    # ------------------------------------------------------------------
    def _document_paths(self):
        '''Return the path of every input document, in index order.'''
        return [self.filename_main + str(i) + '.txt' for i in range(self.num_files)]

    def _resolve_options(self, mywill_plot, myhottest):
        '''Replace per-call options left as None by the defaults given to __init__().'''
        if mywill_plot is None:
            mywill_plot = self.will_plot
        if myhottest is None:
            myhottest = self.hottest
        return mywill_plot, myhottest

    @staticmethod
    def _parse_stemmer_arguments(para):
        '''
        Turn the source text of a call's argument list (e.g. "'english'") into
        (positional_list, keyword_dict).  Only Python literals are accepted, so
        no code from the string is ever executed.
        Raises ValueError (or SyntaxError) when para is not a literal argument list.
        '''
        import ast  # local import: only needed when constructor arguments are supplied
        call = ast.parse('_(' + para + ')', mode='eval').body
        if not (isinstance(call, ast.Call) and isinstance(call.func, ast.Name)):
            raise ValueError('para is not a plain argument list: ' + para)
        args = [ast.literal_eval(node) for node in call.args]
        kwargs = {}
        for keyword in call.keywords:
            if keyword.arg is None:
                raise ValueError('** unpacking is not supported in para: ' + para)
            kwargs[keyword.arg] = ast.literal_eval(keyword.value)
        return args, kwargs

    @staticmethod
    def _make_stemmer(stemmer_name, para= None):
        '''
        Build the stemmer nltk.<stemmer_name>Stemmer(<para>).

        stemmer_name: e.g. 'Porter' or 'Lancaster'
        para: optional source text of the constructor arguments (literals only);
            None means "no arguments"
        Raises AttributeError when nltk has no such stemmer.
        '''
        # Walk dotted names one attribute at a time instead of eval()-ing a
        # string assembled from caller input.
        factory = nltk
        for attribute in (stemmer_name + 'Stemmer').split('.'):
            factory = getattr(factory, attribute)
        if para is None:
            return factory()
        args, kwargs = MainRanker._parse_stemmer_arguments(para)
        return factory(*args, **kwargs)

    @staticmethod
    def _tfidf_weights(term_counts, document_counts, num_documents):
        '''
        Return {term: tf * idf} with tf = term_counts[term] and
        idf = log(num_documents / (1 + document_counts[term])).
        '''
        return {
            term: count * math.log(num_documents / float(1 + document_counts[term]))
            for term, count in term_counts.items()
        }

    @staticmethod
    def _finish_ranking(freq_words, description, will_plot, hottest):
        '''
        Sort freq_words by score (descending), print description followed by the
        top score, optionally plot, and return the first `hottest` entries.
        An empty distribution reports 0, is not plotted and yields [].
        '''
        ranked = sorted(freq_words.items(), key=lambda item: item[1], reverse=True)
        top_score = ranked[0][1] if ranked else 0
        print(description + str(top_score))
        if will_plot and ranked:
            freq_words.plot(hottest)
        return ranked[:hottest]

    # ------------------------------------------------------------------
    # public interface
    # ------------------------------------------------------------------
    def extract_words(self, filename, will_lower= True)->list:
        '''
        This method returns the list of words found in the specific file.
        The file is read as UTF-8 text, tokenized with nltk.word_tokenize and
        only purely alphabetical tokens are kept.
        will_lower argument specifies whether to change all uppercase char into lowercase
        '''
        with open(filename, 'r', encoding='utf-8') as f:
            # flatten the text to a single line before tokenizing
            raw_text = f.read().replace('\n', ' ').replace('\r', '')
        tokens = nltk.word_tokenize(raw_text)
        if will_lower:
            tokens = [w.lower() for w in tokens]
        # extract only alphabetical words
        return [w for w in tokens if w.isalpha()]

    def simple_BOW_rank(self, mywill_plot= None, myhottest= None):
        '''
        This method applies the simple bag of words procedure and returns the hot topics.
        It does not remove any stop words.
        mywill_plot / myhottest: per-call overrides of the __init__() defaults
        '''
        mywill_plot, myhottest = self._resolve_options(mywill_plot, myhottest)
        # generate a frequency dictionary for all tokens
        freq_words = nltk.FreqDist()
        for path in self._document_paths():
            freq_words.update(self.extract_words(path))
        return self._finish_ranking(
            freq_words,
            'Applying simple bag of words method with max frequency: ',
            mywill_plot, myhottest)

    def BOW_stem_stop_rank(self, mywill_plot= None, myhottest= None, stemmer_name= 'Porter'):
        '''
        This method applies bag-of-words and then stemming and stop words removal.
        You can choose the stemmer by specifying the stemmer_name attribute. You have
        at least the following choices:
            Porter
            Lancaster
        mywill_plot / myhottest: per-call overrides of the __init__() defaults
        '''
        mywill_plot, myhottest = self._resolve_options(mywill_plot, myhottest)
        # a set makes the per-token stop word test O(1)
        stopwords = set(nltk.corpus.stopwords.words('english'))
        stemmer = self._make_stemmer(stemmer_name)
        # count the stem of every token that is not a stop word
        freq_words = nltk.FreqDist()
        for path in self._document_paths():
            words = self.extract_words(path)
            freq_words.update(stemmer.stem(w) for w in words if w not in stopwords)
        return self._finish_ranking(
            freq_words,
            'Applying bag-of-words and stemming and stopword removal with max frequency: ',
            mywill_plot, myhottest)

    def POS_rank(self, focus, mywill_plot= None, myhottest= None):
        '''
        This method applies the part-of-speech tagging approach.
        focus: it should be a string and specifies which kind of tags this program should focus
        on, for example:
            NN: Noun, singular or mass
            NNP: Proper noun, singular
            NNS: Noun, plural
            NNPS: Proper noun, plural
        mywill_plot / myhottest: per-call overrides of the __init__() defaults
        The returned keywords are Porter stems of the selected words.
        '''
        mywill_plot, myhottest = self._resolve_options(mywill_plot, myhottest)
        stopwords = set(nltk.corpus.stopwords.words('english'))
        stemmer = nltk.PorterStemmer()
        freq_words = nltk.FreqDist()
        for path in self._document_paths():
            # keep the original case: it is evidence for the tagger
            words = self.extract_words(path, False)
            # Tag the original tokens; stemming or dropping stop words first
            # would hand the tagger stems and broken context instead of text.
            with_tags = nltk.pos_tag(words)
            # select words with the specific tag; the stop word list is
            # lower-case, so compare case-insensitively
            freq_words.update(
                stemmer.stem(w) for (w, t) in with_tags
                if t == focus and w.lower() not in stopwords)
        return self._finish_ranking(
            freq_words,
            'Applying POS ranking with max frequency: ',
            mywill_plot, myhottest)

    def ngrams_rank(self, n= 2, mywill_plot= None, myhottest= None, stemmer_name= 'Porter'):
        '''
        Applying the ngrams method to extract tokens and then count and sort to find the hottest topic.
        The argument n means the n-grams: when n=2 (the default value) all tokens will be
        unigrams and bigrams (no intermediate sizes).  With n <= 1 only unigrams are counted.
        Only nouns are counted: a unigram is kept when its tag starts with 'N', an
        n-gram is kept when every one of its words is tagged that way.
        Keywords are stems; the words of an n-gram are joined by single spaces.
        stemmer_name: 'Porter' or 'Lancaster' (see BOW_stem_stop_rank)
        mywill_plot / myhottest: per-call overrides of the __init__() defaults
        '''
        mywill_plot, myhottest = self._resolve_options(mywill_plot, myhottest)
        stopwords = set(nltk.corpus.stopwords.words('english'))
        stemmer = self._make_stemmer(stemmer_name)
        freq_words = nltk.FreqDist()
        for path in self._document_paths():
            words = self.extract_words(path)
            # tag real words, then drop stop words and stem what is left
            kept = [(stemmer.stem(w), t) for (w, t) in nltk.pos_tag(words)
                    if w not in stopwords]
            freq_words.update(w for (w, t) in kept if t.startswith('N'))
            if n > 1:
                # n-grams are built from a finished list, never from a list
                # that is being extended at the same time
                freq_words.update(
                    ' '.join(w for (w, _) in gram)
                    for gram in ngrams(kept, n)
                    if all(t.startswith('N') for (_, t) in gram))
        return self._finish_ranking(
            freq_words,
            'Applying ngrams ranking with max frequency: ',
            mywill_plot, myhottest)

    def tfidf_rank(self, mywill_plot= None, myhottest= None, stemmer_name= 'Porter', para= None, to_remove= None):
        '''
        Based on the previous counting method, adding the tf-idf method to rank the importance of words.
        stemmer_name: name of the nltk stemmer (see BOW_stem_stop_rank)
        para: optional source text of the arguments passed to the stemmer
            constructor, e.g. "'english'"; only literal values are accepted
        to_remove: stems that you explicitly don't want to be counted; they do not
            need to include stop words, which are always removed
        mywill_plot / myhottest: per-call overrides of the __init__() defaults
        The score of a word is count * log(num_files / (1 + number of files containing it)).
        '''
        mywill_plot, myhottest = self._resolve_options(mywill_plot, myhottest)
        stopwords = set(nltk.corpus.stopwords.words('english'))
        excluded = frozenset(to_remove or ())
        stemmer = self._make_stemmer(stemmer_name, para)
        freq_words = nltk.FreqDist()
        # number of documents each stem occurs in, used for the idf
        document_counts = defaultdict(int)
        for path in self._document_paths():
            words = self.extract_words(path)
            stems = [stemmer.stem(w) for w in words if w not in stopwords]
            stems = [w for w in stems if w not in excluded]
            freq_words.update(stems)
            for word in set(stems):
                document_counts[word] += 1
        # replace every raw count by its tf-idf weight; only existing keys are
        # assigned, so the distribution keeps its size and order
        weights = self._tfidf_weights(freq_words, document_counts, self.num_files)
        for word, weight in weights.items():
            freq_words[word] = weight
        return self._finish_ranking(
            freq_words,
            'Applying tf-idf ranking with max weight: ',
            mywill_plot, myhottest)