wordprocessing.py
import re
from nltk.corpus import stopwords
import enchant
suffix = ['able','ac','acity','ocity','ade','age','aholic','oholic','al','algia','an','ian','ance','ant','ar','ard',
'arian','arium','ary','ate','ation','ative','cide','cracy','crat','cule','cy','cycle','dom','dox','ectomy','ed','ee',
'eer','emia','en','ence','ency','ent','er','ern','escence','ese','esque','ess','est','etic','ette','ful','fy','gam',
'gon','hood','ial','ian','iasis','iatric','ible','ic','ile','ily','ine','ing','ion','ious','ish','ism','ist','ite',
'itis','ity','ive','ization','ize','less','let','like','ling','loger','log','ly','ment','ness','oid','ology','oma',
'onym','opia','opsy','or','ory','osis','ostomy','ous','path','pathy','phile','phobia','phone','phyte','plegia','plegic',
'pnea','s','scopy','scribe','sect','ship','sion','some','sophy','th','tion','tome','trophy','tude','ty','ular','uous',
'ure','ward','ware','wise','y']
# strips known suffixes from the end of a word to approximate its root form
def root_mode(word):
    for s in suffix:
        pattern = re.compile(s + "$")
        if re.search(pattern, word):
            word = re.split(pattern, word)[0]
    return word
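# A minimal sketch of root_mode's behaviour (these example words are illustrative
# only and not part of the original script): suffixes are tried in list order and
# stripping continues on the shortened word, so the result is a crude stem rather
# than a true linguistic root.
# root_mode("worker")   -> "work"   ("er" is stripped)
# root_mode("kindness") -> "kindn"  ("ess" is stripped; "kindn" is not a dictionary
#                                    word, so final_word falls back to counting
#                                    "kindness" itself)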
# parses the text file and returns a list of lowercased words
def parse(file):
    with open(file, 'r') as f:
        return re.findall(r'\w+', f.read().lower())
# removes English stop words from the word list
def stop_words_remove(word_list):
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in word_list if word not in stop_words]
    print(filtered_words)
    print("filtered_words")
    return filtered_words
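# Illustrative only (not part of the original script): with NLTK's English stop
# word list, stop_words_remove(["the", "quick", "fox"]) returns ["quick", "fox"],
# since "the" is a stop word.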
# checks if a word is present in the US English dictionary
def check_valid_word(word):
    d = enchant.Dict("en_US")
    if word != "" and d.check(word) and len(word) > 1 and not is_number(word):
        return word
    return ""
# checks if a string is a number
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
# returns the key with the maximum value in the dictionary
def getMax(dictionary):
    return max(dictionary, key=dictionary.get)
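# Illustrative only (not part of the original script):
# getMax({"work": 3, "word": 5}) -> "word", the key with the highest count;
# ties resolve to whichever maximal key is reached first.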
# returns the final dictionary of root words from the text file with their word counts
def final_word(filename):
    word_list = stop_words_remove(parse(filename))
    print(word_list)
    print("WORD LIST")
    output = {}
    for i in range(0, len(word_list)):
        print(i / len(word_list) * 100)  # progress through the word list, in percent
        if check_valid_word(word_list[i]):
            if check_valid_word(root_mode(word_list[i])):
                # the stripped root is itself a valid word, so count the root
                insert_word = root_mode(word_list[i])
                if insert_word in output:
                    output[insert_word] += 1
                else:
                    output[insert_word] = 1
            else:
                # the stripped root is not a valid word, so count the original word
                if word_list[i] in output:
                    output[word_list[i]] += 1
                else:
                    output[word_list[i]] = 1
    return output
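# Illustrative only (hypothetical input, not part of the original script): a file
# containing just "workers work working" would produce something like
# {"worker": 1, "work": 2} -- "working" loses "ing" and becomes "work", while
# "workers" only loses its trailing "s" because "er" is checked earlier in the
# suffix list and no longer matches afterwards.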
# returns up to the 50 most frequent words in the dictionary
def top50FrequentWords(filename):
    i = 50
    result = []
    output = final_word(filename)
    print(output)
    print(len(output))
    while i != 0 and len(output) != 0:
        top = getMax(output)
        result.append(top)
        output.pop(top, None)
        i -= 1
    print("TOP 50 Words")
    return result
print(top50FrequentWords('/Users/Jayalakshmi/Desktop/text2.txt'))
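# To run the same pipeline on another file (the path below is hypothetical), call
# the same entry point with its location; the script prints the filtered word list,
# a progress percentage per word, the root-word counts, and finally up to 50 of the
# most frequent root words.
# print(top50FrequentWords('sample.txt'))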