-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphase4.py
36 lines (35 loc) · 1.08 KB
/
phase4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import os,string,json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
# Build test.json: for every file under <cwd>/input/test/<category>/, strip
# the text down to filtered, lower-cased words and store the result as
#     bag[category]["<category>_<filename>"] = [cleaned_text]
# The whole mapping is then written to <cwd>/input/jsons/test.json.

# Stop list: NLTK's English stopwords plus every ASCII punctuation character.
stop = stopwords.words('english') + list(string.punctuation)
# Set copy of the stop list so the per-word membership test is O(1).
stop_lookup = frozenset(stop)

cwd = os.getcwd()
test = cwd + "/input/test/"
categories = os.listdir(test)


def _clean_document(text):
    """Return the filtered, lower-cased words of *text* as one string.

    Characters other than alphanumerics, spaces and newlines are dropped
    first. A word is then kept only if its lower-cased form is not in the
    stop list and WordNet returns at least one synset for it. Every kept
    word is followed by a single space, so a non-empty result ends with a
    trailing space.
    """
    kept_chars = "".join(
        ch for ch in text if ch.isalnum() or ch == " " or ch == "\n"
    )
    kept_words = []
    for word in kept_chars.split():
        lowered = word.lower()
        # Compare the lower-cased form: testing the raw token let
        # capitalised stopwords ("In", "A") through, and they were then
        # emitted in lower case anyway.
        # NOTE(review): assumes the NLTK English stop list is lower-case.
        # The cheap set lookup runs before the costlier WordNet query.
        if lowered not in stop_lookup and wordnet.synsets(word):
            kept_words.append(lowered + " ")
    return "".join(kept_words)


bag = {}
cnt = 0  # number of files fully processed so far (reported before each file)
for category in categories:
    col = {}
    category_dir = test + category + "/"
    for filename in os.listdir(category_dir):
        # Single-argument print() behaves the same on Python 2 and 3; the
        # join reproduces the spacing of the original comma-separated print.
        print(" ".join([
            str(category),
            "Processing File: ",
            str(filename),
            ". Number of Files Processed: ",
            str(cnt),
        ]))
        # The with-block closes the file; no explicit close() is needed.
        with open(category_dir + filename, 'r') as file1:
            raw_text = file1.read()
        # Each entry is a one-element list holding the cleaned document.
        col[str(category) + "_" + str(filename)] = [_clean_document(raw_text)]
        cnt += 1
    bag[category] = col

with open(cwd + r'/input/jsons/' + r'test.json', 'w') as file1:
    json.dump(bag, file1, sort_keys=True, indent=4, separators=(',', ': '))