-
Notifications
You must be signed in to change notification settings - Fork 20
/
WUSTLCode_3.py
72 lines (45 loc) · 1.73 KB
/
WUSTLCode_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
##########################
###
###
### Day 3 Lecture, Text as Data Workshop
###
###
###
##########################
import re, os
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
## Read the press-release index. Each element of `press` is one raw line of
## the CSV (newline included); the scoring loop below skips row 0 and uses the
## last comma-separated field of every other row as a file path.
## The `with` block closes the handle as soon as the lines are read.
with open('/users/justingrimmer/Dropbox/HouseData/NewListPress.csv', 'r') as out:
    press = out.readlines()
## Download the positive and negative word lists and split them on newlines.
pos_raw = urlopen('http://www.unc.edu/~ncaren/haphazard/positive.txt').read()
pos_words = pos_raw.split('\n')
neg_raw = urlopen('http://www.unc.edu/~ncaren/haphazard/negative.txt').read()
neg_words = neg_raw.split('\n')

from nltk import PorterStemmer
from nltk import word_tokenize

## Porter-stem both lists so they can be compared with stemmed document tokens.
st = PorterStemmer()
pos_stem = [st.stem(word) for word in pos_words]
neg_stem = [st.stem(word) for word in neg_words]

## Download the SMART stop-word list and stem it the same way.
stop_raw = urlopen('http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop').read()
stop_words = stop_raw.split('\n')
stop_stemmed = [st.stem(word) for word in stop_words]

## Remove every sentiment stem that also appears among the stop-word stems.
pos_stem = [stem for stem in pos_stem if stem not in stop_stemmed]
neg_stem = [stem for stem in neg_stem if stem not in stop_stemmed]
## Now go through a big collection of press releases and score each one.
## For every document: lowercase the text, replace non-word characters with
## spaces, tokenize, Porter-stem, then count
##   Num_Words - stems that are not stop-word stems
##   Pos_Words - stems found in the positive list
##   Neg_Words - stems found in the negative list
## One CSV row per document is written to ScorePress.csv.

## Sets give constant-time membership tests; build them once, outside the loop.
stop_set = set(stop_stemmed)
pos_set = set(pos_stem)
neg_set = set(neg_stem)

## `with` guarantees the output file is flushed and closed even if a document
## fails to open or tokenize part-way through the run.
with open('/users/justingrimmer/dropbox/teaching/text/tad14/class4/ScorePress.csv', 'w') as output:
    output.write('Document,Num_Words,Pos_Words,Neg_Words')
    output.write('\n')
    ## Row 0 of `press` is skipped (presumably a header row -- confirm against
    ## the index file); z is the row index and doubles as the document id.
    for z in range(1, len(press)):
        ## The last comma-separated field of the row is the path of the release.
        path = press[z].rstrip('\r\n').split(',')[-1]
        with open(path, 'r') as doc:
            text = doc.read()
        cleaned = re.sub(r'\W', ' ', text.lower())
        ## Use the stemmer bound to `st`; the previous `pt` was never defined.
        stems = [st.stem(token) for token in word_tokenize(cleaned)]
        ## Distinct names so the `pos_words` / `neg_words` lists are not clobbered.
        num_words = sum(1 for stem in stems if stem not in stop_set)
        num_pos = sum(1 for stem in stems if stem in pos_set)
        num_neg = sum(1 for stem in stems if stem in neg_set)
        ## Join all four fields with commas so each row matches the header.
        part = ','.join([str(z), str(num_words), str(num_pos), str(num_neg)])
        output.write(part)
        output.write('\n')
        ## Progress indicator every 100 documents.
        if z % 100 == 0:
            print(z)
## This creates the figures (per-document word counts) in ScorePress.csv; we can then use that output.