-
Notifications
You must be signed in to change notification settings - Fork 11
/
mac_9.py
76 lines (48 loc) · 1.85 KB
/
mac_9.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
##########################
###
###
### Day 3 Lecture, Text as Data Workshop
###
###
###
##########################
##The stemmed positive/negative word lists below are converted to sets for fast membership tests.
import re, os
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
# Read the press-release index into memory, one raw text line per list entry.
# The `with` block closes the handle as soon as the lines have been read;
# the name `out` stays bound (to the closed file) afterwards.
with open('/users/justingrimmer/Dropbox/HouseData/NewListPress.csv', 'r') as out:
    press = out.readlines()
# Download the positive and negative sentiment word lists, treating each line
# of the response as one entry. Splitting on '\n' can leave an empty string as
# the final entry when the text ends with a newline.
pos_words = urlopen('http://www.unc.edu/~ncaren/haphazard/positive.txt').read().split('\n')
neg_words = urlopen('http://www.unc.edu/~ncaren/haphazard/negative.txt').read().split('\n')
from nltk import PorterStemmer
from nltk import word_tokenize
# Stem both sentiment lexicons with the Porter stemmer so they can be compared
# against stemmed document tokens. Sets give constant-time membership tests and
# collapse words that share a stem.
st = PorterStemmer()
pos_stem = set(map(st.stem, pos_words))
neg_stem = set(map(st.stem, neg_words))
# Download the SMART stop-word list (one entry per line) and stem it the same way.
stop_words = urlopen('http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a11-smart-stop-list/english.stop').read().split('\n')
# Stored as a set rather than a list: every token of every document is tested
# against it, and a set also stays reusable where `map` returns a one-shot iterator.
stop_stemmed = set(map(st.stem, stop_words))
# Optional filters (disabled): drop stop words from the sentiment lexicons.
#pos_stem = [x for x in pos_stem if x not in stop_stemmed]
#neg_stem = [x for x in neg_stem if x not in stop_stemmed]
##now going through a big collection of press releases
# Create (or truncate) the results CSV and emit its header row. The handle is
# left open in `output` so that the scoring rows can be appended afterwards.
output = open('/users/justingrimmer/dropbox/teaching/text/tad14/class4/ScorePress.csv', 'w')
output.write('Document,Num_Words,Pos_Words,Neg_Words' + '\n')
def get_overlap(s, reference_set):
    """Return how many elements `s` and `reference_set` have in common.

    The two arguments are combined with the ``&`` operator, so both are
    expected to be set-like objects.
    """
    shared = s & reference_set
    return len(shared)
# Score every press release listed in `press` and write one CSV row per
# document: Document id, non-stop-word token count, positive-lexicon hits,
# negative-lexicon hits (matching the four-column header of the output file).
non_word = re.compile(r'\W')      # loop-invariant: compiled once
stop_lookup = set(stop_stemmed)   # O(1) membership regardless of how the stop list was built
try:
    # press[0] is skipped (presumably the CSV header row -- confirm against the
    # index file); z is the 1-based row number and doubles as the Document id.
    for z, row in enumerate(press[1:], 1):
        # The path of the document's text file is the last comma-separated field.
        temp = row.strip('\n').split(',')[-1]
        with open(temp, 'r') as doc:
            text = doc.read()
        # Lower-case, replace every non-word character with a space, tokenize,
        # then stem with the shared Porter stemmer `st`.
        cleaned = non_word.sub(' ', text.lower())
        # Built as a list because the stems are scanned three times below.
        stems = [st.stem(tok) for tok in word_tokenize(cleaned)]
        num_words = sum(1 for x in stems if x not in stop_lookup)
        # Distinct names keep the `pos_words` / `neg_words` lexicon lists intact,
        # so the lexicon-building lines can be re-run after this loop.
        num_pos = sum(1 for x in stems if x in pos_stem)
        num_neg = sum(1 for x in stems if x in neg_stem)
        # Every field is comma-separated, including the document id.
        part = ','.join(str(v) for v in (z, num_words, num_pos, num_neg))
        output.write(part + '\n')
        # Progress indicator every 100 documents.
        if z % 100 == 0:
            print(z)
finally:
    # Flush and close the results file even if a document fails to load.
    output.close()
##This creates the figures (the per-document word counts); we can then use the output.