-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter-amazon.py
52 lines (38 loc) · 1.23 KB
/
filter-amazon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import sys, gzip, json, datetime
sys.path.append("./pylinguistics/pylinguistics/")
import Pylinguistics as pl
import descriptive
import pandas as pd
import numpy as np
# Selection window for reviews, as YYYYMMDD integers; both bounds are exclusive.
MIN_DATE = 20050915
MAX_DATE = 20130924
# The scan stops once the review counter becomes strictly greater than this
# value, so up to MAX_REVIEWS + 1 reviews are counted.
MAX_REVIEWS = 35000
INPUT_PATH = 'reviews-amazon.json.gz'
OUTPUT_PATH = 'experiments/amazon-help.csv.gz'
# Names of the three values appended to each row after the linguistic features.
# NOTE(review): the public Amazon review dumps document `helpful` as
# [helpful_votes, total_votes]; if that is the input used here, the
# 'thumbsdown' column actually holds the total vote count -- confirm.
LABEL_COLUMNS = ['thumbsup', 'thumbsdown', 'stars']


def review_date_key(review_time):
    """Return a 'MM DD, YYYY' review date as a sortable YYYYMMDD integer.

    Raises ValueError when `review_time` does not match that format.
    """
    parsed = datetime.datetime.strptime(review_time, "%m %d, %Y")
    return int(parsed.strftime("%Y%m%d"))


def is_eligible(review_json):
    """Tell whether a review is inside the date window and has any votes.

    `review_json` must provide 'reviewTime' and a two-element 'helpful'
    sequence; the review qualifies when its date lies strictly between
    MIN_DATE and MAX_DATE and the two 'helpful' counters sum to more than 0.
    """
    helpful = review_json['helpful']
    thumbs = int(helpful[0]) + int(helpful[1])
    date = review_date_key(review_json['reviewTime'])
    return MIN_DATE < date < MAX_DATE and thumbs > 0


def build_row(features, feature_names, review_json):
    """Build one output row: stringified features followed by the labels.

    Features are read in `feature_names` order so that every row lines up
    with the CSV header; a review lacking one of those features raises
    KeyError instead of silently producing a misaligned row.
    """
    row = [str(features[name]) for name in feature_names]
    row.append(review_json['helpful'][0])
    row.append(review_json['helpful'][1])
    row.append(int(review_json['overall']))
    return row


def main():
    """Extract linguistic features from eligible reviews and write them as CSV.

    Reads INPUT_PATH line by line (one JSON review per line), keeps the reviews
    accepted by `is_eligible`, computes their Pylinguistics features and writes
    the resulting table, gzip-compressed, to OUTPUT_PATH. Progress is echoed
    to stdout: the row index on success, 'e' when a review could not be
    processed.
    """
    rows = {}
    feature_names = []
    i = 0
    # The context manager closes the archive even when the loop stops early.
    with gzip.open(INPUT_PATH, 'r') as g:
        for line in g:
            review_json = json.loads(line)
            if is_eligible(review_json):
                try:
                    # Compute the feature mapping once per review.
                    features = pl.text(review_json['reviewText'], 'en').getFeatures()
                    if not feature_names:
                        # The first processed review fixes the column order.
                        feature_names = list(features)
                    rows[i] = build_row(features, feature_names, review_json)
                    sys.stdout.write(str(i))
                except Exception:
                    # Best effort: skip reviews that fail, but do not swallow
                    # KeyboardInterrupt / SystemExit like a bare except would.
                    sys.stdout.write('e')
                # NOTE(review): the counter advances for every eligible review,
                # including failed ones, so row indices may have gaps. The
                # original indentation was lost; confirm this placement.
                i += 1
            if i > MAX_REVIEWS:
                break

    reviews = pd.DataFrame.from_dict(rows, orient='index')
    reviews.columns = feature_names + LABEL_COLUMNS
    reviews.to_csv(OUTPUT_PATH, compression='gzip')


if __name__ == "__main__":
    main()