-
Notifications
You must be signed in to change notification settings - Fork 0
/
processing.py
87 lines (64 loc) · 2.77 KB
/
processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
from tqdm import tqdm
import contractions
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re
import argparse
# One-time corpus download; a no-op if the corpus is already cached locally
# (NOTE: performs network I/O on first run).
nltk.download('stopwords')
# English stopword list (a plain list) used by the preprocessing functions below.
stop_words = stopwords.words('english')
# Register tqdm's progress_apply/progress_map methods on pandas objects.
tqdm.pandas()
def preprocess_single(txt):
    """Normalize one review string for modeling.

    Steps: lowercase, expand contractions, strip every character except
    ASCII letters, '!', '?' and spaces, then drop English stopwords and
    Porter-stem the surviving tokens.

    Args:
        txt: raw review text.

    Returns:
        The cleaned, stemmed, space-joined string (may be empty).
    """
    stemmer = PorterStemmer()
    txt = txt.lower()
    txt = contractions.fix(txt)
    # Raw string: the original '\!' / '\?' are invalid escape sequences in a
    # plain string (SyntaxWarning on modern Python); '!' and '?' need no
    # escaping inside a character class, so the pattern is unchanged.
    txt = re.sub(r'[^a-zA-Z!? ]+', '', txt)
    # Membership tests against a set are O(1) vs O(n) on the stopword list.
    stop_set = set(stop_words)
    txt = " ".join(stemmer.stem(word) for word in txt.split() if word not in stop_set)
    return txt
def preprocess_text(df):
    """Apply the review-cleaning pipeline to a whole pandas Series.

    Same steps as preprocess_single — lowercase, expand contractions, keep
    only letters/'!'/'?'/spaces, remove English stopwords, Porter-stem.

    Args:
        df: pandas Series of raw review strings.

    Returns:
        A new Series of cleaned, stemmed strings.
    """
    stemmer = PorterStemmer()
    # Hoist loop invariants: compile the pattern once and build the stopword
    # set once instead of per row (set membership is O(1) vs O(n) on a list).
    # Raw string fixes the invalid '\!' / '\?' escapes of the original;
    # the matched character class is identical.
    pattern = re.compile(r'[^a-zA-Z!? ]+')
    stop_set = set(stop_words)
    df = df.str.lower()
    df = df.apply(contractions.fix)
    df = df.apply(lambda s: pattern.sub('', s))
    df = df.apply(
        lambda s: " ".join(stemmer.stem(w) for w in s.split() if w not in stop_set)
    )
    return df
def preprocess_data(df):
    """Derive cleaned/stemmed variants of the two review text columns.

    Renames review_summary/review_detail to *_orig (in place — the caller's
    DataFrame is mutated), strips embedded newlines, then adds a stemmed
    version and a letters-and-spaces-only version of each column.

    Args:
        df: DataFrame with 'review_summary' and 'review_detail' columns.

    Returns:
        The same DataFrame with the added columns.
    """
    renames = {
        "review_summary": "review_summary_orig",
        "review_detail": "review_detail_orig",
    }
    df.rename(columns=renames, inplace=True)
    # Remove embedded newlines from the raw text.
    for col in ("review_summary_orig", "review_detail_orig"):
        df[col] = df[col].apply(lambda s: re.sub('\n', '', s))
    # Stemmed/stopword-filtered text (keeps '!' and '?').
    for src in ("review_summary", "review_detail"):
        df[src] = preprocess_text(df[src + "_orig"])
    # Fully cleaned text: letters and spaces only.
    for src in ("review_summary", "review_detail"):
        df[src + "_cleaned"] = df[src].apply(lambda s: re.sub('[^a-zA-Z ]+', '', s))
    return df
def extract_helpful_count(df):
    """Add a 'helpful_ratio' column: helpful votes / total votes.

    Each entry of df['helpful'] is a 2-element sequence of count strings,
    possibly with thousands separators (e.g. ['1,234', '5,678']):
    index 0 = helpful votes, index 1 = total votes.

    The original made three passes per column (apply + apply + astype);
    this parses each entry once. Result dtype and values are unchanged.

    Note: a total of 0 yields NaN (0/0) or inf (n/0); a later dropna()
    removes only the NaN rows — TODO confirm inf rows are acceptable.

    Args:
        df: DataFrame with a 'helpful' column as described.

    Returns:
        The same DataFrame with 'helpful_ratio' added.
    """
    def _votes(entry, i):
        # Parse one comma-separated count, e.g. '1,234' -> 1234.
        return int(entry[i].replace(',', ''))

    helpful = df['helpful'].apply(_votes, i=0)
    total = df['helpful'].apply(_votes, i=1)
    df['helpful_ratio'] = helpful / total
    return df
def write_list_to_txt(list_, output_file='output.txt'):
    """Write each element of list_ to output_file, one per line,
    showing a tqdm progress bar while iterating.

    Args:
        list_: iterable of elements to serialize (str() via format).
        output_file: destination path; overwritten if it exists.
    """
    with open(output_file, 'w') as out:
        out.writelines("{}\n".format(item) for item in tqdm(list_))
def parse_arguments(argv=None):
    """Parse command-line arguments for the preprocessing script.

    Args:
        argv: optional explicit argument list. Defaults to None, which makes
            argparse read sys.argv[1:] — identical to the original behavior —
            while allowing the parser to be exercised in unit tests.

    Returns:
        argparse.Namespace with .input (source JSON path) and
        .output (destination CSV path).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', type=str, required=True,
                        help='path to the input JSON file of reviews')
    parser.add_argument('--output', type=str, required=True,
                        help='path for the output CSV file')
    args = parser.parse_args(argv)
    return args
if __name__ == '__main__':
    args = parse_arguments()
    # Load the reviews, cap at 250k rows, and drop incomplete records.
    df = pd.read_json(args.input).head(250000).dropna()
    df = preprocess_data(df)
    df = extract_helpful_count(df)
    # Drop rows where preprocessing produced NaN (e.g. 0/0 helpful ratios).
    df = df.dropna()
    # Remove columns not needed downstream.
    df = df.drop(columns=['helpful', 'reviewer', 'review_date'])
    df.to_csv(args.output, index=False)