-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.py
157 lines (140 loc) · 6.5 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import json
import re
from datetime import datetime
import pandas as pd
# A line containing this text closes the review that is currently being read.
_REVIEW_END_MARKER = "This review is the subjective opinion of a Tripadvisor member and not of Tripadvisor LLC. Tripadvisor performs checks on reviews as part of our industry-leading trust & safety standards. Read our transparency report to learn more."
# A line containing this text opens a management reply, which is skipped.
_REPLY_START_MARKER = "ConserveForests1"
# A line containing this text closes a management reply.
_REPLY_END_MARKER = "This response is the subjective opinion of the management representative and not of Tripadvisor LLC."
# Travel types recognised from a line ending in "• <type>".
_TRAVEL_TYPES = ("Family", "Friends", "Couples", "Solo")
# The count may carry thousands separators ("1,234 contributions").
_CONTRIBUTIONS_RE = re.compile(r"(\d+(?:,\d{3})*)\s(?:contribution|contributions)")
_WRITTEN_DATE_RE = re.compile(r"Written (\w+ \d{1,2}, \d{4})")
# Full month names first, abbreviated month names as a fallback.
_WRITTEN_DATE_FORMATS = ("%B %d, %Y", "%b %d, %Y")


def _new_review_state():
    """Return the empty per-review fields used while a review is being read."""
    return {
        "content_lines": [],
        "written_date": None,
        "travel_type": None,
        "num_contributions": None,
        "personal_info": None,
        "likes": None,
    }


def _build_review(review_id, state):
    """Turn the accumulated per-review fields into an output record."""
    return {
        "review_id": review_id,
        "review_content": "".join(state["content_lines"]).strip(),
        "written_date": state["written_date"],
        "travel_type": state["travel_type"],
        "num_contributions": state["num_contributions"],
        "personal_info": state["personal_info"],
        "likes": state["likes"],
    }


def _match_travel_type(stripped_line):
    """Return the travel type a line ends with ("• Family", ...), or None."""
    for travel_type in _TRAVEL_TYPES:
        if stripped_line.endswith("• " + travel_type):
            return travel_type
    return None


def _parse_written_date(line):
    """Return the "Written <Month> <day>, <year>" date as YYYY-MM-DD, or None."""
    match = _WRITTEN_DATE_RE.search(line)
    if not match:
        return None
    for date_format in _WRITTEN_DATE_FORMATS:
        try:
            parsed = datetime.strptime(match.group(1), date_format)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d")
    return None


def extract_reviews(input_file):
    """Parse a text dump of reviews into a list of review records.

    The file is read line by line. Lines seen before the "N contributions"
    line are collected as personal information; lines after it are collected
    as review content. Lines between the reply start marker and the reply end
    marker are ignored, except that travel-type, contributions, date and
    all-digit lines are still interpreted wherever they occur.

    Args:
        input_file: Path of the UTF-8 text file to parse.

    Returns:
        A list of dicts with the keys ``review_id`` (1-based, in file order),
        ``review_content``, ``written_date`` (``YYYY-MM-DD`` or ``None``),
        ``travel_type``, ``num_contributions``, ``personal_info`` and
        ``likes``. A trailing review without an end marker is included only
        when it has non-blank content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    reviews = []
    state = _new_review_state()
    in_reply = False

    # The parser matches the non-ASCII "•" character, so the encoding is
    # pinned instead of being left to the platform's locale default.
    with open(input_file, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()

            if _REVIEW_END_MARKER in line:
                reviews.append(_build_review(len(reviews) + 1, state))
                state = _new_review_state()
                continue

            travel_type = _match_travel_type(stripped)
            if travel_type is not None:
                state["travel_type"] = travel_type
            elif stripped.endswith(("contribution", "contributions")):
                match = _CONTRIBUTIONS_RE.search(line)
                if match:
                    state["num_contributions"] = int(
                        match.group(1).replace(",", "")
                    )
            elif line.startswith("Written "):
                # Matched against the raw line: an indented "Written" line is
                # not treated as a date line.
                written_date = _parse_written_date(line)
                if written_date is not None:
                    state["written_date"] = written_date
            elif stripped.isdecimal():
                # isdecimal() (unlike isdigit()) only accepts characters that
                # int() can convert, so lines such as "²" no longer raise.
                state["likes"] = int(stripped)
            elif _REPLY_START_MARKER in line:
                in_reply = True
            elif _REPLY_END_MARKER in line:
                in_reply = False
            elif not in_reply:
                if state["num_contributions"] is None:
                    # Everything before the contributions line is treated as
                    # personal information, joined with single spaces.
                    if state["personal_info"] is None:
                        state["personal_info"] = stripped
                    else:
                        state["personal_info"] += " " + stripped
                else:
                    state["content_lines"].append(line)

    # A final review that was never closed by the end marker is kept only if
    # it actually has content.
    last_review = _build_review(len(reviews) + 1, state)
    if last_review["review_content"]:
        reviews.append(last_review)
    return reviews
def save_reviews_to_json(reviews, output_file):
    """Serialize ``reviews`` as JSON, indented by four spaces, into ``output_file``.

    Args:
        reviews: JSON-serializable object to write (the script passes the
            list of review dicts).
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, "w") as json_sink:
        json.dump(obj=reviews, fp=json_sink, indent=4)
def export_review_to_txt(review):
    """Write one review to ``<review_id>.txt`` in the current working directory.

    The file contains, in this order and each followed by a newline: the
    personal info, the review content, the written date and the travel type.
    A field whose value is ``None`` is written as the text ``None``.

    Args:
        review: Mapping with the keys ``review_id``, ``personal_info``,
            ``review_content``, ``written_date`` and ``travel_type``.

    Raises:
        KeyError: If one of the keys above is missing from ``review``.
    """
    filename = f"{review['review_id']}.txt"
    exported_fields = (
        "personal_info",
        "review_content",
        "written_date",
        "travel_type",
    )
    # UTF-8 is set explicitly so non-ASCII review text is written the same
    # way on every platform instead of depending on the locale default.
    with open(filename, "w", encoding="utf-8") as file:
        file.writelines(f"{review[field]}\n" for field in exported_fields)
def export_reviews_to_excel(reviews, output_file):
    """Write ``reviews`` to an Excel file via a pandas DataFrame.

    Args:
        reviews: Data accepted by ``pd.DataFrame`` (the script passes the
            list of review dicts).
        output_file: Path of the Excel file to write.
    """
    # index=False keeps the DataFrame's row index out of the sheet.
    pd.DataFrame(reviews).to_excel(output_file, index=False)
# Script section: runs at import/execution time, reading the input file and
# writing every output into the current working directory.

# Text file containing the reviews to parse
input_file = "reviews_sadhana.txt"
# Path the extracted reviews are saved to as JSON
output_file = "extracted_reviews_sadhana.json"
# Path the extracted reviews are saved to as an Excel workbook
output_file_excel = "extracted_reviews_sadhana.xlsx"
# Parse the input file once; every export below reuses this list
extracted_reviews = extract_reviews(input_file)
# Export 1: all reviews in a single JSON file
save_reviews_to_json(extracted_reviews, output_file)
print("Reviews extracted and saved to", output_file)
# Export 2: one "<review_id>.txt" file per review
for review in extracted_reviews:
    export_review_to_txt(review)
print("Individual review information exported to text files.")
# Export 3: all reviews in a single Excel file
export_reviews_to_excel(extracted_reviews, output_file_excel)
print("Reviews exported to Excel file:", output_file_excel)