-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_api_response.py
109 lines (76 loc) · 2.76 KB
/
parse_api_response.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import glob
import json
import os.path
# Score threshold: posts scoring below this are dropped by to_skip().
# TODO use median value instead of this hard-coded cutoff
MIN_SCORE = 512
def parse_post(post):
# if post.get('removed_by') is not None:
# print("following post was removed: ", post)
# continue
if '[deleted]' in post.get('selftext'):
# print("post content was deleted: ", post['title'], post['selftext'])
# print(str(post), '\n')
return None
if '[removed]' in post.get('selftext'):
return None
return {
'score': post['score'],
'title': post['title'],
'selftext': post['selftext'],
'upvote_ratio': post.get('upvote_ratio', -1),
'author': post['author'],
'permalink': post['permalink'],
# 'created_utc': post['created_utc'],
'utc_datetime_str': post['utc_datetime_str']
}
def to_skip(post):
    """Return True when the post's score falls below the MIN_SCORE cutoff."""
    return MIN_SCORE > post['score']
def parse_folder(path: str):
    """Parse every JSON file matching the glob *path* into a post list.

    Each file is expected to be a JSON object with a 'data' list of raw
    posts. Deleted/removed posts are counted as archived; posts under the
    score threshold are counted as skipped. Returns the kept posts sorted
    by score, highest first.
    """
    result = []
    grand_total = 0
    grand_archived = 0
    grand_skipped = 0
    for file in glob.glob(path):
        # Explicit UTF-8 for consistency with the writer functions in this
        # module; the default platform encoding breaks non-ASCII posts on
        # some systems.
        with open(file, 'r', encoding='utf-8') as f:
            posts = json.load(f)['data']
        total = 0
        archived = 0
        for post in posts:
            total += 1
            parsed_post = parse_post(post)
            if parsed_post is None:
                archived += 1
                continue
            if to_skip(post):
                grand_skipped += 1
                continue
            result.append(parsed_post)
        grand_total += total
        grand_archived += archived
    result.sort(key=lambda k: k.get('score'), reverse=True)
    print(
        f"result size: {len(result)}, grand archived: {grand_archived}, grand skipped: {grand_skipped}, grand total: {grand_total}")
    return result
def write_result_to_json_file(posts: list, filename: str):
    """Serialize *posts* to *filename* as pretty-printed UTF-8 JSON."""
    serialized = json.dumps(posts, ensure_ascii=False, indent=4)
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(serialized)
def write_result_to_plain_file(posts: list, filename: str):
    """Write each post as one 'Title. Body' line of UTF-8 plain text.

    A period is appended to titles that lack one. Previously, when the
    title already ended with '.', no separator at all was written and the
    title fused with the body ('Done.Body'); a space now always separates
    them.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for post in posts:
            title = post['title']
            f.write(title)
            if not title.endswith('.'):
                f.write(".")
            f.write(" ")
            f.write(post['selftext'])
            f.write('\n')
def get_datasets_list(root: str) -> list[str]:
    """Return the paths matching the glob *root* that are directories."""
    matches = glob.glob(root)
    return list(filter(os.path.isdir, matches))
def main() -> None:
    """Parse every dataset folder under ./datasets and write the results
    as <dataset>.json and <dataset>.txt next to it."""
    for dataset_dir in get_datasets_list("./datasets/*"):
        print(f"parsing {dataset_dir}")
        posts = parse_folder(f'{dataset_dir}/*')
        write_result_to_json_file(posts, f"./{dataset_dir}.json")
        write_result_to_plain_file(posts, f"./{dataset_dir}.txt")


# Guard so importing this module (e.g. for reuse of parse_post) has no
# filesystem side effects; running it as a script behaves as before.
if __name__ == "__main__":
    main()