-
Notifications
You must be signed in to change notification settings - Fork 0
/
reddit2csv2.py
102 lines (93 loc) · 4.4 KB
/
reddit2csv2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
A quick and dirty script to parse Reddit dumps
(written using the pushshift 40k top subreddit 2005-2023 archive)
Reads in submission and comment jsonl files, scans for keywords,
then outputs select fields in CSV format.
Submissions and comments are in chronological order.
Outputs matches in submissions and comments independently
(i.e. comment matches can be replies to submissions that don't
match; won't fetch non-matching comments for submissions that do)
so this is most useful for unstructured analysis.
"""
# Subreddits to process. Input file names are built from each entry
# (e.g. "<name>_submissions", "<name>_comments").
REDDITS = [
    "breastcancer",
]

# Words/phrases to look for. Matching is done on whole, space-delimited
# words after the text has been lower-cased and stripped of punctuation.
KEYWORDS = [
    "flat",
    "aesthetic closure",
    "goldilocks",
    "goldilock",
]

# Column order of the submissions CSV.
SUBMISSION_COLUMNS = [
    "subreddit",
    "type",
    "title",
    "author",
    "score",
    "selftext",
    "url",
    "id",
    "permalink",
    "created_utc",
    "date",
]

# Column order of the comments CSV.
COMMENT_COLUMNS = [
    "subreddit",
    "type",
    "author",
    "score",
    "body",
    "id",
    "permalink",
    "created_utc",
    "date",
]
# Reddit API notes
# 'score' is the total score ('ups' - 'downs') of a post. 'ups' and
# 'downs' are deprecated - 'ups' is always the same as 'score' and
# 'downs' is always zero.
# 'controversiality' is a boolean flag to show that a comment has a high,
# but roughly similar, number of likes/dislikes.
#
# Reddit API changes mean that some fields (e.g. created_utc) are
# different types over time and some (e.g. permalinks) may not exist
# at all.
# When True, the script reads the "<subreddit>_submissions_sample" and
# "<subreddit>_comments_sample" files instead of the full dump files.
SAMPLE = False
import csv
import json
import os
import re
import string
from datetime import datetime, timezone
# Matches runs of non-word characters and underscores. Each run is replaced
# by a single space when text is prepared for keyword matching.
cleanup_pattern = re.compile(r'[\W_]+')
# ---------------------------------------------------------------------------
# Main script: for every subreddit in REDDITS, scan the submission and
# comment dumps (one JSON object per line) for KEYWORDS and write the
# matching records to "./output/<subreddit>_submissions.csv" and
# "./output/<subreddit>_comments.csv".
# ---------------------------------------------------------------------------

# Directory that receives the generated CSV files.
_OUTPUT_DIR = "./output/"

# Keywords padded with a space on each side, built once, so that a plain
# substring test against space-padded text only matches whole words/phrases.
_PADDED_KEYWORDS = [" " + keyword + " " for keyword in KEYWORDS]


def _normalise(text):
    """Return *text* lower-cased, with punctuation runs collapsed to one space.

    The result is padded with a leading and trailing space so that keywords
    at the very start or end of the text still match as whole words.
    A missing (None) text is treated as empty.
    """
    return " " + cleanup_pattern.sub(" ", (text or "").lower()) + " "


def _mentions_keyword(*texts):
    """Return True if any keyword occurs as a whole word/phrase in any text."""
    normalised = [_normalise(text) for text in texts]
    return any(
        padded in text for padded in _PADDED_KEYWORDS for text in normalised
    )


def _utc_date(created_utc):
    """Format a 'created_utc' epoch value as a YYYY-MM-DD date in UTC.

    The dumps store the timestamp as different types over time (int, float
    or string), hence the float() round-trip. The conversion is pinned to
    UTC so the resulting date does not depend on the machine's timezone.
    """
    seconds = int(float(created_utc))
    return datetime.fromtimestamp(seconds, tz=timezone.utc).strftime('%Y-%m-%d')


def _permalink(record):
    """Return the record's permalink, or "NONE" when the field is absent."""
    # sometimes (maybe because of older API versions) there aren't permalinks
    return record.get("permalink", "NONE")


def _submission_row(record):
    """Build the CSV row (keyed by SUBMISSION_COLUMNS) for one submission."""
    return {
        "subreddit": record["subreddit"],
        "type": "submission",
        "title": record.get("title", ""),
        "author": record["author"],
        "score": record["score"],
        "selftext": record.get("selftext", ""),
        "url": record.get("url", ""),
        "id": record["id"],
        "permalink": _permalink(record),
        "created_utc": record["created_utc"],
        "date": _utc_date(record["created_utc"]),
    }


def _comment_row(record):
    """Build the CSV row (keyed by COMMENT_COLUMNS) for one comment."""
    return {
        "subreddit": record["subreddit"],
        "type": "comment",
        "author": record["author"],
        "score": record["score"],
        "body": record.get("body", ""),
        "id": record["id"],
        "permalink": _permalink(record),
        "created_utc": record["created_utc"],
        "date": _utc_date(record["created_utc"]),
    }


def _read_matches(filename, text_fields, make_row):
    """Return rows for every record in *filename* that mentions a keyword.

    filename    -- path of a JSON-lines dump (one JSON object per line)
    text_fields -- names of the record fields that are searched for keywords
    make_row    -- callable turning a matching record into a CSV row dict

    Raises whatever open() / json.loads() raise for an unreadable file or a
    malformed line; blank lines are skipped.
    """
    rows = []
    with open(filename, "r", encoding="UTF-8") as dump_file:
        for line in dump_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            record = json.loads(line)
            if _mentions_keyword(*(record.get(field) for field in text_fields)):
                rows.append(make_row(record))
    return rows


def _write_csv(filename, columns, rows):
    """Write *rows* (dicts keyed by *columns*) to *filename* with a header."""
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    # newline="" is required by the csv module: without it, line endings
    # embedded in quoted fields (multi-line post bodies) and row terminators
    # are translated incorrectly on some platforms.
    with open(filename, "w", encoding="UTF-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)


for reddit in REDDITS:
    # SAMPLE switches both inputs to their "_sample" variants.
    suffix = "_sample" if SAMPLE else ""

    submissions = _read_matches(
        reddit + "_submissions" + suffix, ("title", "selftext"), _submission_row
    )
    _write_csv(
        _OUTPUT_DIR + reddit + "_submissions" + ".csv", SUBMISSION_COLUMNS, submissions
    )

    comments = _read_matches(
        reddit + "_comments" + suffix, ("body",), _comment_row
    )
    _write_csv(
        _OUTPUT_DIR + reddit + "_comments" + ".csv", COMMENT_COLUMNS, comments
    )