-
Notifications
You must be signed in to change notification settings - Fork 0
/
queries.py
215 lines (191 loc) · 5.15 KB
/
queries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import pg
from tabulate import tabulate
from utils import time_this
@time_this
def query1_1(db: pg.Connector):
"""
Find average post score per personality type.
"""
query = """
--sql
SELECT mbti, ROUND(AVG(score), 2) AS avg_score, COUNT(*) AS num_comments
FROM posts
GROUP BY mbti
ORDER BY avg_score DESC
;
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Average post score per personality type.")
headers = ["MBTI", "Avg. score", "Post count"]
print(tabulate(rows, headers=headers, tablefmt="pretty"))
@time_this
def query1_2(db: pg.Connector):
"""
Find average comment score per personality type.
"""
query = """
--sql
SELECT mbti, ROUND(AVG(score)) AS avg_score, COUNT(*) AS num_comments
FROM comments
GROUP BY mbti
ORDER BY avg_score DESC
;
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Average comment score per personality type.")
headers = ["MBTI", "Avg. score", "Post count"]
print(tabulate(rows, headers=headers, tablefmt="pretty"))
@time_this
def query2(db: pg.Connector):
"""
Find percentage of controversial comments per personality type.
"""
query = """
--sql
SELECT mbti,
controversial_comments,
total_comments,
ROUND(controversial_comments*1.0 / total_comments, 5) * 100 AS ratio
FROM (SELECT
mbti,
COUNT(*) FILTER (WHERE controversiality > 0) AS controversial_comments,
COUNT(*) AS total_comments
FROM comments
GROUP BY mbti
) AS a
UNION
SELECT
'ALL',
controversial_comments,
total_comments,
ROUND(controversial_comments*1.0 / total_comments, 5) * 100 AS ratio
FROM (
SELECT
COUNT(*) FILTER (WHERE controversiality > 0) AS controversial_comments,
COUNT(*) AS total_comments
FROM comments
) AS c
ORDER BY total_comments DESC
;
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Average controversiality of comments per personality type.")
headers = ["MBTI", "Controversial comments", "Total comments", "Percentage controversial comments"]
print(tabulate(rows, headers=headers, tablefmt="pretty"))
def query3(db: pg.Connector):
"""
Group comments by hour (24 buckets) and MBTI type.
"""
query = """
--sql
SELECT mbti, EXTRACT(HOUR FROM to_timestamp(created_utc)) AS hour, COUNT(*) AS comments_per_hour
FROM comments
GROUP BY mbti, hour
;
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print(tabulate(rows))
def query4(db: pg.Connector):
query = """
--sql
SELECT mbti_type as mbti, subreddit, COUNT(*) AS comment_count
FROM typed_comments
GROUP BY subreddit, mbti
ORDER BY comment_count DESC
LIMIT 20
;
"""
query = """
--sql
SELECT comments.subreddit, comments.mbti
FROM comments
INNER JOIN (
SELECT subreddit, COUNT(*) as total_comments
FROM comments
GROUP BY subreddit
ORDER BY total_comments DESC
LIMIT 20
) AS top_subreddits
ON comments.subreddit = top_subreddits.subreddit
;
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print(tabulate(rows))
def query6(db: pg.Connector):
"""
How many users of each type are in the dataset in total. Number and percentage.
"""
query = f"""
--sql
CREATE MATERIALIZED VIEW IF NOT EXISTS percentage_comments_per_mbti AS
SELECT mbti_type,
COUNT(*) AS total_comments,
COUNT(*) * 1.0/ (
SELECT COUNT(*) * 1.0 FROM typed_comments
) AS percentage_comments
FROM typed_comments
GROUP BY mbti_type
;
"""
query = f"""
--sql
CREATE MATERIALIZED VIEW comments_per_mbti2 AS
SELECT mbti, COUNT(*) AS mbti_comment_count
FROM comments
GROUP BY mbti
;
"""
db.cursor.execute(query)
db.connection.commit()
# rows = db.cursor.fetchall()
# headers = [desc[0] for desc in db.cursor.description]
# print(tabulate(rows, headers=headers))
def get_user_data(db: pg.Connector, table: str, user: str):
"""
Get all data for a specific user from a given table.
"""
query = f"""
--sql
SELECT *
FROM {table}
WHERE author='{user}'
;
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
db.connection.commit()
return rows
def get_user_comments(db: pg.Connector, table: str, user: str):
"""
Get only comments for a specific user from a given table.
"""
query = f"""
--sql
SELECT comment
FROM {table}
WHERE author='{user}'
;
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
db.connection.commit()
return rows
if __name__ == "__main__":
# Example usage
db = pg.Connector()
# # Include metadata
# rows = get_user_data(db, "comments", "Famraine")
# for row in rows:
# print(row)
# # Only comments
# rows = get_user_comments(db, "comments", "Famraine")
# for row in rows:
# print(row)
# query1_1(db)
# query1_2(db)
db.connection.close()