-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_cloud_data.py
42 lines (32 loc) · 1.21 KB
/
generate_cloud_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""
This script extracts the K most frequent hashtags for each day and prints them in a format that can be used in the HTML page which displays them.
Author: Sofiane Abbar
You can run the code as follows: python generate_cloud_data.py tweets.txt 50
"""
from collections import Counter, defaultdict
import json
import time
import sys
def to_ts_day(tw_time):
    """Convert a tweet ``created_at`` timestamp into a ``YYYY-MM-DD`` day string.

    Args:
        tw_time: Timestamp text laid out as ``'%a %b %d %H:%M:%S +0000 %Y'``,
            e.g. ``'Wed Aug 27 13:08:45 +0000 2008'``. The offset must be the
            literal ``+0000``; any other offset is rejected.

    Returns:
        The calendar day of the timestamp formatted as ``'%Y-%m-%d'``.

    Raises:
        ValueError: If ``tw_time`` does not match the expected layout.
        TypeError: If ``tw_time`` is not a string.
    """
    parsed = time.strptime(tw_time, '%a %b %d %H:%M:%S +0000 %Y')
    return time.strftime('%Y-%m-%d', parsed)
def generate_daily_hashatags(fname, topK=50):
    """Print the ``topK`` most frequent hashtags for each day found in ``fname``.

    The file is read line by line. Each line is expected to hold one JSON
    object with a ``created_at`` timestamp and an ``entities`` mapping whose
    ``hashtags`` list contains items with a ``text`` field. Lines that are not
    valid JSON, or that lack any of those fields, are skipped.

    For each day, in ascending date order, a separator line is printed,
    followed by one ``{text: "<hashtag>", weight: <count>},`` line per hashtag
    with the most frequent hashtag first.

    Args:
        fname: Path of the input file, one JSON record per line.
        topK: Maximum number of hashtags printed per day.

    Returns:
        None. The result is written to standard output.
    """
    # Count incrementally per day instead of storing every hashtag occurrence.
    day_counts = defaultdict(Counter)
    with open(fname) as f:
        for line in f:
            try:
                record = json.loads(line)
                day = to_ts_day(record['created_at'])
                tags = [tag['text'] for tag in record['entities']['hashtags']]
            except (ValueError, KeyError, TypeError):
                # ValueError: invalid JSON or unparsable timestamp.
                # KeyError / TypeError: record lacks the expected fields or
                # has an unexpected shape. Such lines are skipped on purpose.
                continue
            # Both fields are extracted before the accumulator is touched, so
            # a malformed record never leaves an empty day entry behind.
            day_counts[day].update(tags)

    for day in sorted(day_counts):
        # Single-argument print(...) yields the same output on Python 2 and 3.
        print('=================== %s' % day)
        for text, weight in day_counts[day].most_common(topK):
            print('{text: "%s", weight: %s},' % (text, weight))
if __name__ == '__main__':
    # Usage: python generate_cloud_data.py <data file> <hashtags per day>
    try:
        fname = sys.argv[1]
        topK = int(sys.argv[2])
    except (IndexError, ValueError):
        # IndexError: an argument is missing; ValueError: the count is not an
        # integer. sys.exit(message) writes the message to stderr and exits
        # with status 1, so callers can detect the usage error.
        sys.exit('You need to provide path to data file and number of hashtags per day. E.g., python generate_cloud_data.py tweets.txt 50')
    generate_daily_hashatags(fname, topK)