-
Notifications
You must be signed in to change notification settings - Fork 1
/
twint_scrape.py
86 lines (75 loc) · 3.19 KB
/
twint_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import datetime
import random
import twint
from twint import run
def _append_daily_count(filename, count, day):
    """Append one '<count> <YYYY-MM-DD>' line to the daily totals file."""
    # The context manager guarantees the line is flushed and the handle closed
    # even if the script is killed shortly afterwards.
    with open(filename, 'a') as totals:
        totals.write(str(count) + ' ' + str(day)[:10] + '\n')


def _save_checkpoint(keyword, next_start, count):
    """Overwrite '<keyword>_terminated.txt' with the resume point."""
    with open(str(keyword) + '_terminated.txt', 'w') as checkpoint:
        checkpoint.write(str(next_start) + ' ' + str(count))  # time for next start


def scrape(start, finish, keyword, file_start='', off_by=0):
    """Search twint for `keyword` window by window and log tweet counts.

    For every search window the number of returned tweets is added to a
    running per-day total. Whenever the window end falls on midnight the
    total is appended to '<keyword>_<file_start date>.txt' as
    '<count> <YYYY-MM-DD>' and reset. After each successful window the next
    start time and the running total are written to
    '<keyword>_terminated.txt' so an interrupted run can be resumed.

    Args:
        start: datetime at which the first search window begins.
        finish: datetime used as the end of the first search window;
            `finish - start` also determines how many windows are searched.
        keyword: search term(s) passed to twint; also used in file names.
        file_start: value whose first 10 characters (as a string) name the
            totals file. Defaults to `start` when left as ''.
        off_by: tweet count already accumulated for the current day, for
            resuming a run that was killed mid-day.

    Returns:
        True, once the loop has ended (either the span is exhausted or the
        user declined to retry after an empty result).
    """
    # Initialize twint configuration
    c = twint.Config()
    c.Search = keyword
    c.Pandas = True
    c.Count = True  # To ensure running

    # Filename equals start time if this is first time scraping
    if file_start == '':
        file_start = start

    # If script killed mid-day, num_per_day is non-zero
    num_per_day = off_by
    one_hour = datetime.timedelta(hours=1)
    time_span = finish - start

    # NOTE(review): the first window runs from `start` to `finish`, not
    # `start + 1 hour`; every later window is one hour wide. This mirrors the
    # existing behaviour -- confirm it is intended.
    incr = finish

    filename = str(keyword) + '_' + str(file_start)[:10] + '.txt'

    # `>` rather than `!=`: a negative span or one that is not a whole number
    # of hours would otherwise never hit exactly zero and loop forever.
    while time_span > datetime.timedelta(0):
        # Use new start, incr values to search
        start_new = str(start)
        incr_new = str(incr)
        print("currently at", start_new, "to", incr_new)
        c.Since = start_new
        c.Until = incr_new
        print(c.Since + " -> " + c.Until)
        twint.run.Search(c)

        # Store tweets in a variable, keep track of number of tweets
        tweets = twint.storage.panda.Tweets_df

        # If no tweets were returned, offer to try the same search again
        if len(tweets) == 0:
            try_again = input('Potential server error, retry? (y/n): ')
            if try_again == 'y':
                continue
            print('Connection error')
            break
        num_per_day += len(tweets)

        # If hour & min = 0, day has changed. Thus, record and reset the
        # running total.
        if incr.hour == 0 and incr.minute == 0:
            _append_daily_count(filename, num_per_day, start)
            num_per_day = 0

        # Written after the reset so the stored count is the sum of tweets so
        # far on the day that the next window (starting at incr) belongs to.
        _save_checkpoint(keyword, incr, num_per_day)

        # decrease time discrepancy and increment start, incr
        time_span = time_span - one_hour
        start = incr
        incr = incr + one_hour

    # Record whatever has accumulated for the final (possibly partial) day.
    _append_daily_count(filename, num_per_day, start)
    return True