-
Notifications
You must be signed in to change notification settings - Fork 1
/
Scraping NY Times headlines.py
73 lines (61 loc) · 2.68 KB
/
Scraping NY Times headlines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import datetime
import os
import time

import dateutil
import dateutil.parser
import pandas as pd
import requests
# Date window used when filtering articles.
start = datetime.date(2020, 1, 1)
end = datetime.date(2020, 12, 31)

# Single year/month selection, kept as strings.
year = "2020"  # <1851 - 2020>
month = "12"  # <1 - 12>

# One [year, month] pair of strings for every month start between start and end,
# e.g. ['2020', '01'].
months_in_range = [
    month_start.strftime("%Y %m").split(' ')
    for month_start in pd.date_range(start, end, freq='MS')
]
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.

    Args:
        date: A ``[year, month]`` pair (strings or ints), e.g. ``['2020', '01']``.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    '''
    # Use the month that was actually requested rather than module-level
    # settings; int() also drops the leading zero of months such as '01'.
    request_year = int(date[0])
    request_month = int(date[1])
    url = f"https://api.nytimes.com/svc/archive/v1/{request_year}/{request_month}.json?api-key=#INSERT-YOUR-API-KEY"
    reply = requests.get(url, timeout=60)
    # Fail loudly here instead of handing an error payload to the parser.
    reply.raise_for_status()
    response = reply.json()
    # Pause between calls -- presumably to stay under the API rate limit.
    time.sleep(6)
    return response
def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.

    Args:
        article: Dict describing one article from the API response.
        date: Publication date of the article as a ``datetime.date``.

    Returns:
        True when ``date`` lies between the module-level ``start`` and ``end``
        (both inclusive) and the article has a ``headline`` dict with a
        ``main`` entry; False otherwise.
    '''
    # Inclusive bounds: articles published on the first or last day count too.
    is_in_range = start <= date <= end
    # .get() so an article without any headline is rejected rather than raising.
    headline = article.get('headline')
    has_headline = isinstance(headline, dict) and 'main' in headline
    return is_in_range and has_headline
def parse_response(response):
    '''Parses and returns response as pandas data frame.

    Args:
        response: Decoded JSON from the Archive API; the articles are read
            from ``response['response']['docs']``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``headline``, ``date``,
        ``doc_type``, ``material_type``, ``section`` and ``keywords``, one row
        per article accepted by ``is_valid``.
    '''
    data = {'headline': [],
            'date': [],
            'doc_type': [],
            'material_type': [],
            'section': [],
            'keywords': []}
    articles = response['response']['docs']
    for article in articles:  # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if not is_valid(article, date):
            continue
        data['date'].append(date)
        data['headline'].append(article['headline']['main'])
        # Look up the same key that is read ('section_name'); None when absent.
        data['section'].append(article.get('section_name'))
        data['doc_type'].append(article['document_type'])
        data['material_type'].append(article.get('type_of_material'))
        # Keep only the keywords tagged as subjects; tolerate a missing list.
        keywords = [keyword['value']
                    for keyword in article.get('keywords') or []
                    if keyword['name'] == 'subject']
        data['keywords'].append(keywords)
    return pd.DataFrame(data)
def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.

    For every entry in ``dates`` the month is requested, parsed, and written to
    ``headlines/<year>-<month>.csv``; progress and the total number of
    collected articles are printed.

    Args:
        dates: Sequence of ``[year, month]`` string pairs, e.g.
            ``[['2020', '01'], ['2020', '02']]``.

    Returns:
        None.
    '''
    # Nothing to fetch: report an empty result instead of failing on dates[0].
    if not dates:
        print('Number of articles collected: 0')
        return
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('headlines', exist_ok=True)
    total = 0
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        csv_path = 'headlines/' + date[0] + '-' + date[1] + '.csv'
        df.to_csv(csv_path, index=False)
        print('Saving ' + csv_path + '...')
    print('Number of articles collected: ' + str(total))