-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.py
301 lines (262 loc) · 9.76 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# built-in modules
import os
import argparse
from csv import DictWriter
from datetime import datetime
from time import sleep
# libraries
import json
# project modules
from utils import (
get_aws_session, add_name_timestamp,
firehose_put, firehose_batch, POS_STREAM_NAME,
ROUTES_STREAM_NAME, ROUTES_SCHED_STREAM_NAME,
STOPS_STREAM_NAME, STOPS_SCHED_STREAM_NAME,
mkdir_timestamp, DATA_FIELDNAMES_MAP
)
from wmata import (
get_bus_position, get_routes, get_schedule,
get_stops, get_stop_schedule, get_route_ids,
get_stop_ids, flatten_route_sched_data,
get_path_details, flatten_path_details_data
)
# Firehose client shared by the helpers in this module. It is created once,
# at import time, from the session returned by get_aws_session().
AWS_FIREHOSE_CLIENT = get_aws_session().client('firehose')
def _send_to_firehose(json_data: str, data_name: str, stream_name: str, verbose=False):
    """Send a JSON string to a firehose stream, splitting it when oversized.

    Payloads of at most 1024000 characters are sent as a single
    newline-terminated record through ``firehose_put``. Longer payloads are
    cut into consecutive slices of at most 1000000 characters, the last slice
    gets a trailing newline, and all slices are handed to ``firehose_batch``
    in one call.

    Args:
        json_data (str): serialized JSON payload to send.
        data_name (str): label forwarded to the firehose helpers.
        stream_name (str): name of the target stream.
        verbose (bool): forwarded to the firehose helpers.
    """
    chunk_size = 1000000

    # Small payload: one record, one put.
    if len(json_data) <= 1024000:
        firehose_put(
            client=AWS_FIREHOSE_CLIENT, data_name=data_name,
            stream_name=stream_name,
            record={'Data': json_data + '\n'}, verbose=verbose
        )
        return

    # NOTE(review): sizes are measured in characters of the str, not encoded
    # bytes, and all slices go out in a single batch call -- confirm that
    # firehose_batch copes with service-side size limits.
    pieces = [
        json_data[offset:offset + chunk_size]
        for offset in range(0, len(json_data), chunk_size)
    ]
    pieces[-1] += '\n'  # only the final slice terminates the record
    firehose_batch(
        client=AWS_FIREHOSE_CLIENT, data_name=data_name,
        stream_name=stream_name,
        records=[{'Data': piece} for piece in pieces], verbose=verbose
    )
def _save_csv(data, api_type: str, path_level=2, custom: str = ''):
    """Write rows to a timestamped csv file and report the file name.

    The file is named ``<api_type>[_<custom>]_<MM-DD-YYYY_HH-MM-SS>.csv`` and
    is created inside the directory returned by ``mkdir_timestamp``. Column
    names come from ``DATA_FIELDNAMES_MAP[api_type]``.

    Args:
        data: iterable of dict rows passed to ``DictWriter.writerows``.
        api_type (str): data type; used for the file name, the target
            directory and the field-name lookup.
        path_level (int): forwarded to ``mkdir_timestamp`` as ``level``.
        custom (str): optional tag inserted into the file name after all
            non-alphanumeric characters have been removed.
    """
    name_parts = [api_type]
    if custom:
        # Strip special characters so the tag is safe inside a file name.
        name_parts.append(''.join(ch for ch in custom if ch.isalnum()))
    name_parts.append(datetime.now().strftime('%m-%d-%Y_%H-%M-%S'))
    file_name = '_'.join(name_parts) + '.csv'

    target_dir = mkdir_timestamp(data_type=api_type, level=path_level)
    full_path = os.path.join(target_dir, file_name)

    # newline='' lets the csv writer control line endings itself.
    with open(full_path, mode='w', newline='') as out_file:
        writer = DictWriter(out_file, fieldnames=DATA_FIELDNAMES_MAP[api_type])
        writer.writeheader()
        writer.writerows(data)
    print('\t[{}]: saved!'.format(file_name))
def fetch_bus_positions(verbose=False) -> None:
    """Extract bus_position data and load to S3 via firehose.

    Each element of the response's ``BusPositions`` list is serialized as one
    newline-terminated JSON record. Records are sent with ``firehose_batch``
    in groups of 400; any remainder is sent after the loop.

    Args:
        verbose (bool): if True, print firehose response element.
    """
    data_name = 'bus_positions'
    batch_size = 400
    resp = get_bus_position()
    # TODO: add to_csv option
    records = []
    for bus_pos in resp.json()['BusPositions']:
        records.append({'Data': json.dumps(bus_pos) + '\n'})
        if len(records) == batch_size:
            firehose_batch(
                client=AWS_FIREHOSE_CLIENT, data_name=data_name,
                stream_name=POS_STREAM_NAME, records=records, verbose=verbose
            )
            records = []  # start a fresh batch
    # Flush the remainder, but never send an empty batch: that happens when
    # there are no positions or the count is an exact multiple of batch_size.
    if records:
        firehose_batch(
            client=AWS_FIREHOSE_CLIENT, data_name=data_name,
            stream_name=POS_STREAM_NAME, records=records, verbose=verbose
        )
def fetch_routes(
    to_csv=True, to_firehose=False, get_sched=True, get_path=True,
    verbose=False, date=''
) -> None:
    """Fetch routes data and, optionally, per-route schedules and paths.

    Args:
        to_csv (bool): save local as csv.
        to_firehose (bool): send data to aws_firehose (not implemented;
            raises NotImplementedError after any csv has been written).
        get_sched (bool): fetch the schedule for each route.
        get_path (bool): fetch path details for each route.
        verbose (bool): if True, print firehose response element.
        date (str): forwarded unchanged to ``fetch_path_details``; the
            default empty string matches the previous behaviour.

    Raises:
        NotImplementedError: if ``to_firehose`` is True.
    """
    data_name = 'routes'
    resp = get_routes()
    # Parse the response body once and reuse it below.
    routes_json = resp.json()
    if to_csv:
        _save_csv(data=routes_json['Routes'], api_type=data_name)
    if to_firehose:  # TODO: see bus_positions to complete
        # Payload prepared for the firehose path, which is not written yet.
        data = add_name_timestamp(resp_data=routes_json, data_name=data_name)
        raise NotImplementedError
    if get_sched or get_path:
        # Both follow-up fetches work from the same list of route ids.
        route_ids = get_route_ids(routes_json)
        if get_sched:
            fetch_route_sched(
                route_ids=route_ids, to_csv=to_csv,
                to_firehose=to_firehose, verbose=verbose
            )
        if get_path:
            fetch_path_details(
                route_ids=route_ids, date=date, to_csv=to_csv,
                to_firehose=to_firehose, verbose=verbose
            )
def fetch_route_sched(
    route_ids: list, to_csv=True, to_firehose=False, verbose=False
) -> None:
    """Fetch the schedule of every given route, one API call per route.

    For each route id the schedule is requested, its id and response size are
    printed, and the flattened data is optionally written to csv. The loop
    pauses a tenth of a second between routes.

    Args:
        route_ids (list): route ids to fetch.
        to_csv (bool): save local as csv.
        to_firehose (bool): send data to aws_firehose (not implemented).
        verbose (bool): if True, print firehose response element.

    Raises:
        NotImplementedError: if ``to_firehose`` is True and at least one
            route id is processed.
    """
    data_name = 'route_scheds'
    for route_id in route_ids:
        response = get_schedule(route_id)
        print(f'Route id: {route_id}, size: {len(response.content)}')
        flat_rows = flatten_route_sched_data(resp_json=response.json())
        if to_csv:
            _save_csv(
                data=flat_rows, api_type=data_name,
                path_level=3, custom=route_id
            )
        if to_firehose:
            raise NotImplementedError
        sleep(1/10)  # Per API specs 10 calls/second limit
def fetch_stops(
    to_csv=True, to_firehose=False, get_sched=False, verbose=False
) -> None:
    """Fetch stops data and, optionally, the schedule of every stop.

    Args:
        to_csv (bool): save local as csv.
        to_firehose (bool): send data to aws_firehose (not implemented;
            raises NotImplementedError after any csv has been written).
        get_sched (bool): fetch bus stop schedule.
        verbose (bool): if True, print firehose response element.

    Raises:
        NotImplementedError: if ``to_firehose`` is True.
    """
    data_name = 'stops'
    response = get_stops()

    if to_csv:
        stop_rows = response.json()['Stops']
        _save_csv(data=stop_rows, api_type=data_name)

    if to_firehose:  # TODO: see bus_positions to complete
        # Payload prepared for the firehose path, which is not written yet.
        stamped = add_name_timestamp(
            resp_data=response.json(), data_name=data_name
        )
        raise NotImplementedError

    if get_sched:
        fetch_stop_scheds(
            stop_ids=get_stop_ids(response.json()), to_csv=to_csv,
            to_firehose=to_firehose, verbose=verbose
        )
# TODO: Not fully implemented
def fetch_stop_scheds(
    stop_ids: list, to_csv=True, to_firehose=False, verbose=False
) -> None:
    """Fetch the schedule of every given stop, one API call per stop.

    Only the request and the size report are implemented: both output
    options raise NotImplementedError for the first stop processed.

    Args:
        stop_ids (list): stop ids to fetch.
        to_csv (bool): save local as csv (not implemented).
        to_firehose (bool): send data to aws_firehose (not implemented).
        verbose (bool): if True, print firehose response element.

    Raises:
        NotImplementedError: if ``to_csv`` or ``to_firehose`` is True and at
            least one stop id is processed.
    """
    data_name = 'stop_schedules'
    for stop_id in stop_ids:
        response = get_stop_schedule(stop_id)
        print(f'Stop id: {stop_id}, size: {len(response.content)}')
        if to_csv:
            raise NotImplementedError
        if to_firehose:  # TODO: see bus_positions to complete
            # Payload prepared for the firehose path, which is not written yet.
            stamped = add_name_timestamp(
                resp_data=response.json(), data_name=data_name
            )
            raise NotImplementedError
        sleep(1/10)  # Per API specs 10 calls/second limit
def fetch_path_details(
    route_ids: list, date='', to_csv=True, to_firehose=False, verbose=False
) -> None:
    """Fetch path details data for specified routes.

    The flattened response is a mapping of data name to rows; each entry is
    written to its own csv file when ``to_csv`` is set.

    Args:
        route_ids (list): route ids to fetch.
        date (str): Date in YYYY-MM-DD format for which to retrieve
            path details. Passed to ``get_path_details`` as is; with the
            default empty string the API presumably uses today's date --
            confirm against ``get_path_details``.
        to_csv (bool): save local as csv.
        to_firehose (bool): send data to aws_firehose (not implemented).
        verbose (bool): if True, print firehose response element.

    Raises:
        NotImplementedError: if ``to_firehose`` is True and a route yields
            at least one data set.
    """
    for route_id in route_ids:
        response = get_path_details(route_id, date)
        print(f'Route id: {route_id}, size: {len(response.content)}')
        data_sets = flatten_path_details_data(resp_json=response.json())
        for data_name in data_sets:
            rows = data_sets[data_name]
            if to_csv:
                _save_csv(
                    data=rows, api_type=data_name,
                    path_level=3, custom=route_id
                )
            if to_firehose:
                raise NotImplementedError
        sleep(1/10)  # Per API specs 10 calls/second limit
def extract(data, sched, nocsv, date, firehose, verbose, path):
    """Run the fetch that matches the requested data type.

    Args:
        data (str): which data to fetch: 'position' / 'positions', 'routes'
            or 'stops'. Any other value does nothing.
        sched (bool): also fetch schedules (routes and stops only).
        nocsv (bool): forwarded as ``to_csv``; despite its name, True means
            "do save csv files".
        date (str): accepted for compatibility with the command line
            arguments; not used by this function.
        firehose (bool): forwarded as ``to_firehose``.
        verbose (bool): if True, print firehose response element.
        path (bool): also fetch path details (routes only).
    """
    # Accept both spellings: the command line offers 'position', while this
    # function historically only matched 'positions'.
    if data in ('position', 'positions'):
        fetch_bus_positions(verbose)
    elif data == 'routes':
        fetch_routes(
            to_csv=nocsv, to_firehose=firehose,
            get_sched=sched, get_path=path,
            verbose=verbose
        )
    elif data == 'stops':
        fetch_stops(
            to_csv=nocsv, to_firehose=firehose,
            get_sched=sched, verbose=verbose
        )
if __name__ == '__main__':
    # Command line entry point: parse the options and hand them to extract().
    arg_parser = argparse.ArgumentParser(
        description='Fetch data from WMATA API.'
    )
    arg_parser.add_argument(
        'data', choices=['position', 'positions', 'routes', 'stops'],
        help='The data to be fetched and saved.'
    )
    arg_parser.add_argument(
        '--sched', action='store_true',
        help='Get schedule data, if routes or stops data fetched.'
    )
    arg_parser.add_argument(
        '--path', action='store_true',
        help='Get path details, if routes data fetched.'
    )
    # store_false: the parsed value is True by default and becomes False
    # when --nocsv is given, so it can be used directly as "save csv".
    arg_parser.add_argument(
        '--nocsv', action='store_false',
        help='Don\'t save data to csv file.'
    )
    arg_parser.add_argument(
        '--date', '-d',
        default=datetime.today().strftime('%Y-%m-%d'),
        help='Date in YYYY-MM-DD format, for fetching historical schedules data.'
    )
    arg_parser.add_argument(
        '--firehose', action='store_true',
        help='Send fetched data to AWS Firehose.'
    )
    arg_parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='if True print firehose response.',
        dest='verbose'
    )
    cli_args = vars(arg_parser.parse_args())
    # extract() matches on 'positions'; map the singular choice onto it so
    # that asking for bus positions is not silently ignored.
    if cli_args['data'] == 'position':
        cli_args['data'] = 'positions'
    extract(**cli_args)