generated from byuibigdata/project_safegraph
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_safegraph.py
64 lines (44 loc) · 1.88 KB
/
parse_safegraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# %%
import os

import numpy as np
import pandas as pd

import python_examples.safegraph_functions as sgf
# %%
# Location of the combined core-POI + patterns CSV inside the SafeGraph Chipotle
# (July 2021) export; the whole file is loaded into memory as `dat`.
pathLoc = "SafeGraph - Patterns and Core Data - Chipotle - July 2021/Core Places and Patterns Data/chipotle_core_poi_and_patterns.csv"
dat = pd.read_csv(filepath_or_buffer=pathLoc)
# %%
# fix dates
# date_range_start / date_range_end look like 2021-08-01T00:00:00-05:00: only the
# part before the "T" is kept, and that calendar date is parsed as UTC.
# tracking_closed_since looks like 2019-07 and is parsed with an explicit
# year-month format.
dat = dat.assign(
    **{
        col: pd.to_datetime(dat[col].str.split("T").str[0], utc=True)
        for col in ("date_range_start", "date_range_end")
    },
    tracking_closed_since=pd.to_datetime(dat["tracking_closed_since"], format="%Y-%m"),
)
# %%
# complex columns
# Columns that this script hands to sgf.expand_list.
list_cols = ["visits_by_day", "popularity_by_hour"]
# Columns that this script hands to sgf.expand_json, one output table each.
json_cols = [
    "open_hours",
    "bucketed_dwell_times",
    "related_same_day_brand",
    "related_same_month_brand",
    "popularity_by_day",
    "device_type",
    "visitor_home_aggregation",
    "visitor_home_cbgs",
    "visitor_country_of_origin",
    "visitor_daytime_cbgs",
]
# %%
# base dataset
# Flat POI table: every column except the list- and JSON-valued ones, which are
# written to their own parquet files elsewhere in this script.
# to_parquet does not create missing directories, so make sure the output
# folder exists before the first write.
os.makedirs("parquet", exist_ok=True)
dat_base = dat.drop(columns=list_cols + json_cols)
dat_base.to_parquet("parquet/poi.parquet")
# %%
# only two list columns
# Expand both list-valued columns first, then write each result to its own
# parquet file (popularity_by_hour is written before visits_by_day).
dat_vbd = sgf.expand_list("visits_by_day", dat)
dat_pbh = sgf.expand_list("popularity_by_hour", dat)
for _frame, _dest in (
    (dat_pbh, "parquet/popularity_by_hour.parquet"),
    (dat_vbd, "parquet/visits_by_day.parquet"),
):
    _frame.to_parquet(_dest)
# %%
# build tables
# Expand every JSON-valued column and write it to parquet/<column>.parquet.
# NOTE: sgf.expand_json is slow -- a full run took ~90 minutes, and the
# visitor CBG columns account for most of that.
for col_name in json_cols:
    print(col_name)  # progress marker: shows which column is being expanded
    dati = sgf.expand_json(col_name, dat)
    dati.to_parquet(f"parquet/{col_name}.parquet")
# %%
# example of the expand_json function
# All three calls pass wide=False. The visitor_home_cbgs example runs on the
# first 100 rows only -- presumably because that column is the slow one.
dat_pbd = sgf.expand_json("popularity_by_day", dat, wide=False)
dat_rsdb = sgf.expand_json("related_same_day_brand", dat, wide=False)
_first_hundred = dat.iloc[:100]
dat_vhcbgs = sgf.expand_json("visitor_home_cbgs", _first_hundred, wide=False)
# %%