Skip to content

Latest commit

 

History

History
664 lines (520 loc) · 13.5 KB

database_python.md

File metadata and controls

664 lines (520 loc) · 13.5 KB

Read citibike data and write to SQL database from python

# define function to read in a citibike file (already downloaded and unzipped), do some 
# cleaning, and add some fields
import pandas as pd
#dat = pd.read_csv('data/2013-09 - Citi Bike trip data.csv',parse_dates=True)
def load_citibike_monthly(fname):
    dat = pd.read_csv(fname,parse_dates=True)
    dat.starttime=pd.to_datetime(dat.starttime)
    dat.stoptime=pd.to_datetime(dat.stoptime)
    dat['gender']=dat['gender'].astype('category')
    dat.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
    dat['day']=dat.starttime.dt.day
    dat['month']=dat.starttime.dt.month
    dat['year']=dat.starttime.dt.year
    dat['yday']=dat.starttime.dt.dayofyear
    dat['wkday']=dat.starttime.dt.dayofweek
    return dat
# test it out
fname='data/2013-09 - Citi Bike trip data.csv'
dat = load_citibike_monthly(fname)
dat.head()
tripduration starttime stoptime start_station_id start_station_name start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude bikeid usertype birth_year gender day month year yday wkday
0 1010 2013-09-01 00:00:02 2013-09-01 00:16:52 254 W 11 St & 6 Ave 40.735324 -73.998004 147 Greenwich St & Warren St 40.715422 -74.011220 15014 Subscriber 1974 1 1 9 2013 244 6
1 1443 2013-09-01 00:00:09 2013-09-01 00:24:12 151 Cleveland Pl & Spring St 40.721816 -73.997203 497 E 17 St & Broadway 40.737050 -73.990093 19393 Customer \N 0 1 9 2013 244 6
2 1387 2013-09-01 00:00:16 2013-09-01 00:23:23 352 W 56 St & 6 Ave 40.763406 -73.977225 405 Washington St & Gansevoort St 40.739323 -74.008119 16160 Subscriber 1992 1 1 9 2013 244 6
3 405 2013-09-01 00:00:18 2013-09-01 00:07:03 490 8 Ave & W 33 St 40.751551 -73.993934 459 W 20 St & 11 Ave 40.746745 -74.007756 14997 Subscriber 1973 1 1 9 2013 244 6
4 270 2013-09-01 00:00:20 2013-09-01 00:04:50 236 St Marks Pl & 2 Ave 40.728419 -73.987140 393 E 5 St & Avenue C 40.722992 -73.979955 19609 Subscriber 1984 1 1 9 2013 244 6

Now write a script to read all monthly files and put in database

# Get list of the citibke csv files we have
import glob
flist = glob.glob('data/*data.csv')
flist
['data/2013-07 - Citi Bike trip data.csv',
 'data/2013-08 - Citi Bike trip data.csv',
 'data/2013-09 - Citi Bike trip data.csv',
 'data/2013-10 - Citi Bike trip data.csv',
 'data/2013-11 - Citi Bike trip data.csv',
 'data/2013-12 - Citi Bike trip data.csv',
 'data/2014-01 - Citi Bike trip data.csv',
 'data/2014-02 - Citi Bike trip data.csv',
 'data/2014-03 - Citi Bike trip data.csv',
 'data/2014-04 - Citi Bike trip data.csv',
 'data/2014-05 - Citi Bike trip data.csv',
 'data/2014-06 - Citi Bike trip data.csv',
 'data/2014-07 - Citi Bike trip data.csv',
 'data/2014-08 - Citi Bike trip data.csv',
 'data/201409-citibike-tripdata.csv',
 'data/201410-citibike-tripdata.csv',
 'data/201411-citibike-tripdata.csv',
 'data/201412-citibike-tripdata.csv']
# Then loop  and load each one
# if this is too much (some files are pretty large), could also read each file in in chunks

import sqlite3
con = sqlite3.connect("/Users/Andy/Desktop/test2.db3")
for fname in flist:
    print('loading ' + fname)
    dat = load_citibike_monthly(fname)
    print('adding ' + fname + ' to sql database')
    dat.to_sql("rides",con,if_exists='append',chunksize=10000)
    del dat
print('done')
 
loading data/2013-07 - Citi Bike trip data.csv
adding data/2013-07 - Citi Bike trip data.csv to sql database
loading data/2013-08 - Citi Bike trip data.csv
adding data/2013-08 - Citi Bike trip data.csv to sql database
loading data/2013-09 - Citi Bike trip data.csv
adding data/2013-09 - Citi Bike trip data.csv to sql database
loading data/2013-10 - Citi Bike trip data.csv
adding data/2013-10 - Citi Bike trip data.csv to sql database
loading data/2013-11 - Citi Bike trip data.csv
adding data/2013-11 - Citi Bike trip data.csv to sql database
loading data/2013-12 - Citi Bike trip data.csv
adding data/2013-12 - Citi Bike trip data.csv to sql database
loading data/2014-01 - Citi Bike trip data.csv
adding data/2014-01 - Citi Bike trip data.csv to sql database
loading data/2014-02 - Citi Bike trip data.csv
adding data/2014-02 - Citi Bike trip data.csv to sql database
loading data/2014-03 - Citi Bike trip data.csv
adding data/2014-03 - Citi Bike trip data.csv to sql database
loading data/2014-04 - Citi Bike trip data.csv
adding data/2014-04 - Citi Bike trip data.csv to sql database
loading data/2014-05 - Citi Bike trip data.csv
adding data/2014-05 - Citi Bike trip data.csv to sql database
loading data/2014-06 - Citi Bike trip data.csv
adding data/2014-06 - Citi Bike trip data.csv to sql database
loading data/2014-07 - Citi Bike trip data.csv
adding data/2014-07 - Citi Bike trip data.csv to sql database
loading data/2014-08 - Citi Bike trip data.csv
adding data/2014-08 - Citi Bike trip data.csv to sql database
loading data/201409-citibike-tripdata.csv
adding data/201409-citibike-tripdata.csv to sql database
loading data/201410-citibike-tripdata.csv
adding data/201410-citibike-tripdata.csv to sql database
loading data/201411-citibike-tripdata.csv
adding data/201411-citibike-tripdata.csv to sql database
loading data/201412-citibike-tripdata.csv
adding data/201412-citibike-tripdata.csv to sql database
done
# count the total number of rows: there are over 13 million rows, no way we could load that all in Pandas
pd.read_sql_query("SELECT COUNT(*) FROM rides", con)
COUNT(*)
0 13118401

First i'll examine the number of rides per month, for 2014. The plot below shows that there is a strong seasonal cycle, with more rides in the summer months. My hypothesis is that this is mainly driven by the temperature; I will get weather data later and test this.

# count rides per month
df = pd.read_sql_query("SELECT year,month,count(*) as num_rides FROM rides WHERE year=2014 GROUP BY month", con)
df
year month num_rides
0 2014 1 300400
1 2014 2 224736
2 2014 3 439117
3 2014 4 670780
4 2014 5 866117
5 2014 6 936880
6 2014 7 968842
7 2014 8 963489
8 2014 9 953887
9 2014 10 828711
10 2014 11 529188
11 2014 12 399069
# Plot rides vs month
# There is a very strong seasonal cycle
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(df.month,df.num_rides)
plt.scatter(df.month,df.num_rides)
plt.grid()
plt.xlabel('month')
plt.ylabel('# rides')
<matplotlib.text.Text at 0x11e7ffeb8>

png

Next i'll look at the number of rides each day in 2014. This shows that there is a strong seasonal pattern, but also some big residuals from that pattern. These could be days that were unseasonably cold/warm, or due to other factors.

# plot rides/day for 2014
df = pd.read_sql_query("select yday, count(*) as num_rides from rides where year=2014 group by yday",con)
plt.plot(df.yday,df.num_rides)
plt.grid()
plt.xlabel('day of year')
plt.ylabel('# rides')
<matplotlib.text.Text at 0x120348b00>

png

Which stations were the most used?

df = pd.read_sql_query("select start_station_id, start_station_name, count(*) as num_rides from rides where year=2014 group by start_station_id order by num_rides desc",con)
df.head(10)
start_station_id start_station_name num_rides
0 521 8 Ave & W 31 St 100498
1 519 Pershing Square North 92137
2 293 Lafayette St & E 8 St 86692
3 497 E 17 St & Broadway 80166
4 435 W 21 St & 6 Ave 73448
5 285 Broadway & E 14 St 65852
6 426 West St & Chambers St 63221
7 402 Broadway & E 22 St 60932
8 151 Cleveland Pl & Spring St 60092
9 382 University Pl & E 14 St 60072
plt.boxplot(df.num_rides)
{'boxes': [<matplotlib.lines.Line2D at 0x1223a4588>],
 'caps': [<matplotlib.lines.Line2D at 0x1223ba518>,
  <matplotlib.lines.Line2D at 0x1223cf208>],
 'fliers': [<matplotlib.lines.Line2D at 0x120348cf8>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x11f8f9c18>],
 'whiskers': [<matplotlib.lines.Line2D at 0x1223a44a8>,
  <matplotlib.lines.Line2D at 0x12142ec18>]}

png

# There are a small number of stations with many more rides; where are they? Plot on map?

To-do:

  • fit seasonal cycle and remove to examine anomalies
# How many rides were taken by men vs women?
df = pd.read_sql_query("SELECT gender,count(*) as num_rides FROM rides GROUP BY gender ORDER BY num_rides DESC", con)
df.head()
# write a separate function to read the data and write to sql in chunks, see if it is faster.