diff --git a/taxi_dashboard.ipynb b/taxi_dashboard.ipynb index 5e98e02..1b95b08 100644 --- a/taxi_dashboard.ipynb +++ b/taxi_dashboard.ipynb @@ -1258,7 +1258,20 @@ "text/html": [ "\n", "\n", - "\n", "\n", "\n", @@ -2300,7 +2300,7 @@ { "data": { "text/plain": [ - "'/home/winston/turbo-telegram/data/nyc_taxi_jan15.csv'" + "'/home/winston/turbo-telegram/data/nyc_taxi_*'" ] }, "execution_count": 1, @@ -2337,7 +2337,7 @@ "# tag path to this notebook\n", "cwd = os.getcwd() \n", "# add taxi wildcard to complete file path of data\n", - "path = cwd + '/' + 'data/nyc_taxi_jan15.csv'\n", + "path = cwd + '/' + 'data/nyc_taxi_*'\n", "# how's it look?\n", "path" ] @@ -2392,106 +2392,97 @@ " mta_tax\n", " tip_amount\n", " tolls_amount\n", - " improvement_surcharge\n", " total_amount\n", " \n", " \n", " \n", " \n", - " 697498\n", - " 2\n", - " 2015-01-31 20:46:12\n", - " 2015-01-31 20:57:49\n", + " 1911983\n", " 1\n", - " 2.31\n", - " -8.237828e+06\n", - " 4.972035e+06\n", + " 2015-02-02 13:12:45\n", + " 2015-02-02 13:20:29\n", + " 1\n", + " 0.80\n", + " -8.233733e+06\n", + " 4.978858e+06\n", " 1\n", " N\n", - " -8.239360e+06\n", - " 4.969136e+06\n", + " -8.232789e+06\n", + " 4.980123e+06\n", " 2\n", - " 10.0\n", - " 0.5\n", + " 6.5\n", + " 0.0\n", " 0.5\n", " 0.0\n", " 0.0\n", - " 0.3\n", - " 11.3\n", + " 7.3\n", " \n", " \n", - " 697499\n", - " 1\n", - " 2015-01-31 21:04:48\n", - " 2015-01-31 21:20:53\n", - " 1\n", - " 3.10\n", - " -8.233061e+06\n", - " 4.978176e+06\n", + " 1911984\n", + " 2\n", + " 2015-02-12 21:39:55\n", + " 2015-02-12 21:47:53\n", + " 2\n", + " 1.04\n", + " -8.235069e+06\n", + " 4.976922e+06\n", " 1\n", " N\n", - " -8.236266e+06\n", - " 4.972510e+06\n", + " -8.233533e+06\n", + " 4.976631e+06\n", " 1\n", - " 13.0\n", + " 7.0\n", " 0.5\n", " 0.5\n", - " 1.0\n", + " 1.5\n", " 0.0\n", - " 0.3\n", - " 15.3\n", + " 9.8\n", " \n", " \n", - " 697500\n", - " 1\n", - " 2015-01-31 20:52:51\n", - " 2015-01-31 21:20:06\n", + " 1911985\n", + " 2\n", + " 2015-02-25 16:55:21\n", + " 2015-02-25 17:06:37\n", " 1\n", - " 6.90\n", - " -8.234850e+06\n", - " 4.981938e+06\n", + " 1.84\n", + " -8.234084e+06\n", + " 4.977811e+06\n", " 1\n", " N\n", - " -8.237785e+06\n", - " 4.971117e+06\n", - " 1\n", - " 25.5\n", - " 0.5\n", + " -8.231831e+06\n", + " 4.979868e+06\n", + " 2\n", + " 9.5\n", + " 1.0\n", " 0.5\n", - " 1.2\n", " 0.0\n", - " 0.3\n", - " 28.0\n", + " 0.0\n", + " 11.3\n", " \n", " \n", "\n", "" ], "text/plain": [ - " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", - "697498 2 2015-01-31 20:46:12 2015-01-31 20:57:49 1 \n", - "697499 1 2015-01-31 21:04:48 2015-01-31 21:20:53 1 \n", - "697500 1 2015-01-31 20:52:51 2015-01-31 21:20:06 1 \n", - "\n", - " trip_distance pickup_x pickup_y RateCodeID \\\n", - "697498 2.31 -8.237828e+06 4.972035e+06 1 \n", - "697499 3.10 -8.233061e+06 4.978176e+06 1 \n", - "697500 6.90 -8.234850e+06 4.981938e+06 1 \n", - "\n", - " store_and_fwd_flag dropoff_x dropoff_y payment_type \\\n", - "697498 N -8.239360e+06 4.969136e+06 2 \n", - "697499 N -8.236266e+06 4.972510e+06 1 \n", - "697500 N -8.237785e+06 4.971117e+06 1 \n", - "\n", - " fare_amount extra mta_tax tip_amount tolls_amount \\\n", - "697498 10.0 0.5 0.5 0.0 0.0 \n", - "697499 13.0 0.5 0.5 1.0 0.0 \n", - "697500 25.5 0.5 0.5 1.2 0.0 \n", - "\n", - " improvement_surcharge total_amount \n", - "697498 0.3 11.3 \n", - "697499 0.3 15.3 \n", - "697500 0.3 28.0 " + " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", + "1911983 1 2015-02-02 13:12:45 2015-02-02 13:20:29 1 \n", + "1911984 2 2015-02-12 21:39:55 2015-02-12 21:47:53 2 \n", + "1911985 2 2015-02-25 16:55:21 2015-02-25 17:06:37 1 \n", + "\n", + " trip_distance pickup_x pickup_y RateCodeID \\\n", + "1911983 0.80 -8.233733e+06 4.978858e+06 1 \n", + "1911984 1.04 -8.235069e+06 4.976922e+06 1 \n", + "1911985 1.84 -8.234084e+06 4.977811e+06 1 \n", + "\n", + " store_and_fwd_flag dropoff_x dropoff_y payment_type \\\n", + "1911983 N -8.232789e+06 4.980123e+06 2 \n", + "1911984 N -8.233533e+06 4.976631e+06 1 \n", + "1911985 N -8.231831e+06 4.979868e+06 2 \n", + "\n", + " fare_amount extra mta_tax tip_amount tolls_amount total_amount \n", + "1911983 6.5 0.0 0.5 0.0 0.0 7.3 \n", + "1911984 7.0 0.5 0.5 1.5 0.0 9.8 \n", + "1911985 9.5 1.0 0.5 0.0 0.0 11.3 " ] }, "execution_count": 2, @@ -2505,7 +2496,7 @@ " 'int', 'str', 'float64', 'float64', 'int', 'float64', 'float64', \n", " 'float64', 'float64', 'float64', 'float64', 'float64']\n", "bc.create_table('big_taxi', path, dtype=col_types, header=0)\n", - "# what's it look like?\n", + "# what's it look like? \n", "bc.sql('select * from big_taxi').tail(3)" ] }, @@ -2524,7 +2515,6 @@ "source": [ "# build outputs (each will have it's own tab)\n", "output = widgets.Output()\n", - "Christmas_NYC = widgets.Output()\n", "plot_output = widgets.Output()\n", "hv_output = widgets.Output()\n", "hv_output2 = widgets.Output()\n", @@ -2591,44 +2581,19 @@ "default_location = dropdown_location.value\n", "\n", "def common_filtering(day, month, year, n_riders, tol_cost, location):\n", - " # sql: day, month, year, n_riders, tol_cost\n", - " # visual dataframe: month, year, n_riders, tol_cost\n", - " # plot: month, year, n_riders, tol_cost, location\n", - " \n", - " # check if input boxes are blank or not\n", - " if n_riders != '':\n", - " # and strip extra blank space from start & end\n", - " n_riders = n_riders.strip()\n", - " # check again that it's not blank\n", - " if n_riders != '':\n", - " # make sure there's a direction\n", - " if n_riders[0] not in '<>=':\n", - " # give user time to sort it\n", - " time.sleep(4) \n", - " # call it\n", - " return 'number of riders input must start with one of the following:\\n >\\n <\\n =\\n >=\\n <=\\nor be blank\\n\\nfor example: > 2 or = 3'\n", - " if tol_cost != '':\n", - " # e.g. ' ' = '' or '> 5 ' = '> 5'\n", - " tol_cost = tol_cost.strip()\n", - " # check again that it's not blank\n", - " if tol_cost != '':\n", - " # make sure there's a direction\n", - " if tol_cost[0] not in '<>=':\n", - " # give user some time to figure it out\n", - " time.sleep(4)\n", - " # we need a direction\n", - " return 'fare amount input must start with one of the following:\\n >\\n <\\n =\\n >=\\n <=\\nor be blank\\n\\nfor example: > 2 or = 3'\n", - " \n", + " \"\"\"sql: day, month, year, n_riders, tol_cost\n", + " visual dataframe: month, year, n_riders, tol_cost\n", + " plot: month, year, n_riders, tol_cost, location\"\"\"\n", + " \n", " # clear outputs (replace instead of stacking, basically)\n", " output.clear_output()\n", - " Christmas_NYC.clear_output()\n", " plot_output.clear_output()\n", " hv_output.clear_output()\n", " hv_output2.clear_output()\n", - " \n", + "\n", " # start building our SQL query \n", " query = 'SELECT * FROM big_taxi'\n", - " \n", + "\n", " # has user focused specific days?\n", " if day != default_day:\n", " # go through days selected\n", @@ -2699,102 +2664,39 @@ " else:\n", " # add WHERE & total fare to query \n", " query += f' WHERE fare_amount {tol_cost}'\n", - " \n", + "\n", " # run query & deliver cuDF DataFrame of results\n", " query = query.replace('big_taxi', f'({time_table})')\n", " gdf = bc.sql(query).to_pandas()\n", - " \n", + "\n", " with output:\n", " print(f'{query}\\n') # print(gdf.info())\n", " # trim down columns to display \n", " focused = ['passenger_count', 'trip_distance', 'VendorID', 'RateCodeID', 'payment_type', \n", " 'store_and_fwd_flag', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', \n", - " 'tolls_amount', 'improvement_surcharge', 'total_amount', 'tpep_pickup_datetime']\n", + " 'tolls_amount', 'total_amount', 'tpep_pickup_datetime']\n", " left_out = 'tpep_dropoff_datetime, pickup_x, pickup_y, dropoff_x, dropoff_y'\n", " print(f'DATAFRAME SAMPLE\\nexcluding columns: {left_out}')\n", " display(gdf[focused].sample(21))\n", - " \n", - " with Christmas_NYC:\n", - " map_tiles = EsriImagery().opts(alpha=0.5, width=900, height=480, bgcolor='red')\n", - " # check viability of graph\n", - " if len(gdf) >= 5000:\n", - " if dropdown_location.value == 'Pickup Location':\n", - " points = hv.Points(gdf, ['pickup_x', 'pickup_y'])\n", - " else:\n", - " points = hv.Points(gdf, ['dropoff_x', 'dropoff_y'])\n", - " taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=['green', 'white'], width=900, height=480)\n", - " display(map_tiles * taxi_trips)\n", - " # no dataframe edge case\n", - " elif len(gdf) == 0:\n", - " print(f'NO DATAFRAME TO DISPLAY\\nlen(gdf) == {len(gdf)}')\n", - " display(gdf)\n", - " # hyper-focused edge case \n", - " else: # in danger of unintrepretable map\n", - " # what's going on?\n", - " print(f'DISPLAY WARNING: len(gdf) < 5000\\n len(gdf) == {len(gdf)}')\n", - " extras = 5000 // len(gdf) # how short are we?\n", - " print(f'Graph will display DataFrame copied {1 + extras} times to saturate points')\n", - " print('Actual DataFrame WILL NOT change, this DOES NOT apply to other graphs')\n", - " # make temp dataframe (copied/saturated dataframe)\n", - " temp_df = pd.concat([gdf] + [gdf.copy() for i in range(1 + extras)], axis=0)\n", - " # and then we call the kdeplot method of seaborn by passing the number of visits:\n", - " if dropdown_location.value == 'Pickup Location':\n", - " points = hv.Points(temp_df, ['pickup_x', 'pickup_y'])\n", - " else:\n", - " points = hv.Points(temp_df, ['dropoff_x', 'dropoff_y'])\n", - " taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=900, height=480)\n", - " # forget temp\n", - " temp_df = ''\n", - " display(map_tiles * taxi_trips)\n", - " # erase temp dataframe\n", - " temp_df = '' \n", - " # display NYC coord map \n", - " display(tf.set_background(tf.shade(agg, cmap=fire), \"black\")) \n", + " # nyc map \n", " with plot_output:\n", + " # determine pickup location \n", + " if dropdown_location.value == 'Pickup Location':\n", + " points = hv.Points(gdf, ['pickup_x', 'pickup_y'])\n", + " else:\n", + " points = hv.Points(gdf, ['dropoff_x', 'dropoff_y'])\n", + " # graph it\n", + " taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=900, height=480)\n", + " # overlay NYC map & display the graph\n", " map_tiles = EsriImagery().opts(alpha=0.5, width=900, height=480, bgcolor='black')\n", - " # check viability of graph\n", - " if len(gdf) >= 5000:\n", - " # determine pickup location \n", - " if dropdown_location.value == 'Pickup Location':\n", - " points = hv.Points(gdf, ['pickup_x', 'pickup_y'])\n", - " else:\n", - " points = hv.Points(gdf, ['dropoff_x', 'dropoff_y'])\n", - " # graph it\n", - " taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=900, height=480)\n", - " display(map_tiles * taxi_trips)\n", - " # no dataframe edge case\n", - " elif len(gdf) == 0:\n", - " print(f'NO DATAFRAME TO DISPLAY\\nlen(gdf) == {len(gdf)}')\n", - " display(gdf)\n", - " # hyper-focused edge case \n", - " else: # in danger of unintrepretable map\n", - " # what's going on?\n", - " print(f'DISPLAY WARNING: len(gdf) < 5000\\n len(gdf) == {len(gdf)}')\n", - " extras = 5000 // len(gdf) # how short are we?\n", - " print(f'Graph will display DataFrame copied {1 + extras} times to saturate points')\n", - " print('Actual DataFrame WILL NOT change, this DOES NOT apply to other graphs')\n", - " # make temp dataframe (copied/saturated dataframe)\n", - " temp_df = pd.concat([gdf] + [gdf.copy() for i in range(1 + extras)], axis=0)\n", - " # and then we call the kdeplot method of seaborn by passing the number of visits:\n", - " if dropdown_location.value == 'Pickup Location':\n", - " points = hv.Points(temp_df, ['pickup_x', 'pickup_y'])\n", - " else:\n", - " points = hv.Points(temp_df, ['dropoff_x', 'dropoff_y'])\n", - " # graph it\n", - " taxi_trips = datashade(points, x_sampling=1, y_sampling=1, cmap=fire, width=900, height=480)\n", - " # forget temp\n", - " temp_df = ''\n", - " display(map_tiles * taxi_trips)\n", - " # erase temp dataframe\n", - " temp_df = '' \n", - " # display NYC coord map \n", - " display(tf.set_background(tf.shade(agg, cmap=fire), \"black\")) \n", + " display(map_tiles * taxi_trips)\n", " # tip amount vs number of riders\n", " with hv_output:\n", " scatter = hv.Scatter(gdf, 'passenger_count', 'tip_amount')\n", - " ranged = scatter.redim.range(tip_amount=(0, 30), passenger_count=(-0.5, 6.5))\n", + " ranged = scatter.redim.range(tip_amount=(0, 60), passenger_count=(-0.5, 6.5))\n", " shaded = hd.spread(hd.datashade(ranged, x_sampling=0.15))\n", " labelled = shaded.redim.label(passenger_count=\"Passengers\", tip_amount=\"Tip ($)\")\n", + " # let's see\n", " display(labelled)\n", " # trip distance vs tip amount\n", " with hv_output2:\n", @@ -2802,27 +2704,66 @@ " ranged = scatter.redim.range(trip_distance=(0, 20), tip_amount=(0, 20))\n", " shaded = hd.spread(hd.datashade(ranged))\n", " labelled = shaded.redim.label(trip_distance=\"Trip Distance\", tip_amount=\"Tip ($)\")\n", + " # let's see\n", " display(labelled)\n", " \n", - "# The existing event handlers need to be adjusted \n", + " \n", "def dropdown_day_eventhandler(change):\n", " common_filtering(change.new, dropdown_month.value, dropdown_year.value, dropdown_riders.value, \n", " dropdown_fare.value, dropdown_location.value)\n", + " \n", "def dropdown_month_eventhandler(change):\n", " common_filtering(dropdown_day.value, change.new, dropdown_year.value, dropdown_riders.value, \n", " dropdown_fare.value, dropdown_location.value)\n", + " \n", "def dropdown_year_eventhandler(change):\n", " common_filtering(dropdown_day.value, dropdown_month.value, change.new, dropdown_riders.value, \n", " dropdown_fare.value, dropdown_location.value)\n", + " \n", "def dropdown_riders_eventhandler(change):\n", - " common_filtering(dropdown_day.value, dropdown_month.value, dropdown_year.value, change.new.replace('\"','').replace(\"'\",''), \n", - " dropdown_fare.value, dropdown_location.value) \n", + " # adjust change to remove errors \n", + " change = change.new.strip().replace('\"','').replace(\"'\",'')\n", + " # check change to make sure it works\n", + " valid = True\n", + " # is input blank \n", + " if change != '':\n", + " # make sure there's a direction\n", + " if change[0] not in '<>=':\n", + " # query not ready yet\n", + " valid = False\n", + " # make sure there's a number\n", + " if change[-1] not in '1234567890':\n", + " # query not ready yet\n", + " valid = False\n", + " # is the query ready?\n", + " if valid == True:\n", + " common_filtering(dropdown_day.value, dropdown_month.value, dropdown_year.value, \n", + " change, dropdown_fare.value, dropdown_location.value) \n", + " \n", "def dropdown_fare_eventhandler(change):\n", - " common_filtering(dropdown_day.value, dropdown_month.value, dropdown_year.value, dropdown_riders.value, \n", - " change.new.replace('\"','').replace(\"'\",''), dropdown_location.value) \n", + " # adjust change to remove errors \n", + " change = change.new.strip().replace('\"','').replace(\"'\",'')\n", + " # check change to make sure it works\n", + " valid = True\n", + " # is input blank \n", + " if change != '':\n", + " # make sure there's a direction\n", + " if change[0] not in '<>=':\n", + " # query not ready yet\n", + " valid = False\n", + " # make sure there's a number\n", + " if change[-1] not in '1234567890':\n", + " # query not ready yet\n", + " valid = False\n", + " # is the query ready?\n", + " if valid == True:\n", + " common_filtering(dropdown_day.value, dropdown_month.value, dropdown_year.value, \n", + " dropdown_riders.value, change, dropdown_location.value) \n", + " \n", "def dropdown_location_eventhandler(change):\n", " common_filtering(dropdown_day.value, dropdown_month.value, dropdown_year.value, dropdown_riders.value, \n", " dropdown_fare.value, change.new) \n", + " \n", "\n", "# We bind the handlers to the dropdowns, and that’s it!\n", "dropdown_day.observe(dropdown_day_eventhandler, names='value')\n", @@ -2832,6 +2773,7 @@ "dropdown_fare.observe(dropdown_fare_eventhandler, names='value')\n", "dropdown_location.observe(dropdown_location_eventhandler, names='value')\n", "\n", + "\n", "def input_dashboard():\n", " # The HBox will add widgets to it one at a time from left-to-right:\n", " input_widgets = widgets.HBox([dropdown_day, dropdown_month, dropdown_year])\n", @@ -2842,12 +2784,11 @@ " \n", "def display_dashboard():\n", " # dashboard outputs\n", - " tab = widgets.Tab([output, Christmas_NYC, plot_output, hv_output, hv_output2])\n", + " tab = widgets.Tab([output, plot_output, hv_output, hv_output2])\n", " tab.set_title(0, 'Dataset Exploration')\n", - " tab.set_title(1, 'Christmas Map')\n", - " tab.set_title(2, 'Coordinate Map')\n", - " tab.set_title(3, 'Tips vs # Riders')\n", - " tab.set_title(4, 'Tips vs Distance')\n", + " tab.set_title(1, 'Coordinate Map')\n", + " tab.set_title(2, 'Tips vs # Riders')\n", + " tab.set_title(3, 'Tips vs Distance')\n", " # display outputs \n", " display(tab)" ] @@ -2867,7 +2808,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8203f2502706466ba1023f6167b80fcf", + "model_id": "542e1862ca0144549585a3ff55d35e62", "version_major": 2, "version_minor": 0 }, @@ -2881,7 +2822,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "283914e92b394ef989eec7e905e05748", + "model_id": "c55575a743dc4142be59f35c368249c1", "version_major": 2, "version_minor": 0 }, @@ -2894,6 +2835,7 @@ } ], "source": [ + "# to fix: running query when incomplete\n", "input_dashboard()" ] }, @@ -2912,12 +2854,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c202eeed1770411ea5e93e39691f064e", + "model_id": "5539e063239841b49dc3245e86b84b1b", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Tab(children=(Output(), Output(), Output(), Output(), Output()), _titles={'0': 'Dataset Exploration', '1': 'Ch…" + "Tab(children=(Output(), Output(), Output(), Output()), _titles={'0': 'Dataset Exploration', '1': 'Coordinate M…" ] }, "metadata": {}, @@ -2945,7 +2887,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.6" } }, "nbformat": 4,