-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
271 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,271 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"_cell_guid": "b1b1c30b-5503-4896-a6ae-3fde8c58a857", | ||
"_execution_state": "idle", | ||
"_uuid": "6253fc3613cae37331065f8a7b510ab3ee469e8a" | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/n/home12/rebecca.krall/.conda/envs/py3xg/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n", | ||
" \"This module will be removed in 0.20.\", DeprecationWarning)\n", | ||
"/n/home12/rebecca.krall/.conda/envs/py3xg/lib/python3.6/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n", | ||
" DeprecationWarning)\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"fastest_routes_test.csv\n", | ||
"fastest_routes_train_part_1.csv\n", | ||
"fastest_routes_train_part_2.csv\n", | ||
"test.csv\n", | ||
"train.csv\n", | ||
"weather_data_nyc_centralpark_2016.csv\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import numpy as np \n", | ||
"import pandas as pd\n", | ||
"import seaborn as sns\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"from sklearn.model_selection import ShuffleSplit\n", | ||
"from sklearn.model_selection import cross_val_score\n", | ||
"from sklearn.grid_search import GridSearchCV\n", | ||
"import xgboost\n", | ||
"%matplotlib inline\n", | ||
"\n", | ||
"from subprocess import check_output\n", | ||
"print(check_output([\"ls\", \"input/\"]).decode(\"utf8\"))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## XGBoost" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/n/home12/rebecca.krall/.conda/envs/py3xg/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (18,19) have mixed types. Specify dtype option on import or set low_memory=False.\n", | ||
" interactivity=interactivity, compiler=compiler, result=result)\n", | ||
"/n/home12/rebecca.krall/.conda/envs/py3xg/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (27,28) have mixed types. Specify dtype option on import or set low_memory=False.\n", | ||
" interactivity=interactivity, compiler=compiler, result=result)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"df = pd.read_csv('train_data.csv')\n", | ||
"tdf = pd.read_csv('test_data.csv')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"features = df[['vendor_id','passenger_count','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','pu_hour','wday','month','workday','precipitation','snowfall','snowdepth','total_distance','total_travel_time','jfk','lga']]\n", | ||
"target = df['trip_duration']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"tfeatures = tdf[['vendor_id','passenger_count','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','pu_hour','wday','month','workday','precipitation','snowfall','snowdepth','total_distance','total_travel_time','jfk','lga']]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def rmsle(evaluator,X,real):\n", | ||
" sum = 0.0\n", | ||
" predicted = evaluator.predict(X)\n", | ||
" print(\"Number predicted less than 0: {}\".format(np.where(predicted < 0)[0].shape))\n", | ||
"\n", | ||
" predicted[predicted < 0] = 0\n", | ||
" for x in range(len(predicted)):\n", | ||
" p = np.log(predicted[x]+1)\n", | ||
" r = np.log(real[x]+1)\n", | ||
" sum = sum + (p-r)**2\n", | ||
" return (sum/len(predicted))**0.5" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}\n", | ||
"# ind_params = {'learning_rate': 0.08, 'n_estimators': 100, 'seed':0, 'subsample': 0.75, 'colsample_bytree': 1}\n", | ||
"# optimized_GBM = GridSearchCV(xgboost.XGBRegressor(**ind_params), \n", | ||
"# cv_params,scoring = rmsle, cv =4) \n", | ||
"# optimized_GBM.fit(features, np.ravel(target))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# print(optimized_GBM.grid_scores_)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Number predicted less than 0: (3,)\n", | ||
"Number predicted less than 0: (1,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (1,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (2,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (1,)\n", | ||
"Number predicted less than 0: (1,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n", | ||
"Number predicted less than 0: (0,)\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"GridSearchCV(cv=4, error_score='raise',\n", | ||
" estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,\n", | ||
" learning_rate=0.1, max_delta_step=0, max_depth=7,\n", | ||
" min_child_weight=1, missing=None, n_estimators=100, nthread=-1,\n", | ||
" objective='reg:linear', reg_alpha=0, reg_lambda=1,\n", | ||
" scale_pos_weight=1, seed=0, silent=True, subsample=1),\n", | ||
" fit_params={}, iid=True, n_jobs=1,\n", | ||
" param_grid={'learning_rate': [0.1, 0.01], 'subsample': [0.7, 0.8, 0.9]},\n", | ||
" pre_dispatch='2*n_jobs', refit=True,\n", | ||
" scoring=<function rmsle at 0x2b9697e48ae8>, verbose=0)" | ||
] | ||
}, | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7,0.8,0.9]}\n", | ||
"ind_params = {'n_estimators': 100, 'seed':0, 'colsample_bytree': 1, \n", | ||
" 'max_depth': 7, 'min_child_weight': 1}\n", | ||
"\n", | ||
"\n", | ||
"optimized_GBM = GridSearchCV(xgboost.XGBRegressor(**ind_params), \n", | ||
" cv_params,scoring = rmsle, cv =4) \n", | ||
"optimized_GBM.fit(features, np.ravel(target))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[mean: 0.34337, std: 0.00300, params: {'learning_rate': 0.1, 'subsample': 0.7}, mean: 0.34296, std: 0.00323, params: {'learning_rate': 0.1, 'subsample': 0.8}, mean: 0.34331, std: 0.00343, params: {'learning_rate': 0.1, 'subsample': 0.9}, mean: 0.52318, std: 0.00591, params: {'learning_rate': 0.01, 'subsample': 0.7}, mean: 0.52311, std: 0.00586, params: {'learning_rate': 0.01, 'subsample': 0.8}, mean: 0.52301, std: 0.00591, params: {'learning_rate': 0.01, 'subsample': 0.9}]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(optimized_GBM.grid_scores_)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"The best parameters from both of the grid searches are:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"params = {'n_estimators': 100, 'seed':0, 'colsample_bytree': 1, \n", | ||
" 'max_depth': 7, 'min_child_weight': 1,'learning_rate': 0.1, 'subsample': 0.8}" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python [default]", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 1 | ||
} |