Skip to content

Commit

Permalink
2nd XGB grid search
Browse files Browse the repository at this point in the history
  • Loading branch information
rebeccak1 authored Sep 23, 2017
1 parent 878bf7b commit 8f25bb1
Showing 1 changed file with 271 additions and 0 deletions.
271 changes: 271 additions & 0 deletions xgb2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"_cell_guid": "b1b1c30b-5503-4896-a6ae-3fde8c58a857",
"_execution_state": "idle",
"_uuid": "6253fc3613cae37331065f8a7b510ab3ee469e8a"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/n/home12/rebecca.krall/.conda/envs/py3xg/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n",
"/n/home12/rebecca.krall/.conda/envs/py3xg/lib/python3.6/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
" DeprecationWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"fastest_routes_test.csv\n",
"fastest_routes_train_part_1.csv\n",
"fastest_routes_train_part_2.csv\n",
"test.csv\n",
"train.csv\n",
"weather_data_nyc_centralpark_2016.csv\n",
"\n"
]
}
],
"source": [
"import numpy as np \n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import ShuffleSplit\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.grid_search import GridSearchCV\n",
"import xgboost\n",
"%matplotlib inline\n",
"\n",
"from subprocess import check_output\n",
"print(check_output([\"ls\", \"input/\"]).decode(\"utf8\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## XGBoost"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/n/home12/rebecca.krall/.conda/envs/py3xg/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (18,19) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n",
"/n/home12/rebecca.krall/.conda/envs/py3xg/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (27,28) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
}
],
"source": [
"df = pd.read_csv('train_data.csv')\n",
"tdf = pd.read_csv('test_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"features = df[['vendor_id','passenger_count','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','pu_hour','wday','month','workday','precipitation','snowfall','snowdepth','total_distance','total_travel_time','jfk','lga']]\n",
"target = df['trip_duration']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tfeatures = tdf[['vendor_id','passenger_count','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','pu_hour','wday','month','workday','precipitation','snowfall','snowdepth','total_distance','total_travel_time','jfk','lga']]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def rmsle(evaluator,X,real):\n",
" sum = 0.0\n",
" predicted = evaluator.predict(X)\n",
" print(\"Number predicted less than 0: {}\".format(np.where(predicted < 0)[0].shape))\n",
"\n",
" predicted[predicted < 0] = 0\n",
" for x in range(len(predicted)):\n",
" p = np.log(predicted[x]+1)\n",
" r = np.log(real[x]+1)\n",
" sum = sum + (p-r)**2\n",
" return (sum/len(predicted))**0.5"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}\n",
"# ind_params = {'learning_rate': 0.08, 'n_estimators': 100, 'seed':0, 'subsample': 0.75, 'colsample_bytree': 1}\n",
"# optimized_GBM = GridSearchCV(xgboost.XGBRegressor(**ind_params), \n",
"# cv_params,scoring = rmsle, cv =4) \n",
"# optimized_GBM.fit(features, np.ravel(target))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# print(optimized_GBM.grid_scores_)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number predicted less than 0: (3,)\n",
"Number predicted less than 0: (1,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (1,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (2,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (1,)\n",
"Number predicted less than 0: (1,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n",
"Number predicted less than 0: (0,)\n"
]
},
{
"data": {
"text/plain": [
"GridSearchCV(cv=4, error_score='raise',\n",
" estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,\n",
" learning_rate=0.1, max_delta_step=0, max_depth=7,\n",
" min_child_weight=1, missing=None, n_estimators=100, nthread=-1,\n",
" objective='reg:linear', reg_alpha=0, reg_lambda=1,\n",
" scale_pos_weight=1, seed=0, silent=True, subsample=1),\n",
" fit_params={}, iid=True, n_jobs=1,\n",
" param_grid={'learning_rate': [0.1, 0.01], 'subsample': [0.7, 0.8, 0.9]},\n",
" pre_dispatch='2*n_jobs', refit=True,\n",
" scoring=<function rmsle at 0x2b9697e48ae8>, verbose=0)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7,0.8,0.9]}\n",
"ind_params = {'n_estimators': 100, 'seed':0, 'colsample_bytree': 1, \n",
" 'max_depth': 7, 'min_child_weight': 1}\n",
"\n",
"\n",
"optimized_GBM = GridSearchCV(xgboost.XGBRegressor(**ind_params), \n",
" cv_params,scoring = rmsle, cv =4) \n",
"optimized_GBM.fit(features, np.ravel(target))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[mean: 0.34337, std: 0.00300, params: {'learning_rate': 0.1, 'subsample': 0.7}, mean: 0.34296, std: 0.00323, params: {'learning_rate': 0.1, 'subsample': 0.8}, mean: 0.34331, std: 0.00343, params: {'learning_rate': 0.1, 'subsample': 0.9}, mean: 0.52318, std: 0.00591, params: {'learning_rate': 0.01, 'subsample': 0.7}, mean: 0.52311, std: 0.00586, params: {'learning_rate': 0.01, 'subsample': 0.8}, mean: 0.52301, std: 0.00591, params: {'learning_rate': 0.01, 'subsample': 0.9}]\n"
]
}
],
"source": [
"print(optimized_GBM.grid_scores_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The best parameters from both of the grid searches are:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"params = {'n_estimators': 100, 'seed':0, 'colsample_bytree': 1, \n",
" 'max_depth': 7, 'min_child_weight': 1,'learning_rate': 0.1, 'subsample': 0.8}"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

0 comments on commit 8f25bb1

Please sign in to comment.