# BarleyBrain.py
import cPickle as pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
import matplotlib as mpl
mpl.use('agg')  # non-interactive backend so figures can be saved without a display
from matplotlib import pyplot as plt
import seaborn as sns
from src.process_weather import *       # expected to provide PROC_WEATHER_FILE used below
from src.util_download_yields import *
# ===========================================================
# ====== Key file locations =================================
# ===========================================================
RAW_WEATHER_FILE = 'data/weather/weather_all.csv'
# RAW_YIELD = "data/yield/yield_raw_data_all.csv"
PROCESSED_YIELD = "data/yield/yield_proc_data.csv"
COMBINED = 'data/combined.csv'
IMAGES_FOLDER = 'images/'
# ===========================================================
# ====== Key CONSTANTS ======================================
# ===========================================================
TRAIN = True        # ==> True: fit a new model; False: load a pickled model and score it
MIN_YEAR = 2010     # ==> earliest year of the training window
MAX_YEAR = 2016     # ==> last year of available data
TEST_SPLIT = 2016   # ==> year of the train/test split
STATE = 'ALL'       # ==> 'ALL' is ['CO', 'MT', 'ID', 'WY']
if not TRAIN:
    # in scoring mode, collapse the window to the held-out years only
    MIN_YEAR = TEST_SPLIT       # ==> 2016
    TEST_SPLIT = MAX_YEAR       # ==> 2016
def weather_dates(df, MIN_YEAR, MAX_YEAR):
    '''
    :param df: dataframe containing location-specific weather
    :param MIN_YEAR: start of the training set (e.g. 2010)
    :param MAX_YEAR: end of the test set
    :return: the pre-processed dataframe
    '''
    # derive the date columns needed later from the unix timestamp
    df['dates'] = pd.to_datetime(df['apparentTemperatureMaxTime'], unit='s')
    df['year'] = [x.year for x in df['dates']]
    df['month'] = [x.month for x in df['dates']]
    df.drop('aDate', axis=1, inplace=True)
    df['county'] = [x.lower() for x in df['county']]
    # remove timestamp columns that are no longer needed, plus junk columns
    df_times = df.filter(like='Time', axis=1).columns.tolist()
    df_unnamed = df.filter(like='Unnamed', axis=1).columns.tolist()
    df.drop(df_times, axis=1, inplace=True)
    df.drop(df_unnamed, axis=1, inplace=True)
    df.drop(['moonPhase', 'key', 'time', 'precipType'], axis=1, inplace=True)
    # keep only the summer growing months (April through September)
    df = df.drop(df[df.month < 4].index)
    df = df.drop(df[df.month > 9].index)
    # keep only the years in the modeling window
    df = df.drop(df[df.year < MIN_YEAR].index)
    df = df.drop(df[df.year > MAX_YEAR].index)
    return df
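# NOTE: the column names handled above (apparentTemperatureMaxTime, moonPhase,
# precipType, cloudCover, ...) match the Dark Sky API's daily data-point fields,
# which appears to be the source of weather_all.csv; that is an assumption, not
# confirmed in this file. A quick sanity check on the filtering, for example:
#   assert df['month'].between(4, 9).all()
#   assert df['year'].between(MIN_YEAR, MAX_YEAR).all()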
def yield_dates(df, MIN_YEAR, TEST_SPLIT):
    '''
    :param df: pandas df of downloaded yield data
    :param MIN_YEAR: start of the training set (e.g. 2010)
    :param TEST_SPLIT: end of the window (last year kept)
    :return: the pre-processed dataframe
    '''
    # remove unneeded timestamps, empty columns, etc.
    df.dropna(axis=1, how='all', inplace=True)
    df['county_name'] = df['county_name'].apply(lambda x: x.lower())
    df.rename(columns={'county_name': 'county', 'Value': 'cyield', 'state_alpha': 'state'}, inplace=True)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    # keep only the years in the modeling window
    df = df.drop(df[df.year < MIN_YEAR].index)
    df = df.drop(df[df.year > TEST_SPLIT].index)
    return df
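# NOTE: 'county_name', 'state_alpha' and 'Value' are field names returned by the
# USDA NASS Quick Stats API, which util_download_yields presumably pulls from;
# they are renamed here to the short forms ('county', 'state', 'cyield') that the
# merge in join_weather_yield() keys on.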
def join_weather_yield(wdf, ydf):
    '''
    :param wdf: dataframe of weather data
    :param ydf: dataframe of yield information
    :return: combined weather/yield df joined on state, county, year
    '''
    wdf.drop('dates', axis=1, inplace=True)
    wdf['county'] = wdf['county'].apply(str.lower)
    DROP_LIST = ['visibility', 'lat', 'long', 'month', 'windBearing', 'precipProbability', 'aveTemp',
                 'pressure', 'precipIntensityMax', 'cloudCover', 'temperatureMax', 'windSpeed',
                 'apparentTemperatureMax']
    # ======== add new features here =========================
    wdf['aveTemp'] = (wdf['temperatureMax'] + wdf['temperatureMin']) / 2
    wdf['temp_delta'] = (wdf['temperatureMax'] - wdf['temperatureMin'])
    wdf['days_over42'] = wdf['temperatureMax'].map(lambda x: is_over(x, 42))
    wdf['days_under0'] = wdf['temperatureMin'].map(lambda x: is_under(x, -20))  # NB: despite the name, counts days below -20
    wdf['days_over32'] = wdf['temperatureMax'].map(lambda x: is_over(x, 32))
    wdf['days_under_n10'] = wdf['temperatureMin'].map(lambda x: is_under(x, -10))
    wdf['precip'] = wdf['precipIntensity'].map(lambda x: is_over(x, 0.0))
    # === test: see if the weather window needs narrowing before averaging
    # wdf = wdf.drop(wdf[wdf.month < 4].index)
    # wdf = wdf.drop(wdf[wdf.month > 9].index)
    wdf.drop(DROP_LIST, axis=1, inplace=True)
    # ======= build the sum/mean aggregation map for the state, county, year groupby
    agg_dict = dict()
    cols_list = list(wdf.columns.tolist())
    group_by_columns = ['county', 'state', 'year']
    cols_to_sum = ['days_under0', 'days_over32', 'days_over42', 'days_under_n10', 'precipAccumulation', 'precip']
    cols_to_aggregate = [x for x in cols_list if x not in group_by_columns]
    for i in cols_to_aggregate:
        if i in cols_to_sum:
            agg_dict[i] = np.sum
        else:
            agg_dict[i] = np.mean
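    # The aggregation map sends the 0/1 indicator features (and precipitation
    # accumulation) through np.sum, turning per-day flags into season counts and
    # totals, while every remaining continuous measurement is averaged over the
    # growing season. A tiny illustration of the idea (illustrative values only):
    #   daily days_over32 = [0, 1, 1]    -> sum  -> 2 hot days that season
    #   daily humidity    = [.2, .4, .6] -> mean -> 0.4 season average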
print "Fixing resulting NAs and missing data"
wdf_years = wdf.groupby(group_by_columns).agg(agg_dict)
# fix nans in precip accumulation for days of no accumulation
wdf_years['precipAccumulation'].fillna(0, inplace=True)
# save the processed weather file for later use
wdf_years.to_csv(PROC_WEATHER_FILE)
wdf_years = pd.read_csv(PROC_WEATHER_FILE)
# ========== finish the merge and return the dataframe ==============
ydf.groupby(group_by_columns).mean()
df_merged = pd.merge(ydf, wdf_years, how='left', on=['county', 'state', 'year'])
df_merged.dropna(axis=0, how='any', inplace=True)
return df_merged
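# NOTE: the left merge keeps every yield row and attaches the matching season of
# aggregated weather; county/year combinations with no weather match come back
# as NaN and are then removed by the dropna, so the returned frame contains only
# rows with both a yield observation and a full weather record.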
def is_over(x, val):
    '''
    simple helper function for counting...
    :param x: value of the data point
    :param val: comparison value
    :return: 1 if x is greater than val, 0 otherwise
    '''
    if x > val:
        return 1
    else:
        return 0


def is_under(x, val):
    '''
    simple helper function for counting...
    :param x: value of the data point
    :param val: comparison value
    :return: 1 if x is less than val, 0 otherwise
    '''
    if x < val:
        return 1
    else:
        return 0
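# NOTE: equivalent (and faster) vectorized forms exist for both helpers, e.g.
#   wdf['days_over32'] = (wdf['temperatureMax'] > 32).astype(int)
# The .map(lambda ...) calls above are kept as written; this is just a sketch of
# the alternative.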
def my_ada(X, y, m_depth=3, n_est=28, rnd=None, lr=0.8225):
    '''
    quick wrapping function for sklearn AdaBoost analysis
    :param X: features
    :param y: target
    :param m_depth: max depth of the decision-tree 'stumps'
    :param n_est: number of estimators to use
    :param rnd: sets the random seed if desired
    :param lr: learning rate
    :return: the fully fitted model
    '''
    reg_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=m_depth),
                                n_estimators=n_est, random_state=rnd, learning_rate=lr)
    reg_ada.fit(X, y)
    return reg_ada
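# A minimal usage sketch (illustrative names, not part of the pipeline):
#   model = my_ada(X_train, y_train, m_depth=3, n_est=50, rnd=42, lr=1.0)
#   y_hat = model.predict(X_test)
# AdaBoostRegressor boosts the shallow trees sequentially, re-weighting the
# training examples toward those with the largest errors at each round.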
def feature_importance(cols, feat_imps):
    '''
    assign feature names to the feature-importance values from sklearn
    :param cols: list of the feature names
    :param feat_imps: array of the corresponding feature-importance values
    :return: a consolidated pandas DF of the features and their importances
    '''
    df = pd.DataFrame()
    df['feat'] = cols
    df['imp'] = feat_imps
    df.sort_values('imp', ascending=False, inplace=True)
    df = df.loc[df['imp'] > 0]
    df = df.reset_index(drop=True)
    return df
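# Usage sketch (mirrors the call in __main__ below):
#   feat_imp = feature_importance(X.columns.tolist(), reg_ada.feature_importances_)
#   print feat_imp.head(10)   # top-10 weather features driving the yield model
# Features with zero importance are filtered out, so the frame only lists
# predictors the boosted trees actually split on.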
if __name__ == '__main__':
    print 'Loading raw weather data....'
    df_weather = weather_dates(pd.read_csv(RAW_WEATHER_FILE), MIN_YEAR, TEST_SPLIT)
    print 'Loading yield files.........'
    df_yield = yield_dates(pd.read_csv(PROCESSED_YIELD), MIN_YEAR, TEST_SPLIT)
    print 'Combining the two files for analysis'
    df_join = join_weather_yield(df_weather, df_yield)
    df_join.to_csv(COMBINED)
    # Code block for fine-tuning and EDA experimentation
    # ==============================================
    _dummy = df_join[df_join['cyield'] >= 45]       # trim low-yield outliers
    _dummy = _dummy[_dummy['cyield'] <= 170]        # trim high-yield outliers
    if STATE != 'ALL':
        _dummy = _dummy[_dummy.state != 'ID']       # when modeling a subset of states, Idaho is excluded
    sfl = _dummy.copy()
    _dummy.pop('year')
    _dummy.pop('county')
    _dummy = pd.get_dummies(_dummy)
    # ==============================================
    # Prepping the y and X variables for input to AdaBoost
    # ==============================================
    y = _dummy.pop('cyield')
    X = _dummy
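    # NOTE: pd.get_dummies one-hot encodes whatever object/categorical columns
    # remain after 'year' and 'county' are popped (at minimum 'state'), so X ends
    # up fully numeric for sklearn. For example, state='CO' becomes the indicator
    # columns state_CO=1, state_MT=0, state_ID=0, state_WY=0.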
    # =============================================
    print '\n\n'
    print 'Running model on TRAIN? ', TRAIN
    if not TRAIN:
        # =========== Load pickle of training session ========
        with open('data/pickles/reg_ada_' + STATE + '_xtr.pkl', 'rb') as f:
            reg_ada = pickle.load(f)
    else:
        # =========== Run AdaBoost fit ================
        reg_ada = my_ada(X, y, m_depth=3, n_est=8, rnd=42, lr=1.28225)
        # =========== Do GRIDSEARCH analysis ================
        lr_range = np.linspace(0.1, 10, 100)
        n_est = [int(i) for i in np.linspace(5, 100, 20)]   # 20 candidate estimator counts
        params = {'n_estimators': n_est, 'learning_rate': lr_range}
        grid = GridSearchCV(estimator=reg_ada, param_grid=params, n_jobs=-1, return_train_score=True)
        grid.fit(X, y)
        _best_params = grid.best_params_
        print _best_params
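        # NOTE: grid.best_params_ is only printed; the scores below come from the
        # reg_ada fit above with hand-picked settings. To use the search result
        # instead, one could (a suggestion, not the original flow) do:
        #   reg_ada = grid.best_estimator_   # GridSearchCV refits on X, y by default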
    # predict and score
    y_ada = reg_ada.predict(X)
    ada_mse = mse(y, y_ada)
    print '\n\n'
    print 'TRAINING SET? ', TRAIN
    print 'STATES: %s Dates: %s --> %s' % (STATE, MIN_YEAR, TEST_SPLIT)
    print '================================================'
    print 'Adaboost r2:   ', reg_ada.score(X, y)
    print 'Adaboost MSE:  ', ada_mse
    print 'Adaboost RMSE: ', np.sqrt(ada_mse)
    print '================================================'
    # Extract table of most important features for later use if needed
    feat_imp = feature_importance(X.columns.tolist(), reg_ada.feature_importances_)
    # =========== Save pickle of training session ===========
    if TRAIN:
        with open('data/pickles/reg_ada_' + STATE + '_xtr.pkl', 'wb') as f:
            pickle.dump(reg_ada, f)
    # prep dataframe of inputs and predictions for plotting and later use
    res = sfl
    res['y_pred'] = y_ada
    # res.to_csv('data/forecast.csv', index=False)
    df_plt = res
    df_plt['y_true'] = y
    # plt.close('all')
    # # =========== Plot y_true vs y_predicted by state ===========
    # fig = plt.figure()
    # sns.regplot(x='y_true', y='y_pred', data=df_plt, label='state', marker='o', scatter_kws={'s': 80});
    # plt.title(STATE + ' regression performance', fontsize=30);
    # plt.xlabel('y_true (bu/acre)')
    # plt.ylabel('y_pred (bu/acre)')
    # plt.legend()
    # plt.savefig(IMAGES_FOLDER + 'model_yt_yp.jpg')
    #
    # # =========== Plot y_true vs y_predicted faceted by irig_flag ===========
    # g = sns.FacetGrid(res, col='irig_flag', hue='state')
    # g.map(sns.regplot, 'y_true', 'y_pred', label='state', marker='o', scatter_kws={'s': 80});
    # plt.xlim(0, 180)
    # plt.ylim(0, 180)
    # plt.legend()
    # plt.savefig(IMAGES_FOLDER + 'facet_irig_2.jpg')
    #
    # # =========== Plot y_true vs y_predicted faceted by state and irig_flag ===========
    # g = sns.FacetGrid(res, col='state', row='irig_flag', hue='irig_flag')
    # g.map(sns.regplot, 'y_true', 'y_pred', label='state', marker='o', scatter_kws={'s': 80});
    # plt.xlim(0, 180)
    # plt.ylim(0, 180)
    # plt.xticks(size=15)
    # plt.yticks(size=15)
    # plt.legend()
    # plt.savefig(IMAGES_FOLDER + 'facet_irig_1.jpg')
    #
    # fig = plt.figure(figsize=(10, 4))
    # plt.plot(res.y_true, marker='o', lw=0)
    # plt.plot(res.y_pred, lw=2)
    #
    # # plt.title(STATE + ' regression performance', fontsize=25);
    # plt.xlabel('data-point', fontsize=1)
    # plt.ylabel('yield (bu/acre)', fontsize=20)
    # plt.ylim([0, 180])
    # # plt.title('Model Performance', fontsize=30)
    # plt.savefig(IMAGES_FOLDER + 'model_performance.jpg')
    #
    # # generate the feature chart with the irig_flag and save it to file
    # feat_imp.set_index('feat')
    # fig = plt.subplot()
    # ax = sns.barplot(feat_imp.feat[:10], feat_imp.imp[:10], palette='Blues_d')
    # plt.xticks(rotation=30)
    # plt.savefig(IMAGES_FOLDER + 'features1.png')
    #
    # # generate the feature chart less the irig_flag and save it to file
    # feat_imp.set_index('feat')
    # fig = plt.subplot()
    # ax = sns.barplot(feat_imp.feat[1:10], feat_imp.imp[1:10], palette='Blues_d')
    # plt.xticks(rotation=30)
    # plt.savefig(IMAGES_FOLDER + 'features2.png')