forked from chosila/htoaa
htoaa_BDT2.py
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
import pickle
from info import fileNames, allVars
from data_manager import processData
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import train_test_split
from optparse import OptionParser

## hyperparameters are taken from the command line
parser = OptionParser()
parser.add_option("--ntrees", type="int", dest="ntrees", help="number of trees", default=1000)  # 1500
parser.add_option("--treeDeph", type="int", dest="treeDeph", help="maximum tree depth", default=2)  # 3
parser.add_option("--lr", type="float", dest="lr", help="learning rate", default=0.05)
parser.add_option("--mcw", type="float", dest="mcw", help="min child weight", default=1)
parser.add_option("--doXML", action="store_true", dest="doXML", help="save the trained classifier to a .pkl file", default=True)
(options, args) = parser.parse_args()

## string used to tag the output plots and the saved classifier
hyppar = "ntrees_" + str(options.ntrees) + "_deph_" + str(options.treeDeph) + "_mcw_" + str(options.mcw) + "_lr_" + str(options.lr)
print(hyppar)

## process each input file and concatenate everything into one dataframe
## containing all the signal and all the background (that I have)
## should I be concerned that 200to300 returns only 7 events after the
## selection cuts?
## (DataFrame.append is deprecated in recent pandas, so collect the per-file
## frames and concatenate them once)
frames = []
for fileName in fileNames:
    frames.append(processData(fileName))
data = pd.concat(frames, ignore_index=True, sort=False)

## drop all columns and rows that are all NaN, then zero-fill the rest
data = data.dropna(axis=1, how='all')
data = data.dropna(how='all')
data = data.fillna(0)

## get the training variable names (everything except the weight and target columns)
colNames = list(data.columns)
colNames = colNames[:-2]

## normalize the event weights so that signal and background each sum to the
## same total (1e5); otherwise the class with the larger total weight would
## dominate the training
data.loc[data['target'] == 0, ['weights']] *= 100000 / data.loc[data['target'] == 0]['weights'].sum()
data.loc[data['target'] == 1, ['weights']] *= 100000 / data.loc[data['target'] == 1]['weights'].sum()

## drop events with NaN weights - for safety
data.dropna(subset=['weights'], inplace=True)
data = data.fillna(0)

## split data into training and testing
randInt = 7
trainData, testData = train_test_split(data, random_state=randInt)

## training
cls = xgb.XGBClassifier(
    n_estimators=options.ntrees,
    max_depth=options.treeDeph,
    min_child_weight=options.mcw,  # min_samples_leaf
    learning_rate=options.lr,
    # n_estimators = 800,
    # max_depth = 2,
    # min_child_weight = 1,
    # learning_rate = 0.01
)
cls.fit(trainData[colNames], trainData['target'], sample_weight=trainData['weights'])
print("XGBoost trained")

## ROC curves and AUC for the train and test sets
proba = cls.predict_proba(trainData[colNames])
print('proba')
print(proba)
fpr, tpr, thresholds = roc_curve(trainData['target'], proba[:, 1])
train_auc = auc(fpr, tpr)
print("XGBoost train set auc - {}".format(train_auc))

proba = cls.predict_proba(testData[colNames])
fprt, tprt, thresholds = roc_curve(testData['target'], proba[:, 1])
test_auct = auc(fprt, tprt)
print("XGBoost test set auc - {}".format(test_auct))

prediction = cls.predict(testData[colNames])
accuracy = accuracy_score(testData['target'], prediction)
print("XGBoost test accuracy - {}".format(accuracy))

## draw them ROCs
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, lw=1, color='g', label='XGB train (area = %0.5f)' % (train_auc))
ax.plot(fprt, tprt, lw=1, ls='--', color='g', label='XGB test (area = %0.5f)' % (test_auct))
ax.set_ylim([0.0, 1.0])
ax.set_xlim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc="lower right")
ax.grid()
ax.set_title(hyppar)
fig.savefig("plots/%s_roc.png" % hyppar)

## make and fill signal/background distribution plots for each training variable
dataSig = data.loc[data.target == 1]  # this used to be .ix, but that is deprecated :(
dataBg = data.loc[data.target == 0]
print('dataSig shape: ' + str(dataSig.shape))
print('dataBg shape: ' + str(dataBg.shape))
for colName in colNames:
    hist_params = {'density': True, 'histtype': 'bar', 'fill': True, 'lw': 3, 'alpha': 0.4}
    nbins = 8
    ## common plotting range: 0th-99th percentile across signal and background
    min_valueS, max_valueS = np.percentile(dataSig[colName], [0.0, 99])
    min_valueB, max_valueB = np.percentile(dataBg[colName], [0.0, 99])
    range_local = (min(min_valueS, min_valueB), max(max_valueS, max_valueB))
    valuesS, binsS, _ = plt.hist(
        dataSig[colName].values,
        range=range_local,
        bins=nbins,
        edgecolor='b', color='b',
        label="Signal", **hist_params
    )
    to_ymax = max(valuesS)
    to_ymin = min(valuesS)
    valuesB, binsB, _ = plt.hist(
        dataBg[colName].values,
        range=range_local,
        bins=nbins,
        edgecolor='g', color='g',
        label="Background", **hist_params
    )
    to_ymax2 = max(valuesB)
    to_ymax = max([to_ymax2, to_ymax])
    to_ymin2 = min(valuesB)
    to_ymin = max([to_ymin2, to_ymin])
    plt.ylim(to_ymin * 0.1, to_ymax * 1.2)
    plt.legend(loc='best')
    plt.xlabel(colName)
    plt.savefig("distributions/plot_%s.png" % colName)
    plt.clf()

# ## feature importance plot
# fig, ax = plt.subplots()
# f_score_dict = cls.get_booster().get_fscore()
# #print("f_score_dict: {}".format(f_score_dict))
# ## okay, so think about what this line is doing:
# ## I think siddesh had this line because his f_score_dict came out as a dict of
# ## {'f1': 34, 'f2': 21, ...} - he was passing .values of everything into the
# ## classifier, so he remapped the keys back to the real variable names. I don't
# ## have to do this because my columns went in with their names.
# # f_score_dict = {trainVars[k[1:]] : v for k,v in f_score_dict.items()}
# feat_imp = pd.Series(f_score_dict).sort_values(ascending=True)
# feat_imp.plot(kind='barh', title='Feature Importances_'+hyppar)
# fig.tight_layout()
# fig.savefig("plots/%s_XGB_importance.png" % hyppar)

## save the trained classifier to a pickle file (plus a log of the variables used)
pklpath = "XGB_classifier_" + str(len(allVars)) + "Var"
if options.doXML:
    pickle.dump(cls, open(pklpath + ".pkl", 'wb'))
    with open(pklpath + "pkl.log", "w") as logFile:
        logFile.write(str(allVars) + "\n")
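
## ---------------------------------------------------------------------------
## Usage sketch (illustration only, not part of the original script).
## The command line below just restates the options defined above with their
## defaults; the reload example assumes the pickle written by the block above,
## whose exact name depends on len(allVars) ("<N>" is a placeholder), and a
## DataFrame `newData` with the same feature columns (colNames) used in training.
##
##   python htoaa_BDT2.py --ntrees 1000 --treeDeph 2 --mcw 1 --lr 0.05
##
##   import pickle
##   with open("XGB_classifier_<N>Var.pkl", "rb") as f:
##       cls = pickle.load(f)
##   scores = cls.predict_proba(newData[colNames])[:, 1]  # per-event signal probability
## ---------------------------------------------------------------------------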