get_shap_explainations.py
# Set up
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import shap
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
import time
from utilities import get_mean_shap
from sklearn.manifold import TSNE
from copy import deepcopy
"""
get_shap_explainations main
- generates SHAP explainations for optimal models selected by the model selection process
- Save data within the pipeline dictionary and as csv within the run directory
"""
def main(pipeline, N_bins, refined=True):
    print(f'\n###########################\n\n RUNNING SHAP EXPLANATIONS, Refined features: {refined}')
    start_time = time.time()

    # Config
    cell = pipeline['Cell']
    model_name = pipeline['Model_Selection']['Best_Model']['Model_Name']
    trained_model = deepcopy(pipeline['Model_Selection']['Best_Model']['Model'])
    X = pipeline['Data_preprocessing']['X']
    y = pipeline['Data_preprocessing']['y']
    shap_save_path = pipeline['Saving']['SHAP']

    # Check whether feature reduction was conducted; if so, the refined parameters
    # can be used, otherwise default to the original (unrefined) parameters.
    if pipeline['STEPS_COMPLETED']['Feature_Reduction'] and refined:
        input_params = pipeline['Feature_Reduction']['Refined_Params']
        input_param_type = 'refined'
    else:
        input_params = pipeline['Data_preprocessing']['Input_Params']
        refined = False
        input_param_type = 'original'
    ### TRAINING DATA
    X = X[input_params].copy()
    trained_model.fit(X, np.ravel(y))

    # Initialize the SHAP explainer appropriate for the model type
    if model_name == 'XGB':
        explainer = shap.Explainer(trained_model.predict, X)  # XGB
        tree = False
    elif model_name in ['LGBM', 'RF', 'DT']:  # other tree-based models
        explainer = shap.TreeExplainer(trained_model)
        tree = True
    else:
        explainer = shap.Explainer(trained_model)
        tree = False

    # Get SHAP values
    shap_values = explainer(X)

    # Get best feature values based on average SHAP values
    best_feature_values, mean_shap = get_mean_shap(c=cell,
                                                   input_params=input_params,
                                                   shap_values=shap_values,
                                                   N_bins=N_bins)
    # Interaction values are only available with TreeExplainer
    if tree:
        # Get SHAP interaction values
        shap_interaction_values = explainer.shap_interaction_values(X)
    else:
        shap_interaction_values = None

    # Embed results using t-SNE
    projections = TSNE(n_components=2, perplexity=50, random_state=0).fit_transform(shap_values.values)

    ###### SAVING RESULTS ######
    new_save = f'{shap_save_path}{input_param_type}/'
    if not os.path.exists(new_save):
        os.makedirs(new_save, 0o666)

    # Save SHAP values
    with open(new_save + f"{model_name}_SHAP_values.pkl", 'wb') as file:
        pickle.dump(shap_values, file)
    shap_values.values.tofile(new_save + f"{model_name}_SHAP_values.csv", sep=',')
    shap_values.data.tofile(new_save + f"{model_name}_SHAP_data.csv", sep=',')

    # Save SHAP interaction values
    with open(new_save + f"/{model_name}_SHAP_inter_values.pkl", 'wb') as file:
        pickle.dump(shap_interaction_values, file)
    # Save average SHAP of the binned features as csv
    with open(new_save + f"/{model_name}_{cell}_mean_shap.csv", 'w', encoding='utf-8-sig') as f:
        mean_shap.to_csv(f, index=False)

    # Save the best feature values (based on average SHAP) as csv
    with open(new_save + f"/{model_name}_{cell}_best_feature_values.csv", 'w', encoding='utf-8-sig') as f:
        best_feature_values.to_csv(f, index=False)
    # Update pipeline: initialize placeholder entries under pipeline['SHAP'][input_param_type]
    outer_key = input_param_type
    inner_key_list = ['X', 'y', 'Input_Params', 'Explainer', 'SHAP_Values', 'SHAP_Interaction_Values',
                      'Best_Feature_Values', 'Norm_Best_Feature_Values', 'N_bins', 'Mean_SHAP_Values',
                      'TSNE_Embedding']
    for new_inner_key in inner_key_list:
        if outer_key in pipeline['SHAP']:
            pipeline['SHAP'][outer_key][new_inner_key] = None
        else:
            pipeline['SHAP'][outer_key] = {new_inner_key: None}
    pipeline['SHAP'][input_param_type]['X'] = X
    pipeline['SHAP'][input_param_type]['y'] = pipeline['Data_preprocessing']['y']
    pipeline['SHAP'][input_param_type]['Input_Params'] = input_params
    pipeline['SHAP'][input_param_type]['Explainer'] = explainer
    pipeline['SHAP'][input_param_type]['SHAP_Values'] = shap_values
    pipeline['SHAP'][input_param_type]['SHAP_Interaction_Values'] = shap_interaction_values
    pipeline['SHAP'][input_param_type]['Best_Feature_Values'] = best_feature_values
    pipeline['SHAP'][input_param_type]['Norm_Best_Feature_Values'] = best_feature_values
    pipeline['SHAP'][input_param_type]['N_bins'] = N_bins
    pipeline['SHAP'][input_param_type]['Mean_SHAP_Values'] = mean_shap
    pipeline['SHAP'][input_param_type]['TSNE_Embedding'] = projections
    pipeline['STEPS_COMPLETED']['SHAP'] = True

    print("\n\n--- %s minutes for SHAP explanation ---" % ((time.time() - start_time) / 60))

    return pipeline, shap_values, shap_interaction_values, best_feature_values, mean_shap
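
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original pipeline): main() requires
# a fully constructed pipeline dictionary and a bin count, so it is normally
# invoked from the pipeline driver rather than standalone. The pickle path, the
# _example_usage name, and the default n_bins below are assumptions for
# illustration only.
# ---------------------------------------------------------------------------
def _example_usage(pipeline_path='runs/example_run/pipeline.pkl', n_bins=10):
    """Load a previously saved pipeline dictionary and generate SHAP explanations for it."""
    with open(pipeline_path, 'rb') as f:
        pipeline = pickle.load(f)
    # main() stores its results under pipeline['SHAP'], so make sure the key exists.
    pipeline.setdefault('SHAP', {})
    pipeline, shap_values, shap_inter_values, best_vals, mean_shap = main(pipeline,
                                                                          N_bins=n_bins,
                                                                          refined=True)
    return pipeline
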
if __name__ == "__main__":
    main()