forked from jisungk/RIDDLE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline.py
executable file
·206 lines (166 loc) · 7.2 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""
pipeline.py
A centralized pipeline for data acquisition, processing, model training, and
model testing using RIDDLE.
Requires: NumPy, scikit-learn, RIDDLE (and their dependencies)
Author: Ji-Sung Kim, Rzhetsky Lab
Copyright: 2016, all rights reserved
"""
from __future__ import print_function
import sys; sys.dont_write_bytecode = True
import os
import time
import pickle
DATA_DIR = '_data'
SEED = 109971161161043253 % 8085
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# ---------------------------- HELPER FUNCTIONS ------------------------------ #
def eprint(*args, **kwargs):
    """Print to standard error; accepts the same arguments as print().

    (The description formerly sat in a no-op module-level string before the
    def; moved here so help()/IDEs can see it as the function's docstring.)
    """
    print(*args, file=sys.stderr, **kwargs)
def pickle_object(obj, fn):
    """Pickle an object to a file.

    Args:
        obj: object to serialize
        fn: destination filename
    """
    # Pickle output is binary data: the file must be opened in binary mode
    # ('wb'). The original 'w' (text mode) raises TypeError on Python 3 and
    # risks newline corruption on Python 2 under Windows.
    with open(fn, 'wb') as f:
        pickle.dump(obj, f)
def evaluate(y_test, y_test_proba, nb_classes, path):
    """Report detailed model performance.

    Prints a confusion matrix, a sklearn classification report, and
    per-class ROC AUC values; saves ROC curve plots to `path`.

    Args:
        y_test: list of true targets
        y_test_proba: probability vectors, one per sample
        nb_classes: number of classes
        path: string path where output plots should be saved
    """
    # Imported here so numpy can be seeded before run_pipeline() is called.
    from riddle import roc

    predictions = [np.argmax(proba) for proba in y_test_proba]

    print('Confusion matrix:')
    print(confusion_matrix(y_test, predictions))
    print()

    print('Classification report:')
    print(classification_report(y_test, predictions, digits=3))

    print('ROC AUC values:')
    roc_auc, fpr, tpr = roc.compute_roc(
        y_test, y_test_proba, nb_classes=nb_classes)
    roc.save_plots(roc_auc, fpr, tpr, nb_classes=nb_classes, path=path)
    for label, auc_value in roc_auc.items():
        print(' {}: {:.5f}'.format(label, auc_value))
    print()
# ---------------------------- PUBLIC FUNCTIONS ------------------------------ #
'''
* Run a full deep learning pipeline (acquire data, process, train, test,
evaluate).
* Expects:
- model_module = module which contains functions to:
+ initalize a compiled Keras Sequential model
(model_module.create_base_model)
+ process feature data to appropriate form (model_module.process_X_data)
+ process class data to appropriate form (model_module.process_y_data)
- best_model_param = dictionary of best parameters for model_module
- data_partition_dict = dictionary of the train, validation, and test data
which should have these keys (with the appropriate value):
+ X_train
+ y_train
+ X_val
+ y_val
+ X_test
+ y_test
- nb_features = number of features
- nb_classes = number of classes
- interpret_model = boolean whether to compute feature importance scores
- out_directory = string path where output should be saved
- max_nb_epoch = maximum number of epochs
* Return
- tuple of metrics (loss, accuracy, runtime)
- tuple of sums of differences and sums of deepLIFT contrib scores;
these are used for t-tests in feature interpretation
(sums_D, sums_D2, sums_contrib)
- pairs of compared classes (pairs)
'''
def run_pipeline(model_module, best_model_param, data_partition_dict,
        nb_features, nb_classes, interpret_model, out_directory, max_nb_epoch=100):
    """Run the full deep-learning pipeline: extract data partitions, train,
    test, evaluate, optionally compute deepLIFT feature-importance sums,
    and save the trained model under out_directory.

    See the module-level comment block above for the expected structure of
    model_module, data_partition_dict, and the return value.

    Returns:
        ((loss, acc, runtime), (sums_D, sums_D2, sums_contribs, pairs));
        the second tuple is all-None when interpret_model is False.
    """
    # here so np can be seeded before keras imports
    # NOTE(review): uniform/randint appear unused in this function —
    # possibly leftover from hyperparameter search; confirm before removing.
    from scipy.stats import uniform, randint
    from keras import backend as K
    from riddle import models, feature_importance

    # ----------------------------- EXTRACT DATA ----------------------------- #
    X_train = data_partition_dict['X_train']
    y_train = data_partition_dict['y_train']
    X_val = data_partition_dict['X_val']
    y_val = data_partition_dict['y_val']
    X_test = data_partition_dict['X_test']
    y_test = data_partition_dict['y_test']

    print('{} train samples / {} val samples / {} test samples'
        .format(len(X_train), len(X_val), len(X_test)))
    print('{} features / {} classes'.format(nb_features, nb_classes))
    print()

    # -------------------------------- SETUP --------------------------------- #
    # Model-module hooks that convert raw X/y into the form the model expects.
    process_X_data_func = model_module.process_X_data
    process_y_data_func = model_module.process_y_data
    process_X_data_func_args = {'nb_features': nb_features}
    process_y_data_func_args = {'nb_classes': nb_classes}

    model = model_module.create_base_model(nb_features=nb_features,
        nb_classes=nb_classes, **best_model_param)

    # ----------------------------- TRAIN MODEL ------------------------------ #
    start = time.time()

    # Shorten training drastically for debug/dummy runs, which are detected
    # by convention from the output directory name.
    if 'debug' in out_directory or 'dummy' in out_directory: max_nb_epoch = 3

    model = models.train(model, X_train, y_train, X_val, y_val,
        process_X_data_func, process_y_data_func, nb_features=nb_features,
        nb_classes=nb_classes, process_X_data_func_args=process_X_data_func_args,
        process_y_data_func_args=process_y_data_func_args,
        max_nb_epoch=max_nb_epoch)

    # -------------------------- TEST/EVALUATE MODEL ------------------------- #
    (loss, acc), y_test_proba = models.test(model, X_test,
        y_test, process_X_data_func, process_y_data_func, nb_features=nb_features,
        nb_classes=nb_classes, process_X_data_func_args=process_X_data_func_args,
        process_y_data_func_args=process_y_data_func_args)

    # Reported runtime covers both training and testing.
    runtime = time.time() - start
    print('Completed training and testing in {:.4f} seconds'.format(runtime))
    print('-' * 72)
    print()

    # save results
    test_results_path = out_directory + '/test_results.txt'
    models.save_test_results(y_test_proba, y_test, path=test_results_path)

    # evaluate model performance
    roc_graph_path = out_directory + '/roc.png'
    evaluate(y_test, y_test_proba, nb_classes=nb_classes, path=roc_graph_path)

    # ------------------------ FEATURE IMPORTANCE PREP ----------------------- #
    start = time.time()
    if interpret_model:
        # deepLIFT contribution-score sums, pickled for later t-tests in
        # feature interpretation.
        sums_D, sums_D2, sums_contribs, pairs = \
            feature_importance.get_diff_sums(model, X_test, process_X_data_func,
                nb_features=nb_features, nb_classes=nb_classes,
                process_X_data_func_args=process_X_data_func_args)

        pickle_object(sums_D, out_directory + '/sums_D.pkl')
        pickle_object(sums_D2, out_directory + '/sums_D2.pkl')
        pickle_object(sums_contribs, out_directory + '/sums_contribs.pkl')

        print('Computed deepLIFT scores and pre-analysis in {:.4f} seconds'
            .format(time.time() - start))
        print('-' * 72)
        print()
    else:
        sums_D, sums_D2, sums_contribs, pairs = None, None, None, None

    # ------------------------------ SAVE MODEL ------------------------------ #
    model_path = out_directory + '/model.h5'
    models.save_model(model, path=model_path)

    # Release TensorFlow/Keras resources held by this session.
    K.clear_session()

    return (loss, acc, runtime), (sums_D, sums_D2, sums_contribs, pairs)
def main(argv=None):
    """Script entry point (not implemented yet).

    Args:
        argv: command-line argument list (e.g. sys.argv); accepted because
            the __main__ guard passes sys.argv — the original zero-arg
            signature made that call raise TypeError. Currently unused.
    """
    # seed here to avoid re-seeding when run_pipeline() is called by other files
    np.random.seed(SEED)  # for reproducibility, must be before Keras imports!
    pass  # TODO write this
# If run as a script, execute main().
if __name__ == '__main__':
    # Call with no arguments: the original passed sys.argv to a zero-arg
    # main(), which raised TypeError. The redundant local `import sys` is
    # also dropped — sys is already imported at the top of the module.
    main()