-
Notifications
You must be signed in to change notification settings - Fork 13
/
util.py
219 lines (210 loc) · 10.5 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import numpy as np
from scipy.sparse import csr_matrix
import gc
import sys
from sklearn.metrics import precision_recall_curve, auc
import random
def pr_auc_score(y_true, y_score):
    """Return two trapezoidal areas computed from the precision-recall curve.

    Args:
        y_true: ground-truth labels, passed straight to
            ``sklearn.metrics.precision_recall_curve``.
        y_score: predicted scores, passed straight to
            ``sklearn.metrics.precision_recall_curve``.

    Returns:
        A 2-tuple ``(area_pr, area_rp)`` where ``area_pr`` integrates
        precision over recall and ``area_rp`` integrates recall over
        precision (the same curve with the axes swapped).
    """
    precision, recall, _ = precision_recall_curve(y_true, y_score)

    def _sorted_area(x, y):
        # The original code relied on ``auc(..., reorder=True)``.  That keyword
        # was deprecated and later removed from scikit-learn, so the ordering
        # it performed (sort by x, ties broken by y) is done explicitly here.
        # This keeps the numeric result identical on old versions and avoids a
        # TypeError on new ones.
        order = np.lexsort((y, x))
        return auc(x[order], y[order])

    return _sorted_area(recall, precision), _sorted_area(precision, recall)
def get_consistent_filename(args, version=3):
    """Build the canonical run name for a set of hyper-parameters.

    Args:
        args: object exposing the hyper-parameters as attributes.  ``model``
            and ``suffix`` are used as-is (they must be strings); every other
            field is converted with ``str``.
        version: naming scheme to use.
            1 -> model, epochs, dropout, learning rate, batch size
            2 -> version 1 plus ``pos_weight``
            3 -> version 2 plus ``batch_decay`` (default)
            4 -> abbreviated tags, plus ``kw``, ``w1``/``w2``/``w3`` and
                 ``hidden``

    Returns:
        The underscore-separated name ending in ``args.suffix``, or None when
        ``version`` is not one of the schemes above.
    """
    if version == 1 or version == 2 or version == 3:
        # Versions 1-3 share a common prefix and only differ in the optional
        # trailing fields.
        pieces = [
            "model", args.model,
            "epochs", str(args.epochs),
            "dropout", str(args.dropout),
            "rate", str(args.learning_rate),
            "batch", str(args.batch_size),
        ]
        if version == 2 or version == 3:
            pieces.extend(["weight", str(args.pos_weight)])
        if version == 3:
            pieces.extend(["batchdecay", str(args.batch_decay)])
        pieces.append(args.suffix)
        return "_".join(pieces)

    if version == 4:
        pieces = [
            "m", args.model,
            "e", str(args.epochs),
            "d", str(args.dropout),
            "r", str(args.learning_rate),
            "b", str(args.batch_size),
            "w", str(args.pos_weight),
            "bd", str(args.batch_decay),
            "kw", str(args.kw),
            "size", str(args.w1), str(args.w2), str(args.w3),
            "hid", str(args.hidden),
            args.suffix,
        ]
        return "_".join(pieces)

    # Unknown scheme: same implicit result as before.
    return None
class dotdict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    The attribute hooks are bound directly to the corresponding ``dict``
    methods, so ``d.key`` behaves like ``d.get("key")``, ``d.key = v`` like
    ``d["key"] = v`` and ``del d.key`` like ``del d["key"]``.

    From http://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
    """
    # __getattr__ is only consulted when normal attribute lookup fails, so real
    # dict methods (items, keys, ...) still resolve as methods.  Because this is
    # dict.get, a missing key yields None instead of raising AttributeError.
    __getattr__ = dict.get
    # Attribute assignment stores the value as a dictionary item.
    __setattr__ = dict.__setitem__
    # Attribute deletion removes the item; a missing key raises KeyError
    # (not AttributeError).
    __delattr__ = dict.__delitem__
def training_minibatcher(batch_size=4, file_extension="_batched_nomultimap.npz",
                         base_path="/n/rush_lab/data/chromatin-features/chromatin-nn/",
                         debug=False, small_dataset=False, start=0):
    """Yield training minibatches, one chromosome file at a time.

    The data for each chromosome is read from
    ``base_path + "hg19/chr" + <name> + file_extension`` (for example
    ``.../hg19/chr1_batched_nomultimap.npz`` with the defaults), which must be
    an ``.npz`` archive holding ``train_inputs`` and ``train_outputs`` arrays
    with the same length along the first axis.  Chromosomes are visited in a
    random order (``random.shuffle``).

    Args:
        batch_size: number of rows (first-axis entries) per yielded batch; the
            last batch of a file may be shorter.
        file_extension: suffix appended to ``chr<name>`` to form the file name.
        base_path: directory prefix; it is concatenated as a plain string, so
            it must end with a path separator.
        debug: when true, print progress messages to stdout.
        small_dataset: when true, only the X and Y chromosomes are used.
        start: number of leading batches to skip.  Note that the skip is
            applied to every chromosome file, not just the first one visited.

    Yields:
        ``(inputs, outputs)`` slices of the loaded arrays.  The yielded rows
        are overwritten with 0 as soon as the generator is resumed, so a
        consumer that needs a batch beyond the current iteration must copy it.
    """
    # Build a real list: on Python 3 ``map`` returns an iterator, which can
    # neither be concatenated with a list nor shuffled in place.
    chromosomes = [str(number) for number in range(1, 23)] + ["X", "Y"]
    # If only a few are wanted, just look at the X and Y chromosomes
    if small_dataset:
        chromosomes = ["Y", "X"]
    random.shuffle(chromosomes)

    for chromosome in chromosomes:
        path = base_path + "hg19/chr" + chromosome + file_extension
        if debug:
            print("Loading the file")
            sys.stdout.flush()
        # The context manager closes the archive's file handle as soon as both
        # arrays have been materialised in memory.
        with np.load(path) as archive:
            if debug:
                print("Loading the inputs")
                sys.stdout.flush()
            train_inputs = archive["train_inputs"]
            if debug:
                print("Number of train_inputs", train_inputs.shape[0])
                print("Loading the outputs")
                sys.stdout.flush()
            train_outputs = archive["train_outputs"]
        assert(train_inputs.shape[0] == train_outputs.shape[0])
        num_rows = train_inputs.shape[0]

        if debug:
            print("Running garbage collection")
            sys.stdout.flush()
        gc.collect()
        if debug:
            print("Yielding results")
            sys.stdout.flush()

        # ``range`` works on both Python 2 and 3 (``xrange`` is Python 2 only).
        for i in range(start * batch_size, num_rows, batch_size):
            yield train_inputs[i:i + batch_size], train_outputs[i:i + batch_size]
            # Blank out the rows that were just consumed (presumably to let
            # per-row data be released early -- TODO confirm the array dtype).
            train_inputs[i:i + batch_size] = 0
            train_outputs[i:i + batch_size] = 0

        del train_inputs
        del train_outputs
        gc.collect()
def test_minibatcher(batch_size=4,file_extension="_batched_nomultimap.npz", base_path="/n/rush_lab/data/chromatin-features/chromatin-nn/", debug=False, small_dataset=False):
chromosomes = map(str, range(1, 23)) + ["X", "Y"]
# If only a few are wanted, just look at the X and Y chromosomes
# if small_dataset:
# chromosomes = ["Y", "X"]
for chromosome in chromosomes:
file = base_path + "hg19/chr" + chromosome + file_extension
if debug:
print("Loading the file")
sys.stdout.flush()
file_loaded = np.load(file)
if debug:
print("Loading the inputs")
sys.stdout.flush()
test_inputs = file_loaded["test_inputs"]
if debug:
print("Loading the outputs")
sys.stdout.flush()
test_outputs = file_loaded["test_outputs"]
assert(test_inputs.shape[0] == test_outputs.shape[0])
num_batches = test_inputs.shape[0]
# If I only want a few, only show 5 percent of the full number of batches.
if small_dataset:
num_batches = int(num_batches*.05)
del file_loaded
if debug:
print("Running garbage collection")
sys.stdout.flush()
gc.collect()
if debug:
print("Yielding results")
sys.stdout.flush()
for i in xrange(0, num_batches, batch_size):
yield test_inputs[i:i+batch_size], test_outputs[i:i+batch_size]
test_inputs[i:i+batch_size] = 0
test_outputs[i:i+batch_size] = 0
del test_inputs
del test_outputs
gc.collect()
def valid_minibatcher(batch_size=4,file_extension="_batched_nomultimap.npz", base_path="/n/rush_lab/data/chromatin-features/chromatin-nn/", debug=False, small_dataset=False):
chromosomes = map(str, range(1, 23)) + ["X", "Y"]
# If only a few are wanted, just look at the X and Y chromosomes
for chromosome in chromosomes:
file = base_path + "hg19/chr" + chromosome + file_extension
if debug:
print("Loading the file")
sys.stdout.flush()
file_loaded = np.load(file)
if debug:
print("Loading the inputs")
sys.stdout.flush()
valid_inputs = file_loaded["valid_inputs"]
if debug:
print("Loading the outputs")
sys.stdout.flush()
valid_outputs = file_loaded["valid_outputs"]
assert(valid_inputs.shape[0] == valid_outputs.shape[0])
num_batches = valid_inputs.shape[0]
# If I only want a few, only show 10 percent of the full number of batches.
if small_dataset:
num_batches = int(num_batches*.10)
del file_loaded
if debug:
print("Running garbage collection")
sys.stdout.flush()
gc.collect()
if debug:
print("Yielding results")
sys.stdout.flush()
for i in xrange(0, num_batches, batch_size):
yield valid_inputs[i:i+batch_size], valid_outputs[i:i+batch_size]
valid_inputs[i:i+batch_size] = 0
valid_outputs[i:i+batch_size] = 0
del valid_inputs
del valid_outputs
gc.collect()
# def get_model(model, x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate):
# if model == "dilated":
# return dilated_convolution_model(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "dilated_normed":
# return dilated_convolution_model_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate)
# elif model == "dilated_pooling":
# return dilated_convolution_with_pooling(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "dilated_pooling_normed":
# return dilated_convolution_with_pooling_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate)
# elif model == "dilated_dil_pooling":
# return dilated_convolution_with_dilated_pooling(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "dilated_dil_pooling_normed":
# return dilated_convolution_with_dilated_pooling_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate)
# elif model == "conv1":
# return convolution_1_layer(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "conv3":
# return convolution_3_layer(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "conv3_resizing":
# return convolution_3_layer_resizing(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "conv3_resizing_normed":
# return convolution_3_layer_resizing_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate)
# elif model == "conv7_resizing":
# return convolution_7_layer_resizing(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "conv7_resizing_normed":
# return convolution_7_layer_resizing_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate)
# elif model == "conv_bi_lstm":
# return conv_bi_lstm(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "id_cnn":
# return ID_CNN_model(x, y_, dropout_keep_prob, batch_size, noutputs)
# elif model == "id_cnn_normed":
# return ID_CNN_model(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate, use_batchnorm=True)
# else:
# print("Invalid model: " + model)
# sys.exit()
# return None, None, None, None, None
# def get_model(model, x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate, kw, nkernels, hidden):
# if model == "dilated":
# return dilated_convolution_model(x, y_, dropout_keep_prob, batch_size, noutputs, kw, nkernels, hidden)
# elif model == "dilated_normed":
# return dilated_convolution_model_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate, kw, nkernels, hidden)
# elif model == "dilated_pooling":
# return dilated_convolution_with_pooling(x, y_, dropout_keep_prob, batch_size, noutputs, kw, nkernels, hidden)
# elif model == "dilated_pooling_normed":
# return dilated_convolution_with_pooling_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate, kw, nkernels, hidden)
# elif model == "dilated_dil_pooling":
# return dilated_convolution_with_dilated_pooling(x, y_, dropout_keep_prob, batch_size, noutputs, kw, nkernels, hidden)
# elif model == "dilated_dil_pooling_normed":
# return dilated_convolution_with_dilated_pooling_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate, kw, nkernels, hidden)
# elif model == "conv1":
# return convolution_1_layer(x, y_, dropout_keep_prob, batch_size, noutputs, kw, nkernels, hidden)
# elif model == "conv3":
# return convolution_3_layer(x, y_, dropout_keep_prob, batch_size, noutputs, kw, nkernels, hidden)
# elif model == "conv3_resizing":
# return convolution_3_layer_resizing(x, y_, dropout_keep_prob, batch_size, noutputs, kw, nkernels, hidden)
# elif model == "conv3_resizing_normed":
# return convolution_3_layer_resizing_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate, kw, nkernels, hidden)
# elif model == "conv7_resizing":
# return convolution_7_layer_resizing(x, y_, dropout_keep_prob, batch_size, noutputs, kw, nkernels, hidden)
# elif model == "conv7_resizing_normed":
# return convolution_7_layer_resizing_batchnorm(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate, kw, nkernels, hidden)
# elif model == "conv_bi_lstm":
# return conv_bi_lstm(x, y_, dropout_keep_prob, batch_size, noutputs, kw, nkernels, hidden)
# elif model == "id_cnn":
# return ID_CNN_model(x, y_, dropout_keep_prob, batch_size, noutputs, None, None, False, kw, nkernels, hidden)
# elif model == "id_cnn_normed":
# return ID_CNN_model(x, y_, dropout_keep_prob, batch_size, noutputs, is_training, decay_rate, True, kw, nkernels, hidden)
# else:
# print("Invalid model: " + model)
# sys.exit()
# return None, None, None, None, None