-
Notifications
You must be signed in to change notification settings - Fork 49
/
test.py
429 lines (377 loc) · 14.3 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
import unittest
import time
import os
import shutil
import json
from collections import OrderedDict
import pandas as pd
import numpy as np
import torch
from dfencoder import EncoderDataFrame
from dfencoder import AutoEncoder, compute_embedding_size, CompleteLayer, NullIndicator
from dfencoder import BasicLogger, IpynbLogger, TensorboardXLogger
from dfencoder import StandardScaler, NullScaler, GaussRankScaler
class TimedCase(unittest.TestCase):
    """Base test case that prints how long each test took.

    ``setUp`` stores a start timestamp on ``self.startTime`` and ``tearDown``
    prints the test id followed by the elapsed time in seconds.
    """

    def setUp(self):
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed (or become negative) if the system clock changes mid-test.
        self.startTime = time.perf_counter()

    def tearDown(self):
        elapsed = time.perf_counter() - self.startTime
        print("%s: %.3f seconds" % (self.id(), elapsed))
class ModelBuilder(object):
    """Build one reference ``AutoEncoder`` and cache it for later calls."""

    def __init__(self):
        # Both stay None until build_model() has run successfully once.
        self.model = None
        self.out_df = None

    def build_model(self):
        """Return ``(encoder, prepared_df)``, building them on first use only.

        The first call builds an ``AutoEncoder`` against the module-level
        ``df``, prepares that frame, sanity-checks the result and stores both
        objects on the instance. Later calls return the stored pair.
        """
        if self.model is not None:
            return self.model, self.out_df

        settings = dict(
            encoder_layers=[32, 32],
            decoder_layers=[32, 32],
            encoder_dropout=.5,
            decoder_dropout=[.2, None],
            activation='tanh',
            swap_p=.2,
            batch_size=123,
            optimizer='sgd',
            lr_decay=.95,
        )
        autoencoder = AutoEncoder(**settings)
        autoencoder.build_model(df)
        prepared = autoencoder.prepare_df(df)

        # The prepared frame must not contain any missing values.
        assert not prepared.isna().any().any()

        # Number of items yielded by encoder.parameters() for this config.
        n_params = len(list(autoencoder.parameters()))
        assert n_params == 33

        self.model = autoencoder
        self.out_df = prepared
        return autoencoder, prepared
class TestCompleteLayer(TimedCase):
    """Tests for ``CompleteLayer`` construction, forward pass and activation lookup."""

    @staticmethod
    def _make_layer():
        """Build the layer shared by the tests and check its sub-layer count.

        Kept as a helper so that the ``test_*`` methods return ``None``;
        unittest deprecates non-None return values from test methods
        (Python 3.11+).
        """
        layer = CompleteLayer(12, 5, activation='sigmoid', dropout=.2)
        assert len(layer.layers) == 3
        return layer

    def test_init(self):
        """A 12 -> 5 layer with activation and dropout exposes three sub-layers."""
        self._make_layer()

    def test_forward(self):
        """A batch of 34 rows maps to shape (34, 5) and contains zeroed outputs."""
        layer = self._make_layer()
        x = torch.randn((34, 12))
        out = layer(x)
        assert out.shape == (34, 5)
        # At least one output is exactly zero (presumably from dropout --
        # TODO confirm against CompleteLayer's layer ordering).
        assert (out == 0).any().any()

    def test_interpret_activation(self):
        """The string 'leaky_relu' resolves to torch's functional leaky_relu."""
        # interpret_activation is called unbound, with None standing in for self.
        result = CompleteLayer.interpret_activation(None, 'leaky_relu')
        assert result == torch.nn.functional.leaky_relu
class AutoEncoderTest(TimedCase):
    """Tests for ``AutoEncoder`` feature setup, model building, training and inference.

    The tests read the module-level ``df``, ``df_cls`` and ``model`` objects,
    which this file creates in its ``__main__`` block. Tests that need extra
    or altered columns work on a private copy, so the shared frames are never
    modified and the outcome does not depend on test execution order.

    Shared steps live in ``_``-prefixed helpers so that the ``test_*`` methods
    return ``None``; unittest deprecates non-None return values from test
    methods (Python 3.11+).
    """

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _with_constant_timestamp(frame):
        """Return a copy of *frame* with a constant datetime ``mytime`` column."""
        frame = frame.copy()
        frame['mytime'] = 1539435837534561201
        frame['mytime'] = pd.to_datetime(frame['mytime'])
        return frame

    @staticmethod
    def _init_cats(frame):
        """Run ``init_cats`` on *frame* and check seven categorical features."""
        encoder = AutoEncoder()
        encoder.init_cats(frame)
        assert len(encoder.categorical_fts) == 7
        return encoder

    @staticmethod
    def _build_model():
        """Build a fully configured encoder on ``df``; return it with the prepared frame."""
        encoder = AutoEncoder(
            encoder_layers=[32, 32],
            decoder_layers=[32, 32],
            encoder_dropout=.5,
            decoder_dropout=[.2, None],
            activation='tanh',
            swap_p=.2,
            batch_size=123,
            optimizer='sgd',
            lr_decay=.95
        )
        encoder.build_model(df)
        out_df = encoder.prepare_df(df)
        assert not out_df.isna().any().any()
        # Number of items yielded by encoder.parameters() for this config.
        assert len(list(encoder.parameters())) == 33
        return encoder, out_df

    def _encode_input(self):
        """Build the model and encode a 32-row sample; return ``(encoder, sample)``."""
        encoder, out_df = self._build_model()
        sample = out_df.sample(32)
        encoder.encode_input(sample)
        return encoder, sample

    def _forward(self):
        """Run a forward pass on a 32-row sample and check the output shapes."""
        encoder, sample = self._encode_input()
        num, bin_, cat, _cls = encoder.forward(sample)
        # The numeric output is wider when a cyclical 'mytime' feature exists.
        if 'mytime' in encoder.cyclical_fts:
            assert num.shape == (32, 15)
        else:
            assert num.shape == (32, 6)
        assert bin_.shape == (32, 2)
        assert len(cat) == 7
        return encoder, num, bin_, cat, sample

    @staticmethod
    def _fit_and_predict(encoder, sample):
        """Fit twice on a 511-row *sample*, checking scalers, lr decay and scores.

        Returns the frame produced by ``encoder.df_predict(sample)``.
        """
        encoder.fit(sample, epochs=2)
        # 'age' was explicitly configured as 'standard'; 'fnlwgt' is expected
        # to end up with a GaussRankScaler.
        assert isinstance(encoder.numeric_fts['age']['scaler'], StandardScaler)
        assert isinstance(encoder.numeric_fts['fnlwgt']['scaler'], GaussRankScaler)
        # The learning rate must have dropped below its initial value.
        assert encoder.lr_decay.get_lr()[0] < .01
        anomaly_score = encoder.get_anomaly_score(sample)
        assert anomaly_score.shape == (511,)
        # Fitting a second time on an already-fitted encoder must also work.
        encoder.fit(sample, epochs=2)
        return encoder.df_predict(sample)

    @staticmethod
    def _assert_consistent_features(record, js):
        """Check that JSON-string, dict and DataFrame inputs give equal features."""
        model._deserialize_json(js)
        z_json = model.get_deep_stack_features_json(js)
        z_dict = model.get_deep_stack_features_json(json.loads(js))
        z = model.get_deep_stack_features(record)
        assert (z_json == z).all()
        assert (z_json == z_dict).all()

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    def test_compute_embedding_size(self):
        result = compute_embedding_size(5)
        assert result == 4

    def test_init(self):
        """An encoder can be constructed with default arguments."""
        AutoEncoder()

    def test_init_numeric(self):
        encoder = AutoEncoder()
        encoder.init_numeric(df)
        assert len(encoder.numeric_fts) == 6
        assert len(encoder.numeric_fts) == len(encoder.num_names)

    def test_init_cats(self):
        self._init_cats(df)

    def test_init_binaries(self):
        """An added boolean column is picked up as a third binary feature."""
        # Work on a copy so the extra column never leaks into the shared df,
        # even when an assertion below fails.
        frame = df.copy()
        frame['mybin'] = np.random.randint(2, size=len(frame)).astype(bool)
        encoder = self._init_cats(frame)
        encoder.init_binary(frame)
        assert len(encoder.binary_fts) == 3
        assert len(encoder.binary_fts) == len(encoder.bin_names)

    def test_init_cyclical(self):
        """A datetime column containing a missing value becomes a cyclical feature."""
        frame = self._with_constant_timestamp(df)
        frame.loc[10, 'mytime'] = np.nan
        encoder = AutoEncoder()
        encoder.init_cyclical(frame)
        assert list(encoder.cyclical_fts.keys()) == ['mytime']

    def test_init_features(self):
        encoder = AutoEncoder()
        encoder.init_features(df)
        assert len(encoder.binary_fts) == 2
        assert len(encoder.categorical_fts) == 7
        assert len(encoder.numeric_fts) == 6

    def test_build_inputs(self):
        """``build_inputs`` runs with a single hand-written categorical feature."""
        encoder = AutoEncoder()
        encoder.categorical_fts = {
            'ft1': {
                'cats': ['test1', 'test2', 'test3', 'test4']
            }
        }
        encoder.build_inputs()

    def test_build_vanilla_model(self):
        encoder = AutoEncoder()
        encoder.build_model(df)

    def test_build_model(self):
        self._build_model()

    def test_encode_input(self):
        self._encode_input()

    def test_forward(self):
        self._forward()

    def test_compute_loss(self):
        encoder, num, bin_, cat, sample = self._forward()
        # Unpacking doubles as a check that exactly five losses are returned.
        mse_loss, bce_loss, cce_loss, cls_loss, net_loss = encoder.compute_loss(num, bin_, cat, sample)

    def test_fit(self):
        """Fit on data with a partly missing datetime column and predict it back."""
        encoder = AutoEncoder(
            verbose=False,
            optimizer='sgd',
            lr=.01,
            lr_decay=.95,
            progress_bar=False,
            scaler={'age': 'standard'},
        )
        frame = self._with_constant_timestamp(df)
        # Blank out roughly 10% of the timestamps.
        frame['mytime'] = pd.to_datetime(
            np.where(np.random.random(frame.shape[0]) > .9, None, frame['mytime'])
        )
        sample = frame.sample(511)
        data = self._fit_and_predict(encoder, sample)
        assert (data.columns == sample.columns).all()
        assert data.shape == sample.shape

    def test_fit_with_label(self):
        """Fit with ``label_col='salary'``; predictions omit the label column."""
        encoder = AutoEncoder(
            verbose=False,
            optimizer='sgd',
            lr=.01,
            lr_decay=.95,
            progress_bar=False,
            scaler={'age': 'standard'},
            label_col='salary'
        )
        # Copy so the relabelled 'salary' column does not alter the shared df_cls.
        frame = df_cls.copy()
        # Drop about half of the labels, then map the two classes to 0 / 1.
        frame['salary'] = np.where(np.random.rand(len(frame)) > 0.5, np.nan, frame['salary'])
        frame['salary'] = np.where(frame['salary'] == "<50k", 0, frame['salary'])
        frame['salary'] = np.where(frame['salary'] == ">=50k", 1, frame['salary'])
        sample = frame.sample(511)
        data = self._fit_and_predict(encoder, sample)
        expected_cols = set(list(sample.columns))
        expected_cols.remove('salary')
        got_cols = set(list(data.columns))
        assert (expected_cols == got_cols)

    def test_inference(self):
        record = df.sample()
        js = record.iloc[0].to_json()
        self._assert_consistent_features(record, js)

    def test_inference_missing_data(self):
        """Inference still agrees across input formats when 'salary' is missing."""
        record = df.sample()
        record['salary'] = [None]
        js = record.iloc[0].to_json()
        # NOTE(review): this dump is never read back or removed, and js is
        # already a JSON string so the file holds a doubly-encoded value.
        # Looks like a leftover debugging aid -- confirm before removing.
        with open('./_.json', 'w') as f:
            f.write(json.dumps(js))
        self._assert_consistent_features(record, js)

    def test_get_representation(self):
        encoder = AutoEncoder()
        sample = df.sample(1025)
        z = encoder.get_representation(sample)
        assert z.shape[0] == 1025
        assert z.shape[1] > 1
        assert isinstance(z, torch.Tensor)

    def test_get_deep_stack_features(self):
        encoder = AutoEncoder(
            encoder_layers=[50, 100, 150],
            decoder_layers=[44, 67]
        )
        sample = df.sample(1025)
        z = encoder.get_deep_stack_features(sample)
        assert z.shape[0] == 1025
        # 411 == 50 + 100 + 150 + 44 + 67, the sum of the layer widths above.
        assert z.shape[1] == 411
        assert isinstance(z, torch.Tensor)

    def test_compute_baseline_performance(self):
        """The baseline can be computed from a swapped input and its original."""
        encoder = AutoEncoder()
        encoder.init_features(df)
        sample = df.sample(1000)
        in_ = EncoderDataFrame(sample).swap()
        out_ = sample
        in_ = encoder.prepare_df(in_)
        out_ = encoder.prepare_df(out_)
        encoder.compute_baseline_performance(in_, out_)
class EncoderDataFrameTest(TimedCase):
    """Tests for ``EncoderDataFrame`` construction and its ``swap`` method."""

    def test_init(self):
        """An empty frame accepts a numeric and a string column."""
        frame = EncoderDataFrame()
        frame['test1'] = [0, 2, 3]
        frame['test2'] = ['a', 'b', 'c']

    def test_swap(self):
        """``swap`` changes some, but not all, values in every column."""
        # 'mytime' may be present on the shared frame; leave it out here.
        kept = [name for name in df.columns if name != 'mytime']
        original = EncoderDataFrame(df[kept])
        swapped = original.swap()

        unchanged = swapped == original
        # Every column keeps at least one value ...
        assert unchanged.any().all()
        # ... every column has at least one changed value ...
        assert (swapped != original).any().all()
        # ... and the two frames are not identical overall.
        assert not unchanged.all().all()
class LoggerTest(TimedCase):
    """Smoke tests for ``BasicLogger``, ``IpynbLogger`` and ``TensorboardXLogger``."""

    @staticmethod
    def _feed_steps(logger):
        """Send two training, two validation and two id-validation steps."""
        for step in (logger.training_step, logger.val_step, logger.id_val_step):
            # Fresh lists on every call, exactly as if written out inline.
            step([0.2, 0.3, 0.2])
            step([0.1, 0.1, -0.2])

    def test_basic_logger(self):
        self.run_logging_test(BasicLogger(fts=['ft1', 'ft2', 'ft3']))

    def run_logging_test(self, logger):
        """Log one full epoch on *logger* and check what it recorded.

        Expects a logger tracking three features; after the epoch ends the
        last recorded 'ft3' entry is 0 for each of the three step kinds and
        the epoch counter has advanced by one.
        """
        epochs_before = logger.n_epochs
        assert len(logger.train_fts) == 3
        assert len(logger.val_fts) == 3

        self._feed_steps(logger)
        logger.end_epoch()

        # The two 'ft3' values fed per step kind are 0.2 and -0.2.
        assert logger.train_fts['ft3'][1][-1] == 0
        assert logger.val_fts['ft3'][1][-1] == 0
        assert logger.id_val_fts['ft3'][1][-1] == 0
        assert logger.n_epochs == epochs_before + 1

    def test_ipynb_logger(self):
        logger = IpynbLogger(fts=['ft1', 'ft2', 'ft3'], baseline_loss=0.2)
        self.run_logging_test(logger)
        # Feed a second round of steps; end_epoch() is deliberately not
        # called again here.
        self._feed_steps(logger)

    def test_tensorboardx_logger(self):
        logger = TensorboardXLogger(logdir='_testlog/', fts=['ft1', 'ft2', 'ft3'])
        cats = OrderedDict()
        for name in ('cat1', 'cat2'):
            cats[name] = {
                'cats': ['cow', 'horse', 'pig', 'cat'],
                'embedding': torch.nn.Embedding(5, 4),
                'output_layer': None
            }
        # Ten consecutive epochs on the same logger instance.
        for _ in range(10):
            self.run_logging_test(logger)
        logger.show_embeddings(cats)
class ScalerTest(TimedCase):
    """Tests for ``StandardScaler``, ``NullScaler`` and ``GaussRankScaler``."""

    def test_standard_scaler(self):
        """Scaled output has mean close to 0 and standard deviation close to 1."""
        scaler = StandardScaler()
        # Normal noise stretched by 3 and shifted by -3.
        raw = np.random.randn(100) * 3 - 3
        scaled = scaler.fit_transform(raw)
        assert np.abs(scaled.mean()) < 0.01
        assert 0.99 < scaled.std() < 1.01

    def test_null_scaler(self):
        """``NullScaler`` hands the values back unchanged."""
        scaler = NullScaler()
        raw = np.random.randn(100) * 3 - 3
        scaled = scaler.fit_transform(raw)
        assert (scaled == raw).all()

    def test_gauss_rank_scaler(self):
        """With 10000 samples the output is close to zero mean and unit variance."""
        scaler = GaussRankScaler()
        raw = np.random.randn(10000) * 3 - 3
        scaled = scaler.fit_transform(raw)
        assert np.abs(scaled.mean()) < 0.01
        assert 0.99 < scaled.std() < 1.01
class NullIndicatorTest(TimedCase):
    """Tests for ``NullIndicator`` fitting and transformation."""

    def test_null_indicator(self):
        """Only columns with missing values get a ``_was_nan`` indicator by default."""
        frame = pd.DataFrame(columns=['has null', 'has no null'])
        frame['has null'] = np.random.randn(100)
        frame.loc[4, 'has null'] = np.nan
        frame['has no null'] = np.random.randn(100)

        indicator = NullIndicator()
        indicator.fit(frame)
        assert 'has null' in indicator.fts
        assert 'has no null' not in indicator.fts

        transformed = indicator.transform(frame)
        assert 'has null_was_nan' in transformed.columns
        assert transformed.loc[4, 'has null_was_nan'] == True

        # A required feature gets an indicator column even without any nulls.
        forced = NullIndicator(required_fts=['has no null'])
        forced.fit(frame)
        transformed = forced.transform(frame)
        assert len(transformed.columns) == 4
        assert not transformed['has no null_was_nan'].any()

        # Single records given as dicts: row 4 holds the NaN, row 3 does not.
        row_with_nan = indicator.transform_dict(frame.iloc[4].to_dict())
        assert row_with_nan['has null_was_nan']
        row_without_nan = indicator.transform_dict(frame.iloc[3].to_dict())
        assert not row_without_nan['has null_was_nan']

    def test_ghost_cols(self):
        """Columns absent from the fitted frame must not show up in ``fts``."""
        # Originally written to chase an odd issue that disappeared after
        # restarting the notebook kernel; kept as a regression guard.
        frame = pd.DataFrame(columns=['has null', 'has no null', 'some col i dont want'])
        frame['has null'] = np.random.randn(100)
        frame.loc[4, 'has null'] = np.nan
        frame['has no null'] = np.random.randn(100)
        frame['some col i dont want'] = np.random.randn(100)
        frame.loc[4, 'some col i dont want'] = np.nan

        wanted = ['has null', 'has no null']
        indicator = NullIndicator()
        indicator.fit(frame[wanted])
        assert 'some col i dont want' not in indicator.fts
if __name__ == '__main__':
    # Scratch directory used as the TensorboardXLogger logdir. exist_ok lets a
    # run succeed even if a previous, aborted run left the directory behind.
    os.makedirs('_testlog', exist_ok=True)
    try:
        # Module-level fixtures read by the test classes above.
        df = pd.read_csv('adult.csv')
        df_cls = pd.read_csv('adult.csv')
        b = ModelBuilder()
        model, _ = b.build_model()
        # exit=False so that the clean-up below still runs after the tests.
        program = unittest.main(exit=False)
    finally:
        # Remove the scratch directory even if loading or building failed.
        shutil.rmtree('_testlog', ignore_errors=True)
    # Report failures through the exit status so callers and CI can see them.
    raise SystemExit(0 if program.result.wasSuccessful() else 1)