-
Notifications
You must be signed in to change notification settings - Fork 5
/
testgen.py
executable file
·138 lines (125 loc) · 5.38 KB
/
testgen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python3
"""
This script generates test definitions for P3 Test Driver.
Usage: ./testgen.py | p3_test_driver -t - -c p3_test_driver.config.yaml
"""
import itertools
import json
import sys
def add_test():
    """Build one test definition from the current sweep globals and append it to ``test_list``.

    The function takes no arguments. It reads these module-level names, which
    the sweep loops at the bottom of the file assign before each call:
    ``cached``, ``num_copies_cached``, ``num_copies_uncached``,
    ``image_resize_factor``, ``storage_type``, ``batch_group_size``,
    ``batch_size``, ``data_dir_template_count``,
    ``datasets_prefetch_buffer_size``, ``datasets_num_private_threads``,
    ``fp16``, ``model``, ``mpi``, ``noop``, ``np``, ``npernode``,
    ``num_batches``, ``num_hosts``, ``num_intra_threads``,
    ``num_inter_threads``, ``use_tf_layers`` and ``test_list``.

    Raises:
        ValueError: if the combination of ``num_copies`` and
            ``image_resize_factor`` has no known data directory suffix.
    """
    # Cached and uncached runs read datasets with different replication counts.
    num_copies = num_copies_cached if cached else num_copies_uncached

    # Map (num_copies, image_resize_factor) to the dataset directory suffix.
    if num_copies == 1 and image_resize_factor == 1.0:
        data_dir_suffix = ''
    elif num_copies > 1 and image_resize_factor == 1.0:
        data_dir_suffix = '-%dx' % num_copies
    elif num_copies < 1 and image_resize_factor == 1.0:
        data_dir_suffix = '-%0.1fx' % num_copies
    elif num_copies == 1 and image_resize_factor == 3.0:
        # NOTE(review): unlike the other suffixes this one has no leading '-',
        # so the directory becomes 'tfrecords1729' -- confirm that this matches
        # the dataset name on disk.
        data_dir_suffix = '1729'
    else:
        raise ValueError(
            'Unsupported combination: num_copies=%r, image_resize_factor=%r'
            % (num_copies, image_resize_factor))

    # No %-formatting is applied here, so the doubled '%%' stays in the string
    # as-is. Presumably a later formatting pass by the consumer (the same one
    # that would fill '%(isilon_host)s' below) reduces it to '%d' -- confirm.
    data_dir_template = (
        '/mnt/' + storage_type
        + '%%d/data/imagenet-scratch/tfrecords' + data_dir_suffix
    )

    # Flush flags are only set for uncached runs; the Isilon flush additionally
    # requires the Isilon storage type.
    flush_compute = not cached
    flush_isilon = storage_type == 'isilon' and not cached

    # Commands whose output is recorded under the given key.
    pre_commands = [
        dict(key='tensorflow_benchmark_git_hash',
             command_template='cd ../tensorflow-benchmarks && git rev-parse --short HEAD'),
        dict(key='NVIDIA_TENSORFLOW_VERSION',
             command_template='docker exec tf /bin/bash -c "echo \\$NVIDIA_TENSORFLOW_VERSION"'),
        dict(key='TENSORFLOW_VERSION',
             command_template='docker exec tf /bin/bash -c "echo \\$TENSORFLOW_VERSION"'),
    ]

    # Boolean settings are rendered with '%d', i.e. as '0' or '1'.
    # 'noop' is only forwarded to run_benchmark.py; it is not stored as a
    # field of the test definition below.
    command_template = [
        'docker',
        'exec',
        '-e', 'PYTHONUNBUFFERED=1',
        'tf',
        './run_benchmark.py',
        '--batch_group_size', '%d' % batch_group_size,
        '--batch_size', '%d' % batch_size,
        '--data_dir_template', data_dir_template,
        '--data_dir_template_count', '%d' % data_dir_template_count,
        '--datasets_prefetch_buffer_size', '%d' % datasets_prefetch_buffer_size,
        '--datasets_num_private_threads', '%d' % datasets_num_private_threads,
        '--flush_compute', '%d' % flush_compute,
        '--flush_isilon', '%d' % flush_isilon,
        '--fp16', '%d' % fp16,
        '--isilon_host', '%(isilon_host)s',
        '--model', model,
        '--mpi', '%d' % mpi,
        '--noop', '%d' % noop,
        '--np', '%d' % np,
        '--npernode', '%d' % npernode,
        '--num_batches', '%d' % num_batches,
        '--num_hosts', '%d' % num_hosts,
        '--num_intra_threads', '%d' % num_intra_threads,
        '--num_inter_threads', '%d' % num_inter_threads,
        '--use_tf_layers', '%d' % use_tf_layers,
    ]

    t = dict(
        test='simple',
        record_as_test='tensorflow_cnn_benchmark',
        max_test_attempts=1,
        pre_commands=pre_commands,
        command_template=command_template,
        batch_group_size=batch_group_size,
        batch_size=batch_size,
        cached=cached,
        data_dir_suffix=data_dir_suffix,
        data_dir_template=data_dir_template,
        data_dir_template_count=data_dir_template_count,
        datasets_prefetch_buffer_size=datasets_prefetch_buffer_size,
        datasets_num_private_threads=datasets_num_private_threads,
        flush_compute=flush_compute,
        flush_isilon=flush_isilon,
        fp16=fp16,
        image_resize_factor=image_resize_factor,
        # Same value as 'flush_isilon', stored under a second key.
        isilon_flush=flush_isilon,
        model=model,
        mpi=mpi,
        np=np,
        npernode=npernode,
        num_batches=num_batches,
        num_copies=num_copies,
        num_hosts=num_hosts,
        num_intra_threads=num_intra_threads,
        num_inter_threads=num_inter_threads,
        nvlink=False,
        storage_type=storage_type,
        use_tf_layers=use_tf_layers,
    )

    # Adjacent string literals are joined at compile time, so the command is a
    # single string with no embedded newlines or indentation whitespace.
    t['metrics_group:nvidia'] = dict(
        agents=dict(
            nvidia_smi=dict(
                start_cmd=(
                    'nvidia-smi --query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,'
                    'pcie.link.gen.current,temperature.gpu,utilization.gpu,utilization.memory,'
                    'memory.total,memory.free,memory.used --format=csv -l 5'
                ),
                stop_cmd='pkill nvidia-smi',
            )
        )
    )
    test_list.append(t)
# Accumulates every generated test definition; add_test() appends to it.
test_list = []

# Settings that are held constant for the whole sweep. add_test() reads them
# as module-level globals.
num_copies_cached = 0.1
num_copies_uncached = 7
image_resize_factor = 1.0
fp16 = True
use_tf_layers = True
noop = False

# Full test suite.
# The sweep variables below are deliberately module-level names, because
# add_test() picks them up as globals. Dimensions whose values depend on an
# outer variable stay as explicit loops; the independent ones go through
# itertools.product, which iterates in the same order as equivalent nested
# for-loops (rightmost dimension varies fastest).
for repeat in range(3):
    for storage_type in ['isilon']:  # 'isilon','filestore'
        for cached in [True] if storage_type == 'local' else [False, True]:
            for data_dir_template_count in [1 if cached or storage_type == 'filestore' else 4]:
                for (
                    model,
                    batch_group_size,
                    batch_size,
                    datasets_prefetch_buffer_size,
                    datasets_num_private_threads,
                    num_batches,
                    num_hosts,
                    npernode,
                    num_intra_threads,
                    num_inter_threads,
                    mpi,
                ) in itertools.product(
                    ['resnet50', 'vgg16', 'resnet152', 'inception3', 'inception4'],  # model
                    [10],                       # batch_group_size
                    [64],                       # batch_size
                    [40],                       # datasets_prefetch_buffer_size
                    [4],                        # datasets_num_private_threads
                    [500],                      # num_batches
                    [30, 24, 16, 8, 4, 2, 1],   # num_hosts
                    [4],                        # npernode
                    [1],                        # num_intra_threads
                    [40],                       # num_inter_threads
                    [True],                     # mpi
                ):
                    # Total process count passed as '--np'.
                    np = num_hosts * npernode
                    add_test()

# The JSON test list goes to stdout (so it can be piped, see the module
# docstring); the summary count goes to stderr to keep stdout valid JSON.
print(json.dumps(test_list, sort_keys=True, indent=4, ensure_ascii=False))
print('Number of tests generated: %d' % len(test_list), file=sys.stderr)