forked from usuyama/ePillID-benchmark
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathazureml_submit_runs.py
118 lines (94 loc) · 4.06 KB
/
azureml_submit_runs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import sys
import argparse
from azureml.core.workspace import Workspace
from azureml.core import Experiment, Datastore, RunConfiguration
from azureml.train.estimator import Estimator
parser = argparse.ArgumentParser()
parser.add_argument('experiment', help='Azure ML experiment name')
parser.add_argument('--workspace-config', default="azureml_config.json", help='Download from the Azure ML portal')
parser.add_argument('--compute', default="nc6v3", help='Azure ML training cluster')
parser.add_argument('--max_epochs', type=int, default=300)
args = parser.parse_args()
print(args)
# load workspace configuration from the config.json file
ws = Workspace.from_config(path=args.workspace_config)
print('=' * 40)
print(ws)
# create an experiment
exp = Experiment(workspace=ws, name=args.experiment)
print('=' * 40)
print(exp)
# specify a cluster
compute_target = ws.compute_targets[args.compute]
print('=' * 40)
print(compute_target)
# Mount the blob to the training container
# NOTE: (prerequisite) unzip and upload the ePillID data to the blob
data_ds = Datastore.get(ws, datastore_name='data')
rc = RunConfiguration()
rc.environment.docker.enabled = True
# Using an image from https://hub.docker.com/r/naotous/pytorch-image
# TODO: clean up the Dockerfile
rc.environment.docker.base_image = "naotous/pytorch-image:py36torch041-legacy"
rc.environment.docker.gpu_support = True
# don't let the system build a new conda environment
rc.environment.python.user_managed_dependencies = True
# point to an existing python environment in the Docker image
rc.environment.python.interpreter_path = '/app/miniconda/envs/py36/bin/python'
# flag for the user input
SUBMIT_ALL = False
def submit(script_params):
global SUBMIT_ALL
input_data = None
# the files in source_directory will be uploaded to the cluster
est = Estimator(source_directory='src',
script_params=script_params,
compute_target=compute_target,
entry_script='train_cv.py',
environment_definition=rc.environment)
print('script_params', script_params)
while not SUBMIT_ALL and True:
input_data = input("Submit? [Y/y/n/s]")
if input_data in ['Y', 'y', 'n', 's']:
break
if input_data == 'Y':
SUBMIT_ALL = True
if SUBMIT_ALL or input_data == 'y':
run = exp.submit(est)
print('=' * 40)
print(run)
print('Monitor the training progress on the portal: URL=', run.get_portal_url())
elif input_data == 'n':
print("aborting!")
sys.exit(0)
else:
print("skip!")
# define the entry script
base_script_params = {
'--data_root_dir': data_ds.path('ePillID_data').as_mount().as_mount(),
'--max_epochs': args.max_epochs
}
loss_params = [
{'--contrastive_w': 0.0, '--triplet_w': 0.0, '--arcface_w': 0.0, '--ce_w': 1.0, '--focal_w': 0.0, '--dropout': 0.5}, # plain classification (logits)
{'--contrastive_w': 1.0, '--triplet_w': 1.0, '--arcface_w': 0.1, '--ce_w': 1.0, '--focal_w': 0.0}, # multihead metric learning
]
networks = [
#{'--appearance_network': 'resnet18'},
# {'--appearance_network': 'resnet34'},
#{'--appearance_network': 'resnet50', '--train_with_side_labels': '0', '--metric_simul_sidepairs_eval': '0'},
#{'--appearance_network': 'resnet50', '--train_with_side_labels': '0', '--metric_simul_sidepairs_eval': '1'},
#{'--appearance_network': 'resnet50', '--train_with_side_labels': '1', '--metric_simul_sidepairs_eval': '0'},
#{'--appearance_network': 'resnet50', '--train_with_side_labels': '1', '--metric_simul_sidepairs_eval': '1'},
{'--appearance_network': 'resnet50'},
{'--appearance_network': 'resnet50', '--pooling': 'CBP'},
{'--appearance_network': 'resnet50', '--pooling': 'BCNN'},
# {'--appearance_network': 'resnet101'},
# {'--appearance_network': 'resnet152'},
# {'--appearance_network': 'densenet121'},
# {'--appearance_network': 'densenet161'},
# {'--appearance_network': 'densenet201'},
]
for l in loss_params:
for n in networks:
submit({**l, **n, **base_script_params})