#!/usr/bin/env python
import argparse
import os
import re
import subprocess
import time
from datetime import datetime

import numpy as np
import psutil
import tensorflow as tf
import horovod.tensorflow as hvd
from PIL import Image, ImageFile
from matplotlib import pyplot as plt
from tensorflow.python.client import timeline

import Model.prepare as prepare
import Model.modified_MCNN as model

tf.logging.set_verbosity(tf.logging.INFO)
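
# NOTE: a typical launch for this script (a sketch, assuming 4 GPUs on one node
# and the `horovodrun` launcher; older Horovod installs use `mpirun` instead,
# and the paths below are placeholders):
#
#   horovodrun -np 4 python distributed_train_horovod.py \
#       --image_path /path/to/images --gt_path /path/to/density_maps \
#       --batch_size_per_GPU 8 --number_of_epoch 3000
#
# Horovod starts one copy of this script per process; each process pins itself
# to a single GPU via hvd.local_rank() inside do_training().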

# Kill all child processes of the given parent process, then the parent itself.
# Used to stop the (optional) GPU profiling subprocess launched in __main__.
def kill(proc_pid):
    process = psutil.Process(proc_pid)
    for proc in process.children(recursive=True):
        proc.kill()
    process.kill()

def core_model(input_image):
    # Build the modified MCNN and return its predicted density map.
    mcnn_model = model.MCNN(input_image)
    predicted_density_map = mcnn_model.final_layer_output
    return predicted_density_map

def do_training(args):
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.log_device_placement = False
    config.allow_soft_placement = True
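    # The visible_device_list above is the standard Horovod placement pattern:
    # each process sees exactly one GPU, selected by its local rank, so N
    # processes on a node drive N distinct GPUs without contending for devices.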

    train_set_image, train_set_gt, test_set_image, test_set_gt = prepare.get_train_test_DataSet(
        args["image_path"], args["gt_path"], args["dataset_train_test_ratio"])
    TRAINSET_LENGTH = len(train_set_image)
    print("Trainset Length : ", len(train_set_image), len(train_set_gt))

    images_input_train = tf.constant(train_set_image)
    images_gt_train = tf.constant(train_set_gt)
    dataset_train = tf.data.Dataset.from_tensor_slices((images_input_train, images_gt_train))

    # At the time of writing, TensorFlow does not support mixing user-defined
    # Python functions with TensorFlow ops inside a single map stage, so one
    # py_func cannot do both the NumPy loading and the TensorFlow preprocessing.
    # The pipeline is therefore split in two (see the note after this pipeline).

    # Train set
    Batched_dataset_train = dataset_train.map(
        lambda img, gt: tf.py_func(prepare.read_npy_file, [img, gt], [img.dtype, tf.float32]))
    Batched_dataset_train = Batched_dataset_train \
        .shuffle(buffer_size=500) \
        .map(prepare._parse_function, num_parallel_calls=args["num_parallel_threads"]) \
        .apply(tf.contrib.data.batch_and_drop_remainder(args["batch_size_per_GPU"])) \
        .prefetch(buffer_size=args["prefetch_buffer"]) \
        .repeat()
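    # A rough sketch of what the two stages above are assumed to do (the actual
    # implementations live in Model/prepare.py, which is not shown here):
    #   read_npy_file(img, gt)  -> (img, np.load(gt).astype(np.float32)),
    #       a plain Python/NumPy function, hence wrapped in tf.py_func;
    #   _parse_function(...)    -> TensorFlow-op preprocessing, hence run in an
    #       ordinary map() where num_parallel_calls can parallelize it.
    # batch_and_drop_remainder keeps every batch exactly batch_size_per_GPU
    # long, which the tf.split below relies on.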

    iterator = Batched_dataset_train.make_one_shot_iterator()
    mini_batch = iterator.get_next()
    image_names = mini_batch[0]

    # With num_gpus set to 1 (the Horovod case: one GPU per process), no
    # splitting is done and each process trains on its whole mini-batch.
    split_batches_imgs = tf.split(mini_batch[1], int(args["num_gpus"]))
    split_batches_gt = tf.split(mini_batch[2], int(args["num_gpus"]))

    predicted_density_map = core_model(split_batches_imgs[0])

    # Training loss: per-image L2 distance between the ground-truth and
    # predicted density maps, averaged over the batch.
    cost = tf.reduce_mean(
        tf.sqrt(
            tf.reduce_sum(
                tf.square(
                    tf.subtract(split_batches_gt[0], predicted_density_map)), axis=[1, 2, 3], keepdims=True)))

    sum_of_gt = tf.reduce_sum(split_batches_gt[0], axis=[1, 2, 3], keepdims=True)
    sum_of_predicted_density_map = tf.reduce_sum(predicted_density_map, axis=[1, 2, 3], keepdims=True)
    # mse = tf.sqrt(tf.reduce_mean(tf.square(sum_of_gt - sum_of_predicted_density_map)))
    # Changed to the mean absolute error over predicted vs. ground-truth counts.
    mae = tf.reduce_mean(
        tf.reduce_sum(tf.abs(tf.subtract(sum_of_gt, sum_of_predicted_density_map)), axis=[1, 2, 3], keepdims=True),
        name="mae")
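    # In symbols, for a batch of B images with ground truth D_i and prediction P_i:
    #   cost = (1/B) * sum_i ||D_i - P_i||_2         (training loss)
    #   mae  = (1/B) * sum_i |sum(D_i) - sum(P_i)|   (count error: the integral
    #          of a density map is the object count it encodes)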

    # Add summaries to the graph.
    # (A small threshold was once added to mae to keep NaNs out of the summary histogram.)
    # tf.summary.scalar("Mean Squared Error", mse)
    tf.summary.scalar("Mean_Absolute_Error", mae)

    # Retain the scalar summaries plus a histogram for every trainable variable.
    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
    for var in tf.trainable_variables():
        summaries.append(tf.summary.histogram(var.op.name, var))
    summary_op = tf.summary.merge(summaries)

    global_step = tf.train.get_or_create_global_step()

    # Horovod: the usual recipe also scales the learning rate by the number of
    # workers, but that is disabled here.
    # optimizer = tf.train.AdamOptimizer(args["learning_rate"] * hvd.size())
    optimizer = tf.train.AdamOptimizer(args["learning_rate"])

    # Alternatives from earlier versions of this script:
    # train_op = optimizer.minimize(cost, global_step=global_step)
    # opt = tf.train.SyncReplicasOptimizer(optimizer, replicas_to_aggregate=len(worker_hosts),
    #                                      total_num_replicas=len(worker_hosts))

    # Wrap the optimizer in Horovod's DistributedOptimizer; minimize() or
    # compute_gradients() + apply_gradients() are then called as usual.
    opt = hvd.DistributedOptimizer(optimizer)
    training_op = opt.minimize(cost, global_step=global_step)
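    # DistributedOptimizer wraps compute_gradients(): after each backward pass
    # it averages the gradients across all Horovod ranks with allreduce, then
    # the inner Adam optimizer applies the averaged update. Every rank steps
    # with identical gradients, which is why the effective batch size below is
    # batch_size_per_GPU * hvd.size().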

    # The StopAtStepHook below stops training after end_point global steps:
    # number_of_epoch passes over TRAINSET_LENGTH images, divided by the
    # effective (all-rank) batch size.
    effective_batch_size = int(args["batch_size_per_GPU"]) * hvd.size()
    end_point = int((TRAINSET_LENGTH * int(args["number_of_epoch"])) / effective_batch_size)
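    # Worked example (hypothetical numbers): 1400 training images, 3000 epochs,
    # batch_size_per_GPU = 8 on 4 ranks -> effective batch 32, and
    # end_point = 1400 * 3000 / 32 = 131250 global steps.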
print("My Rank: ", hvd.rank())
print("My local Rank: ", hvd.local_rank())
print("End Point : ",end_point)

    # Hooks left over from earlier experiments:
    # sync_replicas_hook = opt.make_session_run_hook(is_chief)
    # profile_hook = tf.train.ProfilerHook(save_steps=1000, output_dir='/home/rashid/DistributedCNN/Model/timeline/')

    hooks = [hvd.BroadcastGlobalVariablesHook(0),
             tf.train.StopAtStepHook(last_step=end_point)]
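    # BroadcastGlobalVariablesHook(0) copies rank 0's initial variable values to
    # all other ranks before training starts, so every worker begins from the
    # same weights (essential when only rank 0 restores from a checkpoint).
    # StopAtStepHook ends the session once global_step reaches end_point.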
    # Only rank 0 writes checkpoints, so workers don't clobber each other's files.
    checkpoint_dir = args["checkpoint_path"] if hvd.rank() == 0 else None

    # MonitoredTrainingSession takes care of session initialization, restoring
    # from a checkpoint, saving checkpoints and summaries, and closing when done
    # or when an error occurs.
    arr_examples_per_sec = []
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks, config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run one training step and time it; mon_sess.run handles
            # AbortedError in case of preemption. Note that the fetched
            # "loss_value" is the mae count metric, not the cost op.
            start_time = time.time()
            _, loss_value, step = mon_sess.run((training_op, mae, global_step))
            duration = time.time() - start_time

            examples_per_sec = (int(args["batch_size_per_GPU"]) * hvd.size()) / duration
            arr_examples_per_sec.append(examples_per_sec)
            format_str = '%s: step %d, loss = %.2f , examples/sec = %.1f '
            print(format_str % (datetime.now(), step, loss_value, examples_per_sec))

    # with open("exm_per_sec.txt", "w") as file_object:
    #     for i in range(0, len(arr_examples_per_sec)):
    #         file_object.write(str(arr_examples_per_sec[i]) + "\n")
    print("--- Experiment Finished ---")


if __name__ == "__main__":
    # The following default values are used when not provided on the command line.
    DEFAULT_NUMBER_OF_GPUS = 1
    DEFAULT_EPOCH = 3000
    DEFAULT_NUMBER_OF_WORKERS = 1
    DEFAULT_NUMBER_OF_PS = 1
    DEFAULT_BATCHSIZE_PER_GPU = 8
    DEFAULT_BATCHSIZE = DEFAULT_BATCHSIZE_PER_GPU * DEFAULT_NUMBER_OF_GPUS * DEFAULT_NUMBER_OF_WORKERS
    DEFAULT_PARALLEL_THREADS = 4
    # DEFAULT_PREFETCH_BUFFER_SIZE = DEFAULT_BATCHSIZE * DEFAULT_NUMBER_OF_GPUS * 2
    DEFAULT_PREFETCH_BUFFER_SIZE = 32
    DEFAULT_IMAGE_PATH = "/home/mrc689/Sampled_Dataset"
    DEFAULT_GT_PATH = "/home/mrc689/Sampled_Dataset_GT/density_map"
    DEFAULT_LOG_PATH = "/home/mrc689/tf_logs"
    DEFAULT_RATIO_TRAINTEST_DATASET = 0.7
    DEFAULT_LEARNING_RATE = 0.00001
    DEFAULT_CHECKPOINT_PATH = "/home/mrc689/tf_ckpt"
    DEFAULT_LOG_FREQUENCY = 10
    # DEFAULT_MAXSTEPS = (DEFAULT_TRAINSET_LENGTH * DEFAULT_EPOCH) / DEFAULT_BATCHSIZE
    # Create the arguments to parse. Numeric arguments get an explicit type so
    # that values passed on the command line don't arrive as strings.
    ap = argparse.ArgumentParser(description="Script to train the FlowerCounter model on multiple GPUs with Horovod.")
    ap.add_argument("-g", "--num_gpus", required=False, type=int, help="How many GPUs to use.", default=DEFAULT_NUMBER_OF_GPUS)
    ap.add_argument("-e", "--number_of_epoch", required=False, type=int, help="Number of epochs.", default=DEFAULT_EPOCH)
    ap.add_argument("-b", "--batch_size", required=False, type=int, help="Number of images to process in a minibatch.", default=DEFAULT_BATCHSIZE)
    ap.add_argument("-gb", "--batch_size_per_GPU", required=False, type=int, help="Number of images to process in a batch per GPU.", default=DEFAULT_BATCHSIZE_PER_GPU)
    ap.add_argument("-i", "--image_path", required=False, help="Input path of the images.", default=DEFAULT_IMAGE_PATH)
    ap.add_argument("-gt", "--gt_path", required=False, help="Ground truth path of the input images.", default=DEFAULT_GT_PATH)
    ap.add_argument("-num_threads", "--num_parallel_threads", required=False, type=int, help="Number of threads to use in parallel for preprocessing elements in the input pipeline.", default=DEFAULT_PARALLEL_THREADS)
    ap.add_argument("-l", "--log_path", required=False, help="Path to save the TensorFlow log files.", default=DEFAULT_LOG_PATH)
    ap.add_argument("-r", "--dataset_train_test_ratio", required=False, type=float, help="Dataset ratio for the train/test split.", default=DEFAULT_RATIO_TRAINTEST_DATASET)
    ap.add_argument("-pbuff", "--prefetch_buffer", required=False, type=int, help="Internal buffer size for prefetching elements from the input dataset ahead of when they are requested.", default=DEFAULT_PREFETCH_BUFFER_SIZE)
    ap.add_argument("-lr", "--learning_rate", required=False, type=float, help="Learning rate.", default=DEFAULT_LEARNING_RATE)
    ap.add_argument("-ckpt_path", "--checkpoint_path", required=False, help="Path to save the TensorFlow model checkpoint files.", default=DEFAULT_CHECKPOINT_PATH)
    ap.add_argument("-nw", "--number_of_workers", required=False, type=int, help="Number of workers.", default=DEFAULT_NUMBER_OF_WORKERS)
    ap.add_argument("-lg_freq", "--log_frequency", required=False, type=int, help="Log frequency.", default=DEFAULT_LOG_FREQUENCY)
    args = vars(ap.parse_args())

    # Start the application.
    start_time = time.time()

    # Initialize Horovod.
    hvd.init()
    tf.reset_default_graph()

    # Optionally start the GPU profiling script as a child process.
    # proc = subprocess.Popen(['./gpu_profile'])
    # print("start GPU profiling process with pid %s" % proc.pid)

    do_training(args)

    duration = time.time() - start_time
    # kill(proc.pid)
    print("Duration : ", duration)