Add benchmark implementations for the cnn ms example
NLGithubWP authored Apr 27, 2024
1 parent d6f52ff commit 2c2fbc6
Showing 1 changed file with 121 additions and 0 deletions.

examples/cnn_ms/benchmark.py
@@ -0,0 +1,121 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# The code is adapted from
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

from singa import opt
from singa import device
from singa import tensor

import argparse
import time
import numpy as np
from tqdm import trange


def train_resnet(DIST=True, graph=True, sequential=False, verbosity=0):

    # Define the hyperparameters for the ResNet-50 training benchmark
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    IMG_SIZE = 224

    # For distributed training, sequential has better throughput in the current version
    if DIST:
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        local_rank = sgd.local_rank
        global_rank = sgd.global_rank
        sequential = True
    else:
        local_rank = 0
        world_size = 1
        global_rank = 0
        sequential = False

    dev = device.create_cuda_gpu_on(local_rank)

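    # Benchmark on synthetic data: one random batch of 3x224x224 images with
    # random labels in [0, 1000). Generating the data once, outside the timed
    # loop, keeps disk I/O and preprocessing out of the measurement.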
    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

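    # Device-side profiling controls: verbosity sets how much timing detail the
    # device logs, and SetSkipIteration(5) leaves the first five iterations
    # (graph construction and warm-up) out of the profiled timings.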
    dev.SetVerbosity(verbosity)
    dev.SetSkipIteration(5)

    # Construct the model
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)

    model.train()
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
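    # With use_graph=True, compile() traces one iteration on the sample input
    # and buffers the operations into a computational graph; later calls replay
    # the graph instead of re-scheduling every operation.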

    # Train model
    dev.Sync()
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            model(tx, ty, dist_option='fp32', spars=None)

    dev.Sync()
    end = time.time()
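    # Each iteration processes batch_size images on every one of the world_size
    # replicas, so the aggregate throughput scales the sample count accordingly.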
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)
    if global_rank == 0:
        print("\nThroughput = {} images/second".format(throughput), flush=True)
        print("Total time = {} seconds".format(end - start), flush=True)
        print("Time per iteration = {} seconds".format(titer), flush=True)
        dev.PrintTimeProfiling()


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description='Throughput test using ResNet-50')
    parser.add_argument('--dist',
                        '--enable-dist',
                        default=False,
                        action='store_true',
                        help='enable distributed training',
                        dest='DIST')
    parser.add_argument('--no-graph',
                        '--disable-graph',
                        default=True,
                        action='store_false',
                        help='disable graph execution (run eagerly)',
                        dest='graph')
    parser.add_argument('--verbosity',
                        '--log-verbosity',
                        default=0,
                        type=int,
                        help='logging verbosity',
                        dest='verbosity')

    args = parser.parse_args()

    train_resnet(DIST=args.DIST,
                 graph=args.graph,
                 sequential=False,
                 verbosity=args.verbosity)
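
# Example invocations (the launcher below is an illustrative assumption, not
# part of this commit; SINGA's DistOpt examples are typically started via MPI):
#
#   python benchmark.py                       # single GPU, graph execution
#   python benchmark.py --no-graph            # single GPU, eager execution
#   mpiexec -np 2 python benchmark.py --dist  # two processes, one GPU each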
