diff --git a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py new file mode 100644 index 000000000000..3dbb36404d07 --- /dev/null +++ b/benchmark/python/gluon/benchmark_gluon.py @@ -0,0 +1,164 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import mxnet.gluon.model_zoo.vision as models +import time +import logging +import argparse +import subprocess +import os +import errno + +logging.basicConfig(level=logging.INFO) +parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN performance benchmark') + +parser.add_argument('--model', type=str, default='all', + choices=['all', 'alexnet', 'densenet121', 'densenet161', + 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', + 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', + 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', + 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', + 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', + 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', + 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', + 'vgg19', 'vgg19_bn']) +parser.add_argument('--batch-size', type=int, default=0, + help='Batch size to use for benchmarking. Example: 32, 64, 128.' + 'By default, runs benchmark for batch sizes - 1, 32, 64, 128, 256') +parser.add_argument('--num-batches', type=int, default=10) +parser.add_argument('--gpus', type=str, default='', + help='GPU IDs to use for this benchmark task. Example: --gpus=0,1,2,3 to use 4 GPUs.' + 'By default, use CPU only.') +parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference']) + +opt = parser.parse_args() + +num_batches = opt.num_batches +dry_run = 10 # use 10 iterations to warm up +batch_inf = [1, 32, 64, 128, 256] +batch_train = [1, 32, 64, 128, 256] +image_shapes = [(3, 224, 224), (3, 299, 299)] + +def score(network, batch_size, ctx): + assert (batch_size >= len(ctx)), "ERROR: batch size should not be smaller than num of GPUs." + net = models.get_model(network) + if 'inceptionv3' == network: + data_shape = [('data', (batch_size,) + image_shapes[1])] + else: + data_shape = [('data', (batch_size,) + image_shapes[0])] + + data = mx.sym.var('data') + out = net(data) + softmax = mx.sym.SoftmaxOutput(out, name='softmax') + mod = mx.mod.Module(softmax, context=ctx) + mod.bind(for_training = False, + inputs_need_grad = False, + data_shapes = data_shape) + mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx[0]) for _, shape in mod.data_shapes] + batch = mx.io.DataBatch(data, []) + for i in range(dry_run + num_batches): + if i == dry_run: + tic = time.time() + mod.forward(batch, is_train=False) + for output in mod.get_outputs(): + output.wait_to_read() + fwd = time.time() - tic + return fwd + + +def train(network, batch_size, ctx): + assert (batch_size >= len(ctx)), "ERROR: batch size should not be smaller than num of GPUs." + net = models.get_model(network) + if 'inceptionv3' == network: + data_shape = [('data', (batch_size,) + image_shapes[1])] + else: + data_shape = [('data', (batch_size,) + image_shapes[0])] + + data = mx.sym.var('data') + out = net(data) + softmax = mx.sym.SoftmaxOutput(out, name='softmax') + mod = mx.mod.Module(softmax, context=ctx) + mod.bind(for_training = True, + inputs_need_grad = False, + data_shapes = data_shape) + mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) + if len(ctx) > 1: + mod.init_optimizer(kvstore='device', optimizer='sgd') + else: + mod.init_optimizer(kvstore='local', optimizer='sgd') + data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx[0]) for _, shape in mod.data_shapes] + batch = mx.io.DataBatch(data, []) + for i in range(dry_run + num_batches): + if i == dry_run: + tic = time.time() + mod.forward(batch, is_train=True) + for output in mod.get_outputs(): + output.wait_to_read() + mod.backward() + mod.update() + bwd = time.time() - tic + return bwd + +if __name__ == '__main__': + runtype = opt.type + bs = opt.batch_size + + if opt.model == 'all': + networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201', + 'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75', + 'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75', + 'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', + 'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', + 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', + 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn'] + logging.info('It may take some time to run all models, ' + 'set --network to run a specific one') + else: + networks = [opt.model] + + devs = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()] + num_gpus = len(devs) + + for network in networks: + logging.info('network: %s', network) + logging.info('device: %s', devs) + if runtype == 'inference' or runtype == 'all': + if bs != 0: + fwd_time = score(network, bs, devs) + fps = (bs * num_batches)/fwd_time + logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) + else: + logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' + 'set --batch-size to run a specific one') + for batch_size in batch_inf: + fwd_time = score(network, batch_size, devs) + fps = (batch_size * num_batches) / fwd_time + logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) + if runtype == 'training' or runtype == 'all': + if bs != 0: + bwd_time = train(network, bs, devs) + fps = (bs * num_batches) / bwd_time + logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) + else: + logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' + 'set --batch-size to run a specific one') + for batch_size in batch_train: + bwd_time = train(network, batch_size, devs) + fps = (batch_size * num_batches) / bwd_time + logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps) diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py index a4118ebcf76b..e81a30bd6439 100644 --- a/example/image-classification/benchmark_score.py +++ b/example/image-classification/benchmark_score.py @@ -21,26 +21,49 @@ from common import find_mxnet from common.util import get_gpus import mxnet as mx +import mxnet.gluon.model_zoo.vision as models from importlib import import_module import logging +import argparse import time import numpy as np logging.basicConfig(level=logging.DEBUG) +parser = argparse.ArgumentParser(description='SymbolAPI-based CNN inference performance benchmark') +parser.add_argument('--network', type=str, default='all', + choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', + 'resnet-152', 'inception-bn', 'inception-v3', + 'inception-v4', 'inception-resnet-v2', 'mobilenet', + 'densenet121', 'squeezenet1.1']) +parser.add_argument('--batch-size', type=int, default=0, + help='Batch size to use for benchmarking. Example: 32, 64, 128.' + 'By default, runs benchmark for batch sizes - 1, 32, 64, 128, 256') + +opt = parser.parse_args() + def get_symbol(network, batch_size, dtype): - image_shape = (3,299,299) if network == 'inception-v3' else (3,224,224) + image_shape = (3,299,299) if network in ['inception-v3', 'inception-v4'] else (3,224,224) num_layers = 0 - if 'resnet' in network: + if network == 'inception-resnet-v2': + network = network + elif 'resnet' in network: num_layers = int(network.split('-')[1]) network = network.split('-')[0] if 'vgg' in network: num_layers = int(network.split('-')[1]) network = 'vgg' - net = import_module('symbols.'+network) - sym = net.get_symbol(num_classes=1000, - image_shape=','.join([str(i) for i in image_shape]), - num_layers=num_layers, - dtype=dtype) + if network in ['densenet121', 'squeezenet1.1']: + sym = models.get_model(network) + sym.hybridize() + data = mx.sym.var('data') + sym = sym(data) + sym = mx.sym.SoftmaxOutput(sym, name='softmax') + else: + net = import_module('symbols.'+network) + sym = net.get_symbol(num_classes=1000, + image_shape=','.join([str(i) for i in image_shape]), + num_layers=num_layers, + dtype=dtype) return (sym, [('data', (batch_size,)+image_shape)]) def score(network, dev, batch_size, num_batches, dtype): @@ -69,14 +92,31 @@ def score(network, dev, batch_size, num_batches, dtype): return num_batches*batch_size/(time.time() - tic) if __name__ == '__main__': - networks = ['alexnet', 'vgg-16', 'inception-bn', 'inception-v3', 'resnetv1-50', 'resnet-50', 'resnet-152'] + if opt.network == 'all': + networks = ['alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', + 'resnet-152', 'inception-bn', 'inception-v3', + 'inception-v4', 'inception-resnet-v2', + 'mobilenet', 'densenet121', 'squeezenet1.1'] + logging.info('It may take some time to run all models, ' + 'set --network to run a specific one') + else: + networks = [opt.network] devs = [mx.gpu(0)] if len(get_gpus()) > 0 else [] # Enable USE_MKLDNN for better CPU performance devs.append(mx.cpu()) - batch_sizes = [1, 2, 4, 8, 16, 32] + if opt.batch_size == 0: + batch_sizes = [1, 32, 64, 128, 256] + logging.info('run batchsize [1, 32, 64, 128, 256] by default, ' + 'set --batch-size to run a specific one') + else: + batch_sizes = [opt.batch_size] + for net in networks: logging.info('network: %s', net) + if net in ['densenet121', 'squeezenet1.1']: + logging.info('network: %s is converted from gluon modelzoo', net) + logging.info('you can run benchmark/python/gluon/benchmark_gluon.py for more models') for d in devs: logging.info('device: %s', d) logged_fp16_warning = False