From 46961d97553bc592875045ab87f71934ca44ccf7 Mon Sep 17 00:00:00 2001 From: dperi Date: Tue, 31 May 2022 12:00:32 -0700 Subject: [PATCH 01/13] chore: additional options for perf_run tool Signed-off-by: dperi --- tools/perf/config/vgg16.yml | 17 ++++----- tools/perf/perf_run.py | 72 +++++++++++++++++++------------------ 2 files changed, 46 insertions(+), 43 deletions(-) diff --git a/tools/perf/config/vgg16.yml b/tools/perf/config/vgg16.yml index 360663cfc0..d88d489458 100755 --- a/tools/perf/config/vgg16.yml +++ b/tools/perf/config/vgg16.yml @@ -1,18 +1,19 @@ ---- -backend: +--- +backend: - all -input: - input0: +input: + input0: - 1 - 3 - 224 - 224 num_inputs: 1 -model: - filename: models/vgg16_traced.jit.pt + batch_size: 1 +model: + filename: models/vgg16_scripted.jit.pt name: vgg16 -runtime: +runtime: device: 0 - precision: + precision: - fp32 - fp16 diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 69573f5908..08d792683c 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -26,7 +26,7 @@ def __init__(self, config_file): self.parser = None self.config = config_file self.params = None - + # Reads and loads the yaml file def read_config(self): with open(self.config, "r") as stream: @@ -45,7 +45,7 @@ def get(self, key, default_value=None): return self.params[key] # Runs inference using Torch backend -def run_torch(model, input_tensors, params, precision): +def run_torch(model, input_tensors, params, precision, batch_size): print("Running Torch for precision: ", precision) iters = params.get('iterations', 20) @@ -66,25 +66,25 @@ def run_torch(model, input_tensors, params, precision): meas_time = end_time - start_time timings.append(meas_time) print("Iteration {}: {:.6f} s".format(i, end_time - start_time)) - - printStats("Torch", timings, precision) + + printStats("Torch", timings, precision, batch_size) # Runs inference using Torch-TensorRT backend -def run_torch_tensorrt(model, input_tensors, params, precision): +def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size): print("Running Torch-TensorRT") - # Compiling Torch-TensorRT model compile_settings = { "inputs": input_tensors, - "enabled_precisions": {precision_to_dtype(precision)} + "enabled_precisions": {precision_to_dtype(precision)} , + "truncate_long_and_double": truncate_long_and_double, } if precision == 'int8': compile_settings.update({"calib": params.get('calibration_cache')}) - + model = torchtrt.compile(model, **compile_settings) - + iters = params.get('iterations', 20) # Warm up with torch.no_grad(): @@ -103,8 +103,8 @@ def run_torch_tensorrt(model, input_tensors, params, precision): meas_time = end_time - start_time timings.append(meas_time) print("Iteration {}: {:.6f} s".format(i, end_time - start_time)) - - printStats("Torch-TensorRT", timings, precision) + + printStats("Torch-TensorRT", timings, precision, batch_size) def torch_dtype_from_trt(dtype): if dtype == trt.int8: @@ -129,7 +129,7 @@ def torch_device_from_trt(device): return TypeError("%s is not supported by torch" % device) -def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False): +def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False, batch_size=1): engine = None # If the model file is a TensorRT engine then directly deserialize and run inference @@ -143,22 +143,21 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False): print("Converting method to TensorRT engine...") with torch.no_grad(): model = 
torchtrt.ts.convert_method_to_trt_engine(model, "forward", **compile_settings) - + # Deserialize the TensorRT engine with trt.Logger() as logger, trt.Runtime(logger) as runtime: engine = runtime.deserialize_cuda_engine(model) - + print("Running TensorRT") iters = params.get('iterations', 20) - batch_size = params.get('batch', 1) # Compiling the bindings bindings = engine.num_bindings * [None] - + # import pdb; pdb.set_trace() k = 0 for idx,_ in enumerate(bindings): dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx)) - shape = (batch_size,) + tuple(engine.get_binding_shape(idx)) + shape = tuple(engine.get_binding_shape(idx)) device = torch_device_from_trt(engine.get_location(idx)) if not engine.binding_is_input(idx): # Output bindings @@ -168,26 +167,26 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False): # Input bindings bindings[idx] = input_tensors[k].data_ptr() k += 1 - + timings = [] with engine.create_execution_context() as context: for i in range(WARMUP_ITER): - context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream) + context.execute_async(1, bindings, torch.cuda.current_stream().cuda_stream) torch.cuda.synchronize() for i in range(iters): start_time = timeit.default_timer() - context.execute_async(batch_size, bindings, torch.cuda.current_stream().cuda_stream) + context.execute_async(1, bindings, torch.cuda.current_stream().cuda_stream) torch.cuda.synchronize() end_time = timeit.default_timer() meas_time = end_time - start_time timings.append(meas_time) print("Iterations {}: {:.6f} s".format(i, end_time - start_time)) - - printStats("TensorRT", timings, precision) + + printStats("TensorRT", timings, precision, batch_size) # Deploys inference run for different backend configurations -def run(model, input_tensors, params, precision, is_trt_engine = False): +def run(model, input_tensors, params, precision, truncate_long_and_double = False, batch_size = 1, is_trt_engine = False): for backend in params.get('backend'): if precision == 'int8': @@ -200,18 +199,19 @@ def run(model, input_tensors, params, precision, is_trt_engine = False): return False if backend == 'all': - run_torch(model, input_tensors, params, precision) - run_torch_tensorrt(model, input_tensors, params, precision) - run_tensorrt(model, input_tensors, params, precision, is_trt_engine) - + run_torch(model, input_tensors, params, precision, batch_size) + run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size) + # import pdb; pdb.set_trace() + run_tensorrt(model, input_tensors, params, precision, is_trt_engine, batch_size) + elif backend == "torch": - run_torch(model, input_tensors, params, precision) - + run_torch(model, input_tensors, params, precision, batch_size) + elif backend == "torch_tensorrt": - run_torch_tensorrt(model, input_tensors, params, precision) - + run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size) + elif backend == "tensorrt": - run_tensorrt(model, input_tensors, params, precision, is_trt_engine) + run_tensorrt(model, input_tensors, params, precision, is_trt_engine, batch_size) # Generate report def printStats(backend, timings, precision, batch_size = 1): @@ -274,7 +274,7 @@ def load_model(params): arg_parser = argparse.ArgumentParser(description="Run inference on a model with random input values") arg_parser.add_argument("--config", help="Load YAML based configuration file to run the inference. 
If this is used other params will be ignored") args = arg_parser.parse_args() - + parser = ConfigParser(args.config) # Load YAML params params = parser.read_config() @@ -293,6 +293,8 @@ def load_model(params): torch.manual_seed(12345) num_input = params.get('input').get('num_inputs') + truncate_long_and_double = params.get('runtime').get('truncate_long_and_double', False) + batch_size = params.get('input').get('batch_size', 1) for precision in params.get('runtime').get('precision', 'fp32'): input_tensors = [] num_input = params.get('input').get('num_inputs', 1) @@ -306,9 +308,9 @@ def load_model(params): if not is_trt_engine and precision == "fp16" or precision == "half": # If model is TensorRT serialized engine then model.half will report failure model = model.half() - + # Run inference - status = run(model, input_tensors, params, precision, is_trt_engine) + status = run(model, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) if status == False: continue From 7779b500321ad591e7dce3f9bd978f6d9f3430a6 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 27 Jul 2022 14:58:54 -0700 Subject: [PATCH 02/13] feat: Add fx2trt backend and revamp current perf utility to accept CLI arguments Signed-off-by: Dheeraj Peri --- tools/perf/perf_run.py | 174 ++++++++++++++++++++++++++++------------- 1 file changed, 120 insertions(+), 54 deletions(-) diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 08d792683c..226cc26b71 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -15,7 +15,11 @@ # Importing supported Backends import torch import torch_tensorrt as torchtrt +import torch_tensorrt.fx.tracer.acc_tracer.acc_tracer as acc_tracer +from torch_tensorrt.fx import InputTensorSpec, TRTInterpreter +from torch_tensorrt.fx import TRTModule import tensorrt as trt +from utils import parse_inputs, parse_backends, precision_to_dtype, BENCHMARK_MODELS WARMUP_ITER = 10 results = [] @@ -71,7 +75,7 @@ def run_torch(model, input_tensors, params, precision, batch_size): # Runs inference using Torch-TensorRT backend def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size): - print("Running Torch-TensorRT") + print("Running Torch-TensorRT for precision: ", precision) # Compiling Torch-TensorRT model compile_settings = { "inputs": input_tensors, @@ -82,8 +86,8 @@ def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_an if precision == 'int8': compile_settings.update({"calib": params.get('calibration_cache')}) - - model = torchtrt.compile(model, **compile_settings) + with torchtrt.logging.errors(): + model = torchtrt.compile(model, **compile_settings) iters = params.get('iterations', 20) # Warm up @@ -106,6 +110,55 @@ def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_an printStats("Torch-TensorRT", timings, precision, batch_size) +# Runs inference using FX2TRT backend +def run_fx2trt(model, input_tensors, params, precision, batch_size): + print("Running FX2TRT for precision: ", precision) + + # Trace the model with acc_tracer. + acc_mod = acc_tracer.trace(model, input_tensors) + # Generate input specs + input_specs = InputTensorSpec.from_tensors(input_tensors) + # Build a TRT interpreter. Set explicit_batch_dimension accordingly. 
+ interpreter = TRTInterpreter( + acc_mod, input_specs, explicit_batch_dimension=True + ) + trt_interpreter_result = interpreter.run( + max_batch_size=batch_size, + lower_precision=precision, + max_workspace_size=1 << 25, + sparse_weights=False, + force_fp32_output=False, + strict_type_constraints=False, + algorithm_selector=None, + timing_cache=None, + profiling_verbosity=None) + + model = TRTModule( + trt_interpreter_result.engine, + trt_interpreter_result.input_names, + trt_interpreter_result.output_names) + + iters = params.get('iterations', 20) + # Warm up + with torch.no_grad(): + for _ in range(WARMUP_ITER): + features = model(*input_tensors) + + torch.cuda.synchronize() + + timings = [] + with torch.no_grad(): + for i in range(iters): + start_time = timeit.default_timer() + features = model(*input_tensors) + torch.cuda.synchronize() + end_time = timeit.default_timer() + meas_time = end_time - start_time + timings.append(meas_time) + print("Iteration {}: {:.6f} s".format(i, end_time - start_time)) + + printStats("FX-TensorRT", timings, precision, batch_size) + def torch_dtype_from_trt(dtype): if dtype == trt.int8: return torch.int8 @@ -141,19 +194,18 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False, b } print("Converting method to TensorRT engine...") - with torch.no_grad(): + with torch.no_grad(), torchtrt.logging.errors(): model = torchtrt.ts.convert_method_to_trt_engine(model, "forward", **compile_settings) # Deserialize the TensorRT engine with trt.Logger() as logger, trt.Runtime(logger) as runtime: engine = runtime.deserialize_cuda_engine(model) - print("Running TensorRT") + print("Running TensorRT for precision: ", precision) iters = params.get('iterations', 20) # Compiling the bindings bindings = engine.num_bindings * [None] - # import pdb; pdb.set_trace() k = 0 for idx,_ in enumerate(bindings): dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx)) @@ -171,12 +223,12 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False, b timings = [] with engine.create_execution_context() as context: for i in range(WARMUP_ITER): - context.execute_async(1, bindings, torch.cuda.current_stream().cuda_stream) + context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream) torch.cuda.synchronize() for i in range(iters): start_time = timeit.default_timer() - context.execute_async(1, bindings, torch.cuda.current_stream().cuda_stream) + context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream) torch.cuda.synchronize() end_time = timeit.default_timer() meas_time = end_time - start_time @@ -186,9 +238,8 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False, b printStats("TensorRT", timings, precision, batch_size) # Deploys inference run for different backend configurations -def run(model, input_tensors, params, precision, truncate_long_and_double = False, batch_size = 1, is_trt_engine = False): - for backend in params.get('backend'): - +def run(model, backends, input_tensors, params, precision, truncate_long_and_double = False, batch_size = 1, is_trt_engine = False): + for backend in backends: if precision == 'int8': if backend == 'all' or backend == 'torch': print("int8 precision is not supported for torch runtime in this script yet") @@ -201,7 +252,6 @@ def run(model, input_tensors, params, precision, truncate_long_and_double = Fals if backend == 'all': run_torch(model, input_tensors, params, precision, batch_size) run_torch_tensorrt(model, input_tensors, params, precision, 
truncate_long_and_double, batch_size) - # import pdb; pdb.set_trace() run_tensorrt(model, input_tensors, params, precision, is_trt_engine, batch_size) elif backend == "torch": @@ -210,6 +260,9 @@ def run(model, input_tensors, params, precision, truncate_long_and_double = Fals elif backend == "torch_tensorrt": run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size) + elif backend == "fx2trt": + run_fx2trt(model, input_tensors, params, precision, batch_size) + elif backend == "tensorrt": run_tensorrt(model, input_tensors, params, precision, is_trt_engine, batch_size) @@ -246,14 +299,6 @@ def printStats(backend, timings, precision, batch_size = 1): } results.append(meas) -def precision_to_dtype(pr): - if pr == 'fp32': - return torch.float - elif pr == 'fp16' or pr == 'half': - return torch.half - else: - return torch.int8 - def load_model(params): model = None is_trt_engine = False @@ -272,47 +317,68 @@ def load_model(params): if __name__ == '__main__': arg_parser = argparse.ArgumentParser(description="Run inference on a model with random input values") - arg_parser.add_argument("--config", help="Load YAML based configuration file to run the inference. If this is used other params will be ignored") + arg_parser.add_argument("--config", type=str, help="Load YAML based configuration file to run the inference. If this is used other params will be ignored") + # The following options are manual user provided settings + arg_parser.add_argument("--backends", type=str, help="Comma separated string of backends. Eg: torch,torch_tensorrt,tensorrt") + arg_parser.add_argument("--model", type=str, help="Name of the model file") + arg_parser.add_argument("--inputs", type=str, help="List of input shapes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT") + arg_parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + arg_parser.add_argument("--precision", default="fp32", type=str, help="Precision of TensorRT engine") + arg_parser.add_argument("--device", type=int, help="device id") + arg_parser.add_argument("--truncate", action='store_true', help="Truncate long and double weights in the network") + arg_parser.add_argument("--is_trt_engine", action='store_true', help="Boolean flag to determine if the user provided model is a TRT engine or not") args = arg_parser.parse_args() - parser = ConfigParser(args.config) - # Load YAML params - params = parser.read_config() - print("Loading model: ", params.get('model').get('filename')) - - model = None - - # Default device is set to 0. Configurable using yaml config file. - torch.cuda.set_device(params.get('runtime').get('device', 0)) - - # Load the model file from disk. 
If the loaded file is TensorRT engine then is_trt_engine is returned as True - model, is_trt_engine = load_model(params) cudnn.benchmark = True - # Create random input tensor of certain size torch.manual_seed(12345) - num_input = params.get('input').get('num_inputs') - truncate_long_and_double = params.get('runtime').get('truncate_long_and_double', False) - batch_size = params.get('input').get('batch_size', 1) - for precision in params.get('runtime').get('precision', 'fp32'): - input_tensors = [] - num_input = params.get('input').get('num_inputs', 1) - for i in range(num_input): - inp_tensor = params.get('input').get('input' + str(i)) - input_tensors.append(torch.randint(0, 2, tuple(d for d in inp_tensor), dtype=precision_to_dtype(precision)).cuda()) - - if is_trt_engine: - print("Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results") - - if not is_trt_engine and precision == "fp16" or precision == "half": - # If model is TensorRT serialized engine then model.half will report failure - model = model.half() - + if args.config: + parser = ConfigParser(args.config) + # Load YAML params + params = parser.read_config() + print("Loading model: ", params.get('model').get('filename')) + model_file = params.get('model').get('filename') + # Default device is set to 0. Configurable using yaml config file. + torch.cuda.set_device(params.get('runtime').get('device', 0)) + + num_input = params.get('input').get('num_inputs') + truncate_long_and_double = params.get('runtime').get('truncate_long_and_double', False) + batch_size = params.get('input').get('batch_size', 1) + for precision in params.get('runtime').get('precision', 'fp32'): + input_tensors = [] + num_input = params.get('input').get('num_inputs', 1) + for i in range(num_input): + inp_tensor = params.get('input').get('input' + str(i)) + input_tensors.append(torch.randint(0, 2, tuple(d for d in inp_tensor), dtype=precision_to_dtype(precision)).cuda()) + + if is_trt_engine: + print("Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results") + + if not is_trt_engine and precision == "fp16" or precision == "half": + # If model is TensorRT serialized engine then model.half will report failure + model = model.half() + backends = params.get('backend') + # Run inference + status = run(model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) + else: + params = vars(args) + model_name = params['model'] + if os.path.exists(model_name): + print("Loading user provided model: ", model_name) + model = torch.jit.load(model_name).cuda().eval() + elif model_name in BENCHMARK_MODELS: + model = BENCHMARK_MODELS[model_name]['model'].eval().cuda() + else: + raise ValueError("Invalid model name. 
Please provide a torchscript model file or model name (among the following options vgg16|resnet50|efficientnet_b0|vit)") + precision = params['precision'] + input_tensors = parse_inputs(params['inputs']) + backends = parse_backends(params['backends']) + truncate_long_and_double = params.get('truncate', False) + batch_size = params['batch_size'] + is_trt_engine = params['is_trt_engine'] # Run inference - status = run(model, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) - if status == False: - continue + status = run(model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) # Generate report print('Model Summary:') From d108f87de677dc9407e3debd203d65e9da636b47 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 3 Aug 2022 13:50:43 -0700 Subject: [PATCH 03/13] chore: Refactor fx2trt functionality Signed-off-by: Dheeraj Peri --- tools/perf/perf_run.py | 43 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 226cc26b71..da5a35f50a 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -15,9 +15,12 @@ # Importing supported Backends import torch import torch_tensorrt as torchtrt -import torch_tensorrt.fx.tracer.acc_tracer.acc_tracer as acc_tracer -from torch_tensorrt.fx import InputTensorSpec, TRTInterpreter -from torch_tensorrt.fx import TRTModule +# import torch_tensorrt.fx.tracer.acc_tracer.acc_tracer as acc_tracer +# from torch_tensorrt.fx import InputTensorSpec, TRTInterpreter +# from torch_tensorrt.fx import TRTModule +from torch_tensorrt.fx.lower import lower_to_trt +from torch_tensorrt.fx.utils import LowerPrecision + import tensorrt as trt from utils import parse_inputs, parse_backends, precision_to_dtype, BENCHMARK_MODELS @@ -113,30 +116,18 @@ def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_an # Runs inference using FX2TRT backend def run_fx2trt(model, input_tensors, params, precision, batch_size): print("Running FX2TRT for precision: ", precision) - - # Trace the model with acc_tracer. - acc_mod = acc_tracer.trace(model, input_tensors) - # Generate input specs - input_specs = InputTensorSpec.from_tensors(input_tensors) - # Build a TRT interpreter. Set explicit_batch_dimension accordingly. 
- interpreter = TRTInterpreter( - acc_mod, input_specs, explicit_batch_dimension=True + if precision == "fp32": + precision = LowerPrecision.FP32 + elif precision == "fp16": + precision = LowerPrecision.FP16 + # Run lowering eager mode benchmark + model = lower_to_trt( + model, + input_tensors, + max_batch_size=batch_size, + lower_precision=precision, + verbose_log=True, ) - trt_interpreter_result = interpreter.run( - max_batch_size=batch_size, - lower_precision=precision, - max_workspace_size=1 << 25, - sparse_weights=False, - force_fp32_output=False, - strict_type_constraints=False, - algorithm_selector=None, - timing_cache=None, - profiling_verbosity=None) - - model = TRTModule( - trt_interpreter_result.engine, - trt_interpreter_result.input_names, - trt_interpreter_result.output_names) iters = params.get('iterations', 20) # Warm up From e92a813720916203d657cc2537fbc7fb1544438c Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 3 Aug 2022 17:06:14 -0700 Subject: [PATCH 04/13] chore: Fix fp16 functionality for fx2trt backend Signed-off-by: Dheeraj Peri --- tools/perf/perf_run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index da5a35f50a..13ff0858ba 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -120,13 +120,15 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): precision = LowerPrecision.FP32 elif precision == "fp16": precision = LowerPrecision.FP16 + model.half() + input_tensors = [tensor.half() for tensor in input_tensors] # Run lowering eager mode benchmark model = lower_to_trt( model, input_tensors, max_batch_size=batch_size, lower_precision=precision, - verbose_log=True, + verbose_log=False, ) iters = params.get('iterations', 20) From 2925c8a9ff03afe674b360072507600126245a48 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 9 Aug 2022 22:56:01 -0700 Subject: [PATCH 05/13] chore: refactor Signed-off-by: Dheeraj Peri --- tools/perf/benchmark.sh | 61 +++++++++++++++++ tools/perf/custom_models.py | 29 ++++++++ tools/perf/hub.py | 129 ++++++++++++++++++++++++++++++++++++ tools/perf/perf_run.py | 109 +++++++++++++++--------------- 4 files changed, 276 insertions(+), 52 deletions(-) create mode 100644 tools/perf/benchmark.sh create mode 100644 tools/perf/custom_models.py create mode 100644 tools/perf/hub.py diff --git a/tools/perf/benchmark.sh b/tools/perf/benchmark.sh new file mode 100644 index 0000000000..cdb9c2f3fd --- /dev/null +++ b/tools/perf/benchmark.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Download the Torchscript models +# python hub.py + +batch_sizes=(1 2 4 8 16 32 64 128 256) + +# # Benchmark VGG16 model +# echo "Benchmarking VGG16 model" +# for bs in 1 2 +# do +# python perf_run.py --model models/vgg16_scripted.jit.pt \ +# --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ +# --batch_size ${bs} \ +# --backends torch,torch_tensorrt,tensorrt \ +# --report "vgg_perf_bs${bs}.txt" +# done +# +# # Benchmark Resnet50 model +# echo "Benchmarking Resnet50 model" +# for bs in 1 2 +# do +# python perf_run.py --model models/resnet50_scripted.jit.pt \ +# --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ +# --batch_size ${bs} \ +# --backends torch,torch_tensorrt,tensorrt \ +# --report "rn50_perf_bs${bs}.txt" +# done +# +# # Benchmark VIT model +# echo "Benchmarking VIT model" +# for bs in 1 2 +# do +# python perf_run.py --model models/vit_scripted.jit.pt \ +# --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ +# --batch_size ${bs} \ +# --backends 
torch,torch_tensorrt,tensorrt \ +# --report "vit_perf_bs${bs}.txt" +# done +# +# # Benchmark EfficientNet-B0 model +# echo "Benchmarking EfficientNet-B0 model" +# for bs in 1 2 +# do +# python perf_run.py --model models/efficientnet_b0_scripted.jit.pt \ +# --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ +# --batch_size ${bs} \ +# --backends torch,torch_tensorrt,tensorrt \ +# --report "eff_b0_perf_bs${bs}.txt" +# done + +# Benchmark BERT model +for bs in 1 +do + python perf_run.py --model models/bert_base_uncased_traced.jit.pt \ + --precision fp32 --inputs="(${bs}, 128)@int32;(${bs}, 128)@int32" \ + --batch_size ${bs} \ + --backends torch_tensorrt \ + --truncate \ + --report "bert_base_perf_bs${bs}.txt" +done diff --git a/tools/perf/custom_models.py b/tools/perf/custom_models.py new file mode 100644 index 0000000000..679425ca3d --- /dev/null +++ b/tools/perf/custom_models.py @@ -0,0 +1,29 @@ +import torch +import torch.nn as nn +from transformers import BertModel, BertTokenizer, BertConfig +import torch.nn.functional as F + +def BertModule(): + model_name = "bert-base-uncased" + enc = BertTokenizer.from_pretrained(model_name) + text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" + tokenized_text = enc.tokenize(text) + masked_index = 8 + tokenized_text[masked_index] = "[MASK]" + indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) + segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] + tokens_tensor = torch.tensor([indexed_tokens]) + segments_tensors = torch.tensor([segments_ids]) + config = BertConfig( + vocab_size_or_config_json_file=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + torchscript=True, + ) + model = BertModel(config) + model.eval() + model = BertModel.from_pretrained(model_name, torchscript=True) + traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) + return traced_model diff --git a/tools/perf/hub.py b/tools/perf/hub.py new file mode 100644 index 0000000000..6f2a0fad9e --- /dev/null +++ b/tools/perf/hub.py @@ -0,0 +1,129 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models +import timm +from transformers import BertModel, BertTokenizer, BertConfig +import os +import json +import custom_models as cm + +torch.hub._validate_not_a_forked_repo = lambda a, b, c: True + +torch_version = torch.__version__ + +# Detect case of no GPU before deserialization of models on GPU +if not torch.cuda.is_available(): + raise Exception("No GPU found. 
Please check if installed torch version is compatible with CUDA version") + +# Downloads all model files again if manifest file is not present +MANIFEST_FILE = 'model_manifest.json' + +BENCHMARK_MODELS = { + "vgg16": { + "model": models.vgg16(weights=None), + "path": "script" + }, + "resnet50": { + "model": models.resnet50(weights=None), + "path": "script" + }, + "efficientnet_b0": { + "model": timm.create_model('efficientnet_b0', pretrained=True), + "path": "script" + }, + "vit": { + "model": timm.create_model('vit_base_patch16_224', pretrained=True), + "path": "script" + }, + "bert_base_uncased": { + "model": cm.BertModule(), + "path": "trace" + }, +} + + +def get(n, m, manifest): + print("Downloading {}".format(n)) + traced_filename = "models/" + n + '_traced.jit.pt' + script_filename = "models/" + n + '_scripted.jit.pt' + x = torch.ones((1, 3, 300, 300)).cuda() + if n == "bert-base-uncased": + traced_model = m["model"] + torch.jit.save(traced_model, traced_filename) + manifest.update({n: [traced_filename]}) + else: + m["model"] = m["model"].eval().cuda() + if m["path"] == "both" or m["path"] == "trace": + trace_model = torch.jit.trace(m["model"], [x]) + torch.jit.save(trace_model, traced_filename) + manifest.update({n: [traced_filename]}) + if m["path"] == "both" or m["path"] == "script": + script_model = torch.jit.script(m["model"]) + torch.jit.save(script_model, script_filename) + if n in manifest.keys(): + files = list(manifest[n]) if type(manifest[n]) != list else manifest[n] + files.append(script_filename) + manifest.update({n: files}) + else: + manifest.update({n: [script_filename]}) + return manifest + + +def download_models(version_matches, manifest): + # Download all models if torch version is different than model version + if not version_matches: + for n, m in BENCHMARK_MODELS.items(): + manifest = get(n, m, manifest) + else: + for n, m in BENCHMARK_MODELS.items(): + scripted_filename = "models/" + n + "_scripted.jit.pt" + traced_filename = "models/" + n + "_traced.jit.pt" + # Check if model file exists on disk + if (m["path"] == "both" and os.path.exists(scripted_filename) and os.path.exists(traced_filename)) or \ + (m["path"] == "script" and os.path.exists(scripted_filename)) or \ + (m["path"] == "trace" and os.path.exists(traced_filename)): + print("Skipping {} ".format(n)) + continue + manifest = get(n, m, manifest) + + +def main(): + manifest = None + version_matches = False + manifest_exists = False + + # Check if Manifest file exists or is empty + if not os.path.exists(MANIFEST_FILE) or os.stat(MANIFEST_FILE).st_size == 0: + manifest = {"version": torch_version} + + # Creating an empty manifest file for overwriting post setup + os.system('touch {}'.format(MANIFEST_FILE)) + else: + manifest_exists = True + + # Load manifest if already exists + with open(MANIFEST_FILE, 'r') as f: + manifest = json.load(f) + if manifest['version'] == torch_version: + version_matches = True + else: + print("Torch version: {} mismatches \ + with manifest's version: {}. 
Re-downloading \ + all models".format(torch_version, manifest['version'])) + + # Overwrite the manifest version as current torch version + manifest['version'] = torch_version + + download_models(version_matches, manifest) + + # Write updated manifest file to disk + with open(MANIFEST_FILE, 'r+') as f: + data = f.read() + f.seek(0) + record = json.dumps(manifest) + f.write(record) + f.truncate() + + +main() diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index cef3508d7f..421c1a444e 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -19,7 +19,7 @@ from torch_tensorrt.fx.utils import LowerPrecision import tensorrt as trt -from utils import parse_inputs, parse_backends, precision_to_dtype, BENCHMARK_MODELS +from utils import parse_inputs, parse_backends, precision_to_dtype, parse_precisions, BENCHMARK_MODELS WARMUP_ITER = 10 results = [] @@ -50,7 +50,7 @@ def get(self, key, default_value=None): # Runs inference using Torch backend def run_torch(model, input_tensors, params, precision, batch_size): - print("Running Torch for precision: ", precision) + print("Running Torch for precision: ", precision, " batch_size : ", batch_size) iters = params.get('iterations', 20) # Warm up @@ -69,24 +69,24 @@ def run_torch(model, input_tensors, params, precision, batch_size): end_time = timeit.default_timer() meas_time = end_time - start_time timings.append(meas_time) - print("Iteration {}: {:.6f} s".format(i, end_time - start_time)) - printStats("Torch", timings, precision, batch_size) + recordStats("Torch", timings, precision, batch_size) # Runs inference using Torch-TensorRT backend def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size): - print("Running Torch-TensorRT for precision: ", precision) + print("Running Torch-TensorRT for precision: ", precision, " batch_size : ", batch_size) # Compiling Torch-TensorRT model compile_settings = { "inputs": input_tensors, "enabled_precisions": {precision_to_dtype(precision)} , "truncate_long_and_double": truncate_long_and_double, + "min_block_size" : 1, } if precision == 'int8': compile_settings.update({"calib": params.get('calibration_cache')}) - with torchtrt.logging.errors(): + with torchtrt.logging.debug(): model = torchtrt.compile(model, **compile_settings) iters = params.get('iterations', 20) @@ -106,13 +106,12 @@ def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_an end_time = timeit.default_timer() meas_time = end_time - start_time timings.append(meas_time) - print("Iteration {}: {:.6f} s".format(i, end_time - start_time)) - printStats("Torch-TensorRT", timings, precision, batch_size) + recordStats("Torch-TensorRT", timings, precision, batch_size) # Runs inference using FX2TRT backend def run_fx2trt(model, input_tensors, params, precision, batch_size): - print("Running FX2TRT for precision: ", precision) + print("Running FX2TRT for precision: ", precision, " batch_size : ", batch_size) if precision == "fp32": precision = LowerPrecision.FP32 elif precision == "fp16": @@ -145,9 +144,8 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): end_time = timeit.default_timer() meas_time = end_time - start_time timings.append(meas_time) - print("Iteration {}: {:.6f} s".format(i, end_time - start_time)) - printStats("FX-TensorRT", timings, precision, batch_size) + recordStats("FX-TensorRT", timings, precision, batch_size) def torch_dtype_from_trt(dtype): if dtype == trt.int8: @@ -172,7 +170,7 @@ def torch_device_from_trt(device): return 
TypeError("%s is not supported by torch" % device) -def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False, batch_size=1): +def run_tensorrt(model, input_tensors, params, precision, truncate_long_and_double=False, is_trt_engine=False, batch_size=1): engine = None # If the model file is a TensorRT engine then directly deserialize and run inference @@ -180,7 +178,8 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False, b if not is_trt_engine: compile_settings = { "inputs": input_tensors, - "enabled_precisions": {precision_to_dtype(precision)} + "enabled_precisions": {precision_to_dtype(precision)}, + "truncate_long_and_double": truncate_long_and_double, } print("Converting method to TensorRT engine...") @@ -191,7 +190,7 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False, b with trt.Logger() as logger, trt.Runtime(logger) as runtime: engine = runtime.deserialize_cuda_engine(model) - print("Running TensorRT for precision: ", precision) + print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size) iters = params.get('iterations', 20) # Compiling the bindings @@ -223,12 +222,11 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False, b end_time = timeit.default_timer() meas_time = end_time - start_time timings.append(meas_time) - print("Iterations {}: {:.6f} s".format(i, end_time - start_time)) - printStats("TensorRT", timings, precision, batch_size) + recordStats("TensorRT", timings, precision, batch_size) # Deploys inference run for different backend configurations -def run(model, backends, input_tensors, params, precision, truncate_long_and_double = False, batch_size = 1, is_trt_engine = False): +def run(model, backends, input_tensors, params, precision, truncate_long_and_double=False, batch_size=1, is_trt_engine=False): for backend in backends: if precision == 'int8': if backend == 'all' or backend == 'torch': @@ -242,7 +240,7 @@ def run(model, backends, input_tensors, params, precision, truncate_long_and_dou if backend == 'all': run_torch(model, input_tensors, params, precision, batch_size) run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size) - run_tensorrt(model, input_tensors, params, precision, is_trt_engine, batch_size) + run_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, is_trt_engine, batch_size) elif backend == "torch": run_torch(model, input_tensors, params, precision, batch_size) @@ -254,10 +252,10 @@ def run(model, backends, input_tensors, params, precision, truncate_long_and_dou run_fx2trt(model, input_tensors, params, precision, batch_size) elif backend == "tensorrt": - run_tensorrt(model, input_tensors, params, precision, is_trt_engine, batch_size) + run_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, is_trt_engine, batch_size) # Generate report -def printStats(backend, timings, precision, batch_size = 1): +def recordStats(backend, timings, precision, batch_size = 1): times = np.array(timings) steps = len(times) speeds = batch_size / times @@ -268,41 +266,37 @@ def printStats(backend, timings, precision, batch_size = 1): speed_mean = np.mean(speeds) speed_med = np.median(speeds) - msg = ("\n%s =================================\n" - "batch size=%d, num iterations=%d\n" - " Median FPS: %.1f, mean: %.1f\n" - " Median latency: %.6f, mean: %.6f, 99th_p: %.6f, std_dev: %.6f\n" - ) % (backend, - batch_size, steps, - speed_med, speed_mean, - time_med, 
time_mean, time_99th, time_std) - print(msg) - meas = { + stats = { 'Backend' : backend, - 'precision' : precision, + 'Precision' : precision, + 'Batch size' : batch_size, 'Median(FPS)' : speed_med, 'Mean(FPS)' : speed_mean, 'Median-Latency(ms)' : time_med, 'Mean-Latency(ms)' : time_mean, - '99th_p' : time_99th, - 'std_dev': time_std } - results.append(meas) + results.append(stats) def load_model(params): model = None is_trt_engine = False # Load torch model traced/scripted model_file = params.get('model').get('filename') + try : + model_name = params.get('model').get('name') + except: + model_name = model_file - if model_file.endswith('.jit.pt'): - model = torch.jit.load(model_file).cuda() - else: + print("Loading model: ", model_file) + if model_file.endswith('.plan'): is_trt_engine = True # Read the TensorRT engine file with open(model_file, 'rb') as fin: model = fin.read() - return model, is_trt_engine + else: + model = torch.jit.load(model_file).cuda() + + return model, model_name, is_trt_engine if __name__ == '__main__': @@ -312,23 +306,24 @@ def load_model(params): arg_parser.add_argument("--backends", type=str, help="Comma separated string of backends. Eg: torch,torch_tensorrt,tensorrt") arg_parser.add_argument("--model", type=str, help="Name of the model file") arg_parser.add_argument("--inputs", type=str, help="List of input shapes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT") - arg_parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + arg_parser.add_argument("--batch_size", type=int, default=1, help="Batch size to build and run") arg_parser.add_argument("--precision", default="fp32", type=str, help="Precision of TensorRT engine") arg_parser.add_argument("--device", type=int, help="device id") arg_parser.add_argument("--truncate", action='store_true', help="Truncate long and double weights in the network") arg_parser.add_argument("--is_trt_engine", action='store_true', help="Boolean flag to determine if the user provided model is a TRT engine or not") + arg_parser.add_argument("--report", type=str, help="Path of the output file where performance summary is written.") args = arg_parser.parse_args() cudnn.benchmark = True # Create random input tensor of certain size torch.manual_seed(12345) - + model_name = "Model" if args.config: parser = ConfigParser(args.config) # Load YAML params params = parser.read_config() - print("Loading model: ", params.get('model').get('filename')) - model_file = params.get('model').get('filename') + model, model_name, is_trt_engine = load_model(params) + # Default device is set to 0. Configurable using yaml config file. torch.cuda.set_device(params.get('runtime').get('device', 0)) @@ -345,12 +340,13 @@ def load_model(params): if is_trt_engine: print("Warning, TensorRT engine file is configured. 
Please make sure the precision matches with the TRT engine for reliable results") - if not is_trt_engine and precision == "fp16" or precision == "half": + if not is_trt_engine and (precision == "fp16" or precision == "half"): # If model is TensorRT serialized engine then model.half will report failure model = model.half() - backends = params.get('backend') - # Run inference - status = run(model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) + + backends = params.get('backend') + # Run inference + status = run(model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) else: params = vars(args) model_name = params['model'] @@ -361,16 +357,25 @@ def load_model(params): model = BENCHMARK_MODELS[model_name]['model'].eval().cuda() else: raise ValueError("Invalid model name. Please provide a torchscript model file or model name (among the following options vgg16|resnet50|efficientnet_b0|vit)") - precision = params['precision'] - input_tensors = parse_inputs(params['inputs']) + backends = parse_backends(params['backends']) - truncate_long_and_double = params.get('truncate', False) + truncate_long_and_double = params['truncate'] batch_size = params['batch_size'] is_trt_engine = params['is_trt_engine'] - # Run inference - status = run(model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) + precisions = parse_precisions(params['precision']) + + for precision in precisions: + input_tensors = parse_inputs(params['inputs'], precision_to_dtype(precision)) + if not is_trt_engine and (precision == "fp16" or precision == "half"): + # If model is TensorRT serialized engine then model.half will report failure + model = model.half() + status = run(model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) # Generate report - print('Model Summary:') + print('Model Summary: ', model_name) summary = pd.DataFrame(results) print(summary) + with open(args.report, 'w') as file: + file.write('Model Summary: ' + model_name + '\n') + file.write(summary.to_string()) + file.close() From 46d0e86ecb489ebfef48001356428e7311e52ca9 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 9 Aug 2022 23:21:31 -0700 Subject: [PATCH 06/13] chore: minor change Signed-off-by: Dheeraj Peri --- tools/perf/benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/benchmark.sh b/tools/perf/benchmark.sh index cdb9c2f3fd..618a31d017 100644 --- a/tools/perf/benchmark.sh +++ b/tools/perf/benchmark.sh @@ -1,7 +1,7 @@ #!/bin/bash # Download the Torchscript models -# python hub.py +python hub.py batch_sizes=(1 2 4 8 16 32 64 128 256) From f8285baee41df8015a33f349c430214319e819c2 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Aug 2022 11:11:58 -0700 Subject: [PATCH 07/13] refactor: Refactor perf_run and add internal benchmark scripts Signed-off-by: Dheeraj Peri --- tools/perf/README.md | 219 +++++++++++++++++++++++----------------- tools/perf/benchmark.sh | 95 ++++++++--------- tools/perf/perf_run.py | 7 +- tools/perf/utils.py | 59 +++++++++++ 4 files changed, 241 insertions(+), 139 deletions(-) create mode 100644 tools/perf/utils.py diff --git a/tools/perf/README.md b/tools/perf/README.md index 4c4a58bfd0..9a127b12f2 100644 --- a/tools/perf/README.md +++ b/tools/perf/README.md @@ -1,90 +1,129 @@ -# Performance Benchmarking - -This is a comprehensive Python benchmark suite to run perf runs using different supported 
backends. Following backends are supported: - -1. Torch -2. Torch-TensorRT -3. TensorRT - -Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package. - -## Prerequisite - -Benchmark scripts depends on following Python packages in addition to requirements.txt packages - -1. Torch-TensorRT -2. Torch -3. TensorRT - -## Structure - -``` -./ -├── config -│ ├── vgg16_trt.yml -│ └── vgg16.yml -├── models -├── perf_run.py -└── README.md -``` - -Please save your configuration files at config directory. Similarly, place your model files at models path. - -## Usage - -To run the benchmark for a given configuration file: - -``` -python perf_run.py --config=config/vgg16.yml -``` - -## Configuration - -There are two sample configuration files added. - -* vgg16.yml demonstrates a configuration with all the supported backends (Torch, Torch-TensorRT, TensorRT) -* vgg16_trt.yml demonstrates how to use an external TensorRT serialized engine file directly. - - -### Supported fields - -| Name | Supported Values | Description | -| --- | --- | --- | -| backend | all, torch, torch_tensorrt, tensorrt | Supported backends for inference. | -| input | - | Input binding names. Expected to list shapes of each input bindings | -| model | - | Configure the model filename and name | -| filename | - | Model file name to load from disk. | -| name | - | Model name | -| runtime | - | Runtime configurations | -| device | 0 | Target device ID to run inference. Range depends on available GPUs | -| precision | fp32, fp16 or half, int8 | Target precision to run inference. int8 cannot be used with 'all' backend | -| calibration_cache | - | Calibration cache file expected for torch_tensorrt runtime in int8 precision | - -Note: -1. Please note that torch runtime perf is not supported for int8 yet. -2. Torchscript module filename should end with .jit.pt otherwise it will be treated as a TensorRT engine. - - - -Additional sample use case: - -``` -backend: - - torch - - torch_tensorrt - - tensorrt -input: - input0: - - 3 - - 224 - - 224 - num_inputs: 1 -model: - filename: model.plan - name: vgg16 -runtime: - device: 0 - precision: - - fp32 - - fp16 -``` +# Performance Benchmarking + +This is a comprehensive Python benchmark suite to run perf runs using different supported backends. Following backends are supported: + +1. Torch +2. Torch-TensorRT +3. FX-TRT +4. TensorRT + + +Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package. + +## Prerequisite + +Benchmark scripts depends on following Python packages in addition to requirements.txt packages + +1. Torch-TensorRT +2. Torch +3. TensorRT + +## Structure + +``` +./ +├── config +│ ├── vgg16_trt.yml +│ └── vgg16.yml +├── models +├── perf_run.py +├── hub.py +├── custom_models.py +├── requirements.txt +├── benchmark.sh +└── README.md +``` + + + +* `config` - Directory which contains sample yaml configuration files for VGG network. +* `models` - Model directory +* `perf_run.py` - Performance benchmarking script which supports torch, torch_tensorrt, fx2trt, tensorrt backends +* `hub.py` - Script to download torchscript models for VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT +* `custom_models.py` - Script which includes custom models other than torchvision and timm (eg: HF BERT) +* `utils.py` - utility functions script +* `benchmark.sh` - This is used for internal performance testing of VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT. 
+ +## Usage + +There are two ways you can run a performance benchmark. + +### Using YAML config files + +To run the benchmark for a given configuration file: + +```python +python perf_run.py --config=config/vgg16.yml +``` + +There are two sample configuration files added. + +* vgg16.yml demonstrates a configuration with all the supported backends (Torch, Torch-TensorRT, TensorRT) +* vgg16_trt.yml demonstrates how to use an external TensorRT serialized engine file directly. + + +### Supported fields + +| Name | Supported Values | Description | +| ----------------- | ------------------------------------ | ------------------------------------------------------------ | +| backend | all, torch, torch_tensorrt, tensorrt | Supported backends for inference. | +| input | - | Input binding names. Expected to list shapes of each input bindings | +| model | - | Configure the model filename and name | +| filename | - | Model file name to load from disk. | +| name | - | Model name | +| runtime | - | Runtime configurations | +| device | 0 | Target device ID to run inference. Range depends on available GPUs | +| precision | fp32, fp16 or half, int8 | Target precision to run inference. int8 cannot be used with 'all' backend | +| calibration_cache | - | Calibration cache file expected for torch_tensorrt runtime in int8 precision | + +Additional sample use case: + +``` +backend: + - torch + - torch_tensorrt + - tensorrt +input: + input0: + - 3 + - 224 + - 224 + num_inputs: 1 +model: + filename: model.plan + name: vgg16 +runtime: + device: 0 + precision: + - fp32 + - fp16 +``` + +Note: + +1. Please note that measuring INT8 performance is only supported via a `calibration cache` file or QAT mode for `torch_tensorrt` backend. +2. TensorRT engine filename should end with `.plan` otherwise it will be treated as Torchscript module. + +### Using CompileSpec options via CLI + +Here are the list of `CompileSpec` options that can be provided directly to compile the pytorch module + +* `--backends` : Comma separated string of backends. Eg: torch,torch_tensorrt,tensorrt +* `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)) +* `--inputs` : List of input shapes & dtypes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT +* `--batch_size` : Batch size +* `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16 +* `--device` : Device ID +* `--truncate` : Truncate long and double weights in the network in Torch-TensorRT +* `--is_trt_engine` : Boolean flag to be enabled if the model file provided is a TensorRT engine. +* `--report` : Path of the output file where performance summary is written. 
+ +Eg: + +``` + python perf_run.py --model ${MODELS_DIR}/vgg16_scripted.jit.pt \ + --precision fp32,fp16 --inputs="(1, 3, 224, 224)@fp32" \ + --batch_size 1 \ + --backends torch,torch_tensorrt,tensorrt \ + --report "vgg_perf_bs1.txt" +``` diff --git a/tools/perf/benchmark.sh b/tools/perf/benchmark.sh index 618a31d017..cf659cc86a 100644 --- a/tools/perf/benchmark.sh +++ b/tools/perf/benchmark.sh @@ -1,61 +1,64 @@ #!/bin/bash +MODELS_DIR="models" + # Download the Torchscript models python hub.py batch_sizes=(1 2 4 8 16 32 64 128 256) -# # Benchmark VGG16 model -# echo "Benchmarking VGG16 model" -# for bs in 1 2 -# do -# python perf_run.py --model models/vgg16_scripted.jit.pt \ -# --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ -# --batch_size ${bs} \ -# --backends torch,torch_tensorrt,tensorrt \ -# --report "vgg_perf_bs${bs}.txt" -# done -# -# # Benchmark Resnet50 model -# echo "Benchmarking Resnet50 model" -# for bs in 1 2 -# do -# python perf_run.py --model models/resnet50_scripted.jit.pt \ -# --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ -# --batch_size ${bs} \ -# --backends torch,torch_tensorrt,tensorrt \ -# --report "rn50_perf_bs${bs}.txt" -# done -# -# # Benchmark VIT model -# echo "Benchmarking VIT model" -# for bs in 1 2 -# do -# python perf_run.py --model models/vit_scripted.jit.pt \ -# --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ -# --batch_size ${bs} \ -# --backends torch,torch_tensorrt,tensorrt \ -# --report "vit_perf_bs${bs}.txt" -# done -# -# # Benchmark EfficientNet-B0 model -# echo "Benchmarking EfficientNet-B0 model" -# for bs in 1 2 -# do -# python perf_run.py --model models/efficientnet_b0_scripted.jit.pt \ -# --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ -# --batch_size ${bs} \ -# --backends torch,torch_tensorrt,tensorrt \ -# --report "eff_b0_perf_bs${bs}.txt" -# done +#Benchmark VGG16 model +echo "Benchmarking VGG16 model" +for bs in batch_sizes +do + python perf_run.py --model ${MODELS_DIR}/vgg16_scripted.jit.pt \ + --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ + --batch_size ${bs} \ + --backends torch,torch_tensorrt,tensorrt \ + --report "vgg_perf_bs${bs}.txt" +done + +# Benchmark Resnet50 model +echo "Benchmarking Resnet50 model" +for bs in batch_sizes +do + python perf_run.py --model ${MODELS_DIR}/resnet50_scripted.jit.pt \ + --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ + --batch_size ${bs} \ + --backends torch,torch_tensorrt,tensorrt \ + --report "rn50_perf_bs${bs}.txt" +done + +# Benchmark VIT model +echo "Benchmarking VIT model" +for bs in batch_sizes +do + python perf_run.py --model ${MODELS_DIR}/vit_scripted.jit.pt \ + --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ + --batch_size ${bs} \ + --backends torch,torch_tensorrt,tensorrt \ + --report "vit_perf_bs${bs}.txt" +done + +# Benchmark EfficientNet-B0 model +echo "Benchmarking EfficientNet-B0 model" +for bs in batch_sizes +do + python perf_run.py --model ${MODELS_DIR}/efficientnet_b0_scripted.jit.pt \ + --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ + --batch_size ${bs} \ + --backends torch,torch_tensorrt,tensorrt \ + --report "eff_b0_perf_bs${bs}.txt" +done # Benchmark BERT model -for bs in 1 +echo "Benchmarking Huggingface BERT base model" +for bs in batch_sizes do - python perf_run.py --model models/bert_base_uncased_traced.jit.pt \ + python perf_run.py --model ${MODELS_DIR}/bert_base_uncased_traced.jit.pt \ --precision fp32 --inputs="(${bs}, 128)@int32;(${bs}, 128)@int32" \ --batch_size ${bs} \ - --backends torch_tensorrt \ + 
--backends torch,torch_tensorrt \ --truncate \ --report "bert_base_perf_bs${bs}.txt" done diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 421c1a444e..8517732543 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -86,7 +86,7 @@ def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_an if precision == 'int8': compile_settings.update({"calib": params.get('calibration_cache')}) - with torchtrt.logging.debug(): + with torchtrt.logging.errors(): model = torchtrt.compile(model, **compile_settings) iters = params.get('iterations', 20) @@ -307,9 +307,10 @@ def load_model(params): arg_parser.add_argument("--model", type=str, help="Name of the model file") arg_parser.add_argument("--inputs", type=str, help="List of input shapes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT") arg_parser.add_argument("--batch_size", type=int, default=1, help="Batch size to build and run") - arg_parser.add_argument("--precision", default="fp32", type=str, help="Precision of TensorRT engine") + arg_parser.add_argument("--precision", default="fp32", type=str, help="Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16") + arg_parser.add_argument("--calibration_cache", type=str, help="Name of the calibration cache file") arg_parser.add_argument("--device", type=int, help="device id") - arg_parser.add_argument("--truncate", action='store_true', help="Truncate long and double weights in the network") + arg_parser.add_argument("--truncate", action='store_true', help="Truncate long and double weights in the network in Torch-TensorRT") arg_parser.add_argument("--is_trt_engine", action='store_true', help="Boolean flag to determine if the user provided model is a TRT engine or not") arg_parser.add_argument("--report", type=str, help="Path of the output file where performance summary is written.") args = arg_parser.parse_args() diff --git a/tools/perf/utils.py b/tools/perf/utils.py new file mode 100644 index 0000000000..3b38a89c8c --- /dev/null +++ b/tools/perf/utils.py @@ -0,0 +1,59 @@ +import torch +import torch_tensorrt +import custom_models as cm +import torchvision.models as models +import timm + +BENCHMARK_MODELS = { + "vgg16": { + "model": models.vgg16(pretrained=True), + "path": "script" + }, + "resnet50": { + "model": torch.hub.load('pytorch/vision:v0.9.0', 'resnet50', pretrained=True), + "path": "script" + }, + "efficientnet_b0": { + "model": timm.create_model('efficientnet_b0', pretrained=True), + "path": "script" + }, + "vit": { + "model": timm.create_model('vit_base_patch16_224', pretrained=True), + "path": "script" + }, + "bert_base_uncased": { + "model": cm.BertModule(), + "path": "trace" + }, +} + +def precision_to_dtype(pr): + if pr == 'fp32': + return torch.float + elif pr == 'fp16' or pr == 'half': + return torch.half + elif pr == 'int32': + return torch.int32 + elif pr == 'bool': + return torch.bool + else: + return torch.float32 + +def parse_inputs(user_inputs, dtype): + parsed_inputs = user_inputs.split(';') + torchtrt_inputs = [] + for input in parsed_inputs: + input_shape = [] + input_shape_and_dtype = input.split('@') + dtype = precision_to_dtype(input_shape_and_dtype[1]) if len(input_shape_and_dtype) == 2 else dtype + for input_dim in input_shape_and_dtype[0][1:-1].split(','): + input_shape.append(int(input_dim)) + torchtrt_inputs.append(torch.randint(0, 5, input_shape, dtype=dtype).cuda()) + + return torchtrt_inputs + +def parse_backends(backends): + return backends.split(',') + +def 
parse_precisions(precisions): + return precisions.split(',') From 561c3394fe5bc73a7495b72b79ad418ff062259d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Aug 2022 11:29:07 -0700 Subject: [PATCH 08/13] chore : minor refactor Signed-off-by: Dheeraj Peri --- tools/perf/README.md | 4 ++-- tools/perf/perf_run.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/README.md b/tools/perf/README.md index 9a127b12f2..44dc88d1c4 100644 --- a/tools/perf/README.md +++ b/tools/perf/README.md @@ -108,8 +108,8 @@ Note: Here are the list of `CompileSpec` options that can be provided directly to compile the pytorch module -* `--backends` : Comma separated string of backends. Eg: torch,torch_tensorrt,tensorrt -* `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)) +* `--backends` : Comma separated string of backends. Eg: torch,torch_tensorrt, tensorrt or fx2trt +* `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)). If the backend is `fx2trt`, the input should be a Pytorch module (instead of a torchscript module) and the options for model are (`vgg16` | `resnet50` | `efficientnet_b0`) * `--inputs` : List of input shapes & dtypes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT * `--batch_size` : Batch size * `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16 diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 8517732543..c472100668 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -303,7 +303,7 @@ def load_model(params): arg_parser = argparse.ArgumentParser(description="Run inference on a model with random input values") arg_parser.add_argument("--config", type=str, help="Load YAML based configuration file to run the inference. If this is used other params will be ignored") # The following options are manual user provided settings - arg_parser.add_argument("--backends", type=str, help="Comma separated string of backends. Eg: torch,torch_tensorrt,tensorrt") + arg_parser.add_argument("--backends", type=str, help="Comma separated string of backends. Eg: torch,torch_tensorrt,fx2trt,tensorrt") arg_parser.add_argument("--model", type=str, help="Name of the model file") arg_parser.add_argument("--inputs", type=str, help="List of input shapes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT") arg_parser.add_argument("--batch_size", type=int, default=1, help="Batch size to build and run") From 5bf2f4afc8bb3bf49b13b77bf42c85c226f2139e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 11 Aug 2022 12:35:39 -0700 Subject: [PATCH 09/13] chore: Apply precommit tooling Signed-off-by: Dheeraj Peri --- tools/perf/README.md | 258 ++++++++++++++++++------------------ tools/perf/custom_models.py | 1 + tools/perf/hub.py | 57 ++++---- tools/perf/perf_run.py | 168 ++++++++++++++--------- tools/perf/utils.py | 47 +++---- 5 files changed, 278 insertions(+), 253 deletions(-) diff --git a/tools/perf/README.md b/tools/perf/README.md index 44dc88d1c4..d430d2b234 100644 --- a/tools/perf/README.md +++ b/tools/perf/README.md @@ -1,129 +1,129 @@ -# Performance Benchmarking - -This is a comprehensive Python benchmark suite to run perf runs using different supported backends. Following backends are supported: - -1. Torch -2. Torch-TensorRT -3. FX-TRT -4. 
TensorRT - - -Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package. - -## Prerequisite - -Benchmark scripts depends on following Python packages in addition to requirements.txt packages - -1. Torch-TensorRT -2. Torch -3. TensorRT - -## Structure - -``` -./ -├── config -│ ├── vgg16_trt.yml -│ └── vgg16.yml -├── models -├── perf_run.py -├── hub.py -├── custom_models.py -├── requirements.txt -├── benchmark.sh -└── README.md -``` - - - -* `config` - Directory which contains sample yaml configuration files for VGG network. -* `models` - Model directory -* `perf_run.py` - Performance benchmarking script which supports torch, torch_tensorrt, fx2trt, tensorrt backends -* `hub.py` - Script to download torchscript models for VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT -* `custom_models.py` - Script which includes custom models other than torchvision and timm (eg: HF BERT) -* `utils.py` - utility functions script -* `benchmark.sh` - This is used for internal performance testing of VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT. - -## Usage - -There are two ways you can run a performance benchmark. - -### Using YAML config files - -To run the benchmark for a given configuration file: - -```python -python perf_run.py --config=config/vgg16.yml -``` - -There are two sample configuration files added. - -* vgg16.yml demonstrates a configuration with all the supported backends (Torch, Torch-TensorRT, TensorRT) -* vgg16_trt.yml demonstrates how to use an external TensorRT serialized engine file directly. - - -### Supported fields - -| Name | Supported Values | Description | -| ----------------- | ------------------------------------ | ------------------------------------------------------------ | -| backend | all, torch, torch_tensorrt, tensorrt | Supported backends for inference. | -| input | - | Input binding names. Expected to list shapes of each input bindings | -| model | - | Configure the model filename and name | -| filename | - | Model file name to load from disk. | -| name | - | Model name | -| runtime | - | Runtime configurations | -| device | 0 | Target device ID to run inference. Range depends on available GPUs | -| precision | fp32, fp16 or half, int8 | Target precision to run inference. int8 cannot be used with 'all' backend | -| calibration_cache | - | Calibration cache file expected for torch_tensorrt runtime in int8 precision | - -Additional sample use case: - -``` -backend: - - torch - - torch_tensorrt - - tensorrt -input: - input0: - - 3 - - 224 - - 224 - num_inputs: 1 -model: - filename: model.plan - name: vgg16 -runtime: - device: 0 - precision: - - fp32 - - fp16 -``` - -Note: - -1. Please note that measuring INT8 performance is only supported via a `calibration cache` file or QAT mode for `torch_tensorrt` backend. -2. TensorRT engine filename should end with `.plan` otherwise it will be treated as Torchscript module. - -### Using CompileSpec options via CLI - -Here are the list of `CompileSpec` options that can be provided directly to compile the pytorch module - -* `--backends` : Comma separated string of backends. Eg: torch,torch_tensorrt, tensorrt or fx2trt -* `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)). If the backend is `fx2trt`, the input should be a Pytorch module (instead of a torchscript module) and the options for model are (`vgg16` | `resnet50` | `efficientnet_b0`) -* `--inputs` : List of input shapes & dtypes. 
Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT -* `--batch_size` : Batch size -* `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16 -* `--device` : Device ID -* `--truncate` : Truncate long and double weights in the network in Torch-TensorRT -* `--is_trt_engine` : Boolean flag to be enabled if the model file provided is a TensorRT engine. -* `--report` : Path of the output file where performance summary is written. - -Eg: - -``` - python perf_run.py --model ${MODELS_DIR}/vgg16_scripted.jit.pt \ - --precision fp32,fp16 --inputs="(1, 3, 224, 224)@fp32" \ - --batch_size 1 \ - --backends torch,torch_tensorrt,tensorrt \ - --report "vgg_perf_bs1.txt" -``` +# Performance Benchmarking + +This is a comprehensive Python benchmark suite to run perf runs using different supported backends. Following backends are supported: + +1. Torch +2. Torch-TensorRT +3. FX-TRT +4. TensorRT + + +Note: Please note that for ONNX models, user can convert the ONNX model to TensorRT serialized engine and then use this package. + +## Prerequisite + +Benchmark scripts depends on following Python packages in addition to requirements.txt packages + +1. Torch-TensorRT +2. Torch +3. TensorRT + +## Structure + +``` +./ +├── config +│ ├── vgg16_trt.yml +│ └── vgg16.yml +├── models +├── perf_run.py +├── hub.py +├── custom_models.py +├── requirements.txt +├── benchmark.sh +└── README.md +``` + + + +* `config` - Directory which contains sample yaml configuration files for VGG network. +* `models` - Model directory +* `perf_run.py` - Performance benchmarking script which supports torch, torch_tensorrt, fx2trt, tensorrt backends +* `hub.py` - Script to download torchscript models for VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT +* `custom_models.py` - Script which includes custom models other than torchvision and timm (eg: HF BERT) +* `utils.py` - utility functions script +* `benchmark.sh` - This is used for internal performance testing of VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT. + +## Usage + +There are two ways you can run a performance benchmark. + +### Using YAML config files + +To run the benchmark for a given configuration file: + +```python +python perf_run.py --config=config/vgg16.yml +``` + +There are two sample configuration files added. + +* vgg16.yml demonstrates a configuration with all the supported backends (Torch, Torch-TensorRT, TensorRT) +* vgg16_trt.yml demonstrates how to use an external TensorRT serialized engine file directly. + + +### Supported fields + +| Name | Supported Values | Description | +| ----------------- | ------------------------------------ | ------------------------------------------------------------ | +| backend | all, torch, torch_tensorrt, tensorrt | Supported backends for inference. | +| input | - | Input binding names. Expected to list shapes of each input bindings | +| model | - | Configure the model filename and name | +| filename | - | Model file name to load from disk. | +| name | - | Model name | +| runtime | - | Runtime configurations | +| device | 0 | Target device ID to run inference. Range depends on available GPUs | +| precision | fp32, fp16 or half, int8 | Target precision to run inference. 
int8 cannot be used with 'all' backend | +| calibration_cache | - | Calibration cache file expected for torch_tensorrt runtime in int8 precision | + +Additional sample use case: + +``` +backend: + - torch + - torch_tensorrt + - tensorrt +input: + input0: + - 3 + - 224 + - 224 + num_inputs: 1 +model: + filename: model.plan + name: vgg16 +runtime: + device: 0 + precision: + - fp32 + - fp16 +``` + +Note: + +1. Please note that measuring INT8 performance is only supported via a `calibration cache` file or QAT mode for `torch_tensorrt` backend. +2. TensorRT engine filename should end with `.plan` otherwise it will be treated as Torchscript module. + +### Using CompileSpec options via CLI + +Here are the list of `CompileSpec` options that can be provided directly to compile the pytorch module + +* `--backends` : Comma separated string of backends. Eg: torch,torch_tensorrt, tensorrt or fx2trt +* `--model` : Name of the model file (Can be a torchscript module or a tensorrt engine (ending in `.plan` extension)). If the backend is `fx2trt`, the input should be a Pytorch module (instead of a torchscript module) and the options for model are (`vgg16` | `resnet50` | `efficientnet_b0`) +* `--inputs` : List of input shapes & dtypes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT +* `--batch_size` : Batch size +* `--precision` : Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16 +* `--device` : Device ID +* `--truncate` : Truncate long and double weights in the network in Torch-TensorRT +* `--is_trt_engine` : Boolean flag to be enabled if the model file provided is a TensorRT engine. +* `--report` : Path of the output file where performance summary is written. + +Eg: + +``` + python perf_run.py --model ${MODELS_DIR}/vgg16_scripted.jit.pt \ + --precision fp32,fp16 --inputs="(1, 3, 224, 224)@fp32" \ + --batch_size 1 \ + --backends torch,torch_tensorrt,tensorrt \ + --report "vgg_perf_bs1.txt" +``` diff --git a/tools/perf/custom_models.py b/tools/perf/custom_models.py index 679425ca3d..a8b8a5dae0 100644 --- a/tools/perf/custom_models.py +++ b/tools/perf/custom_models.py @@ -3,6 +3,7 @@ from transformers import BertModel, BertTokenizer, BertConfig import torch.nn.functional as F + def BertModule(): model_name = "bert-base-uncased" enc = BertTokenizer.from_pretrained(model_name) diff --git a/tools/perf/hub.py b/tools/perf/hub.py index 6f2a0fad9e..c209c64ecc 100644 --- a/tools/perf/hub.py +++ b/tools/perf/hub.py @@ -17,36 +17,21 @@ raise Exception("No GPU found. 
Please check if installed torch version is compatible with CUDA version") # Downloads all model files again if manifest file is not present -MANIFEST_FILE = 'model_manifest.json' +MANIFEST_FILE = "model_manifest.json" BENCHMARK_MODELS = { - "vgg16": { - "model": models.vgg16(weights=None), - "path": "script" - }, - "resnet50": { - "model": models.resnet50(weights=None), - "path": "script" - }, - "efficientnet_b0": { - "model": timm.create_model('efficientnet_b0', pretrained=True), - "path": "script" - }, - "vit": { - "model": timm.create_model('vit_base_patch16_224', pretrained=True), - "path": "script" - }, - "bert_base_uncased": { - "model": cm.BertModule(), - "path": "trace" - }, + "vgg16": {"model": models.vgg16(weights=None), "path": "script"}, + "resnet50": {"model": models.resnet50(weights=None), "path": "script"}, + "efficientnet_b0": {"model": timm.create_model("efficientnet_b0", pretrained=True), "path": "script"}, + "vit": {"model": timm.create_model("vit_base_patch16_224", pretrained=True), "path": "script"}, + "bert_base_uncased": {"model": cm.BertModule(), "path": "trace"}, } def get(n, m, manifest): print("Downloading {}".format(n)) - traced_filename = "models/" + n + '_traced.jit.pt' - script_filename = "models/" + n + '_scripted.jit.pt' + traced_filename = "models/" + n + "_traced.jit.pt" + script_filename = "models/" + n + "_scripted.jit.pt" x = torch.ones((1, 3, 300, 300)).cuda() if n == "bert-base-uncased": traced_model = m["model"] @@ -80,9 +65,11 @@ def download_models(version_matches, manifest): scripted_filename = "models/" + n + "_scripted.jit.pt" traced_filename = "models/" + n + "_traced.jit.pt" # Check if model file exists on disk - if (m["path"] == "both" and os.path.exists(scripted_filename) and os.path.exists(traced_filename)) or \ - (m["path"] == "script" and os.path.exists(scripted_filename)) or \ - (m["path"] == "trace" and os.path.exists(traced_filename)): + if ( + (m["path"] == "both" and os.path.exists(scripted_filename) and os.path.exists(traced_filename)) + or (m["path"] == "script" and os.path.exists(scripted_filename)) + or (m["path"] == "trace" and os.path.exists(traced_filename)) + ): print("Skipping {} ".format(n)) continue manifest = get(n, m, manifest) @@ -98,27 +85,31 @@ def main(): manifest = {"version": torch_version} # Creating an empty manifest file for overwriting post setup - os.system('touch {}'.format(MANIFEST_FILE)) + os.system("touch {}".format(MANIFEST_FILE)) else: manifest_exists = True # Load manifest if already exists - with open(MANIFEST_FILE, 'r') as f: + with open(MANIFEST_FILE, "r") as f: manifest = json.load(f) - if manifest['version'] == torch_version: + if manifest["version"] == torch_version: version_matches = True else: - print("Torch version: {} mismatches \ + print( + "Torch version: {} mismatches \ with manifest's version: {}. 
Re-downloading \ - all models".format(torch_version, manifest['version'])) + all models".format( + torch_version, manifest["version"] + ) + ) # Overwrite the manifest version as current torch version - manifest['version'] = torch_version + manifest["version"] = torch_version download_models(version_matches, manifest) # Write updated manifest file to disk - with open(MANIFEST_FILE, 'r+') as f: + with open(MANIFEST_FILE, "r+") as f: data = f.read() f.seek(0) record = json.dumps(manifest) diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index c472100668..af0bae7653 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -44,14 +44,17 @@ def read_config(self): def get(self, key, default_value=None): if not key in self.params: if not default_value: - raise ValueError('Key {} is not present and default_value is not configured. Please run it with default value', key) + raise ValueError( + "Key {} is not present and default_value is not configured. Please run it with default value", key + ) self.params[key] = default_value return self.params[key] + # Runs inference using Torch backend def run_torch(model, input_tensors, params, precision, batch_size): print("Running Torch for precision: ", precision, " batch_size : ", batch_size) - iters = params.get('iterations', 20) + iters = params.get("iterations", 20) # Warm up with torch.no_grad(): @@ -72,24 +75,24 @@ def run_torch(model, input_tensors, params, precision, batch_size): recordStats("Torch", timings, precision, batch_size) + # Runs inference using Torch-TensorRT backend def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size): print("Running Torch-TensorRT for precision: ", precision, " batch_size : ", batch_size) # Compiling Torch-TensorRT model compile_settings = { - "inputs": input_tensors, - "enabled_precisions": {precision_to_dtype(precision)} , - "truncate_long_and_double": truncate_long_and_double, - "min_block_size" : 1, + "inputs": input_tensors, + "enabled_precisions": {precision_to_dtype(precision)}, + "truncate_long_and_double": truncate_long_and_double, } - if precision == 'int8': - compile_settings.update({"calib": params.get('calibration_cache')}) + if precision == "int8": + compile_settings.update({"calib": params.get("calibration_cache")}) with torchtrt.logging.errors(): model = torchtrt.compile(model, **compile_settings) - iters = params.get('iterations', 20) + iters = params.get("iterations", 20) # Warm up with torch.no_grad(): for _ in range(WARMUP_ITER): @@ -109,6 +112,7 @@ def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_an recordStats("Torch-TensorRT", timings, precision, batch_size) + # Runs inference using FX2TRT backend def run_fx2trt(model, input_tensors, params, precision, batch_size): print("Running FX2TRT for precision: ", precision, " batch_size : ", batch_size) @@ -127,7 +131,7 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): verbose_log=False, ) - iters = params.get('iterations', 20) + iters = params.get("iterations", 20) # Warm up with torch.no_grad(): for _ in range(WARMUP_ITER): @@ -147,6 +151,7 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): recordStats("FX-TensorRT", timings, precision, batch_size) + def torch_dtype_from_trt(dtype): if dtype == trt.int8: return torch.int8 @@ -161,6 +166,7 @@ def torch_dtype_from_trt(dtype): else: raise TypeError("%s is not supported by torch" % dtype) + def torch_device_from_trt(device): if device == trt.TensorLocation.DEVICE: 
return torch.device("cuda") @@ -170,7 +176,9 @@ def torch_device_from_trt(device): return TypeError("%s is not supported by torch" % device) -def run_tensorrt(model, input_tensors, params, precision, truncate_long_and_double=False, is_trt_engine=False, batch_size=1): +def run_tensorrt( + model, input_tensors, params, precision, truncate_long_and_double=False, is_trt_engine=False, batch_size=1 +): engine = None # If the model file is a TensorRT engine then directly deserialize and run inference @@ -191,12 +199,12 @@ def run_tensorrt(model, input_tensors, params, precision, truncate_long_and_doub engine = runtime.deserialize_cuda_engine(model) print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size) - iters = params.get('iterations', 20) + iters = params.get("iterations", 20) # Compiling the bindings bindings = engine.num_bindings * [None] k = 0 - for idx,_ in enumerate(bindings): + for idx, _ in enumerate(bindings): dtype = torch_dtype_from_trt(engine.get_binding_dtype(idx)) shape = tuple(engine.get_binding_shape(idx)) device = torch_device_from_trt(engine.get_location(idx)) @@ -225,19 +233,22 @@ def run_tensorrt(model, input_tensors, params, precision, truncate_long_and_doub recordStats("TensorRT", timings, precision, batch_size) + # Deploys inference run for different backend configurations -def run(model, backends, input_tensors, params, precision, truncate_long_and_double=False, batch_size=1, is_trt_engine=False): +def run( + model, backends, input_tensors, params, precision, truncate_long_and_double=False, batch_size=1, is_trt_engine=False +): for backend in backends: - if precision == 'int8': - if backend == 'all' or backend == 'torch': + if precision == "int8": + if backend == "all" or backend == "torch": print("int8 precision is not supported for torch runtime in this script yet") return False - if backend == 'all' or backend == 'torch_tensorrt' or params.get('calibration_cache', None) == None: + if backend == "all" or backend == "torch_tensorrt" or params.get("calibration_cache", None) == None: print("int8 precision expects calibration cache file for inference") return False - if backend == 'all': + if backend == "all": run_torch(model, input_tensors, params, precision, batch_size) run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size) run_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, is_trt_engine, batch_size) @@ -254,8 +265,9 @@ def run(model, backends, input_tensors, params, precision, truncate_long_and_dou elif backend == "tensorrt": run_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, is_trt_engine, batch_size) + # Generate report -def recordStats(backend, timings, precision, batch_size = 1): +def recordStats(backend, timings, precision, batch_size=1): times = np.array(timings) steps = len(times) speeds = batch_size / times @@ -267,31 +279,32 @@ def recordStats(backend, timings, precision, batch_size = 1): speed_med = np.median(speeds) stats = { - 'Backend' : backend, - 'Precision' : precision, - 'Batch size' : batch_size, - 'Median(FPS)' : speed_med, - 'Mean(FPS)' : speed_mean, - 'Median-Latency(ms)' : time_med, - 'Mean-Latency(ms)' : time_mean, + "Backend": backend, + "Precision": precision, + "Batch size": batch_size, + "Median(FPS)": speed_med, + "Mean(FPS)": speed_mean, + "Median-Latency(ms)": time_med, + "Mean-Latency(ms)": time_mean, } results.append(stats) + def load_model(params): model = None is_trt_engine = False # Load torch model 
traced/scripted - model_file = params.get('model').get('filename') - try : - model_name = params.get('model').get('name') + model_file = params.get("model").get("filename") + try: + model_name = params.get("model").get("name") except: model_name = model_file print("Loading model: ", model_file) - if model_file.endswith('.plan'): + if model_file.endswith(".plan"): is_trt_engine = True # Read the TensorRT engine file - with open(model_file, 'rb') as fin: + with open(model_file, "rb") as fin: model = fin.read() else: model = torch.jit.load(model_file).cuda() @@ -299,19 +312,40 @@ def load_model(params): return model, model_name, is_trt_engine -if __name__ == '__main__': +if __name__ == "__main__": arg_parser = argparse.ArgumentParser(description="Run inference on a model with random input values") - arg_parser.add_argument("--config", type=str, help="Load YAML based configuration file to run the inference. If this is used other params will be ignored") + arg_parser.add_argument( + "--config", + type=str, + help="Load YAML based configuration file to run the inference. If this is used other params will be ignored", + ) # The following options are manual user provided settings - arg_parser.add_argument("--backends", type=str, help="Comma separated string of backends. Eg: torch,torch_tensorrt,fx2trt,tensorrt") + arg_parser.add_argument( + "--backends", type=str, help="Comma separated string of backends. Eg: torch,torch_tensorrt,fx2trt,tensorrt" + ) arg_parser.add_argument("--model", type=str, help="Name of the model file") - arg_parser.add_argument("--inputs", type=str, help="List of input shapes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT") + arg_parser.add_argument( + "--inputs", + type=str, + help="List of input shapes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT", + ) arg_parser.add_argument("--batch_size", type=int, default=1, help="Batch size to build and run") - arg_parser.add_argument("--precision", default="fp32", type=str, help="Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16") + arg_parser.add_argument( + "--precision", + default="fp32", + type=str, + help="Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16", + ) arg_parser.add_argument("--calibration_cache", type=str, help="Name of the calibration cache file") arg_parser.add_argument("--device", type=int, help="device id") - arg_parser.add_argument("--truncate", action='store_true', help="Truncate long and double weights in the network in Torch-TensorRT") - arg_parser.add_argument("--is_trt_engine", action='store_true', help="Boolean flag to determine if the user provided model is a TRT engine or not") + arg_parser.add_argument( + "--truncate", action="store_true", help="Truncate long and double weights in the network in Torch-TensorRT" + ) + arg_parser.add_argument( + "--is_trt_engine", + action="store_true", + help="Boolean flag to determine if the user provided model is a TRT engine or not", + ) arg_parser.add_argument("--report", type=str, help="Path of the output file where performance summary is written.") args = arg_parser.parse_args() @@ -326,57 +360,67 @@ def load_model(params): model, model_name, is_trt_engine = load_model(params) # Default device is set to 0. Configurable using yaml config file. 
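
A minimal, self-contained sketch of the tensor-construction step in the YAML-driven branch below (illustrative only; it assumes a CUDA-capable GPU and inlines the dtype mapping from the new `tools/perf/utils.py`):

```python
import torch

# Mirrors precision_to_dtype() from tools/perf/utils.py: unknown strings fall back to fp32
def precision_to_dtype(pr):
    return {"fp32": torch.float, "fp16": torch.half, "half": torch.half,
            "int32": torch.int32, "bool": torch.bool}.get(pr, torch.float32)

# One entry from the YAML 'input' section, e.g. input0: [1, 3, 224, 224]
inp_tensor = [1, 3, 224, 224]
precision = "fp16"

# Same construction as the loop below: random integer data in the requested dtype, moved to GPU
input_tensor = torch.randint(0, 2, tuple(d for d in inp_tensor), dtype=precision_to_dtype(precision)).cuda()
print(input_tensor.shape, input_tensor.dtype)  # torch.Size([1, 3, 224, 224]) torch.float16
```
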
- torch.cuda.set_device(params.get('runtime').get('device', 0)) + torch.cuda.set_device(params.get("runtime").get("device", 0)) - num_input = params.get('input').get('num_inputs') - truncate_long_and_double = params.get('runtime').get('truncate_long_and_double', False) - batch_size = params.get('input').get('batch_size', 1) - for precision in params.get('runtime').get('precision', 'fp32'): + num_input = params.get("input").get("num_inputs") + truncate_long_and_double = params.get("runtime").get("truncate_long_and_double", False) + batch_size = params.get("input").get("batch_size", 1) + for precision in params.get("runtime").get("precision", "fp32"): input_tensors = [] - num_input = params.get('input').get('num_inputs', 1) + num_input = params.get("input").get("num_inputs", 1) for i in range(num_input): - inp_tensor = params.get('input').get('input' + str(i)) - input_tensors.append(torch.randint(0, 2, tuple(d for d in inp_tensor), dtype=precision_to_dtype(precision)).cuda()) + inp_tensor = params.get("input").get("input" + str(i)) + input_tensors.append( + torch.randint(0, 2, tuple(d for d in inp_tensor), dtype=precision_to_dtype(precision)).cuda() + ) if is_trt_engine: - print("Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results") + print( + "Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results" + ) if not is_trt_engine and (precision == "fp16" or precision == "half"): # If model is TensorRT serialized engine then model.half will report failure model = model.half() - backends = params.get('backend') + backends = params.get("backend") # Run inference - status = run(model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) + status = run( + model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine + ) else: params = vars(args) - model_name = params['model'] + model_name = params["model"] if os.path.exists(model_name): print("Loading user provided model: ", model_name) model = torch.jit.load(model_name).cuda().eval() elif model_name in BENCHMARK_MODELS: - model = BENCHMARK_MODELS[model_name]['model'].eval().cuda() + model = BENCHMARK_MODELS[model_name]["model"].eval().cuda() else: - raise ValueError("Invalid model name. Please provide a torchscript model file or model name (among the following options vgg16|resnet50|efficientnet_b0|vit)") + raise ValueError( + "Invalid model name. 
Please provide a torchscript model file or model name (among the following options vgg16|resnet50|efficientnet_b0|vit)" + ) - backends = parse_backends(params['backends']) - truncate_long_and_double = params['truncate'] - batch_size = params['batch_size'] - is_trt_engine = params['is_trt_engine'] - precisions = parse_precisions(params['precision']) + backends = parse_backends(params["backends"]) + truncate_long_and_double = params["truncate"] + batch_size = params["batch_size"] + is_trt_engine = params["is_trt_engine"] + precisions = parse_precisions(params["precision"]) for precision in precisions: - input_tensors = parse_inputs(params['inputs'], precision_to_dtype(precision)) + input_tensors = parse_inputs(params["inputs"], precision_to_dtype(precision)) if not is_trt_engine and (precision == "fp16" or precision == "half"): # If model is TensorRT serialized engine then model.half will report failure model = model.half() - status = run(model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine) + status = run( + model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine + ) # Generate report - print('Model Summary: ', model_name) + print("Model Summary: ", model_name) summary = pd.DataFrame(results) print(summary) - with open(args.report, 'w') as file: - file.write('Model Summary: ' + model_name + '\n') + with open(args.report, "w") as file: + file.write("Model Summary: " + model_name + "\n") file.write(summary.to_string()) file.close() diff --git a/tools/perf/utils.py b/tools/perf/utils.py index 3b38a89c8c..d7870d90fe 100644 --- a/tools/perf/utils.py +++ b/tools/perf/utils.py @@ -5,55 +5,44 @@ import timm BENCHMARK_MODELS = { - "vgg16": { - "model": models.vgg16(pretrained=True), - "path": "script" - }, - "resnet50": { - "model": torch.hub.load('pytorch/vision:v0.9.0', 'resnet50', pretrained=True), - "path": "script" - }, - "efficientnet_b0": { - "model": timm.create_model('efficientnet_b0', pretrained=True), - "path": "script" - }, - "vit": { - "model": timm.create_model('vit_base_patch16_224', pretrained=True), - "path": "script" - }, - "bert_base_uncased": { - "model": cm.BertModule(), - "path": "trace" - }, + "vgg16": {"model": models.vgg16(pretrained=True), "path": "script"}, + "resnet50": {"model": torch.hub.load("pytorch/vision:v0.9.0", "resnet50", pretrained=True), "path": "script"}, + "efficientnet_b0": {"model": timm.create_model("efficientnet_b0", pretrained=True), "path": "script"}, + "vit": {"model": timm.create_model("vit_base_patch16_224", pretrained=True), "path": "script"}, + "bert_base_uncased": {"model": cm.BertModule(), "path": "trace"}, } + def precision_to_dtype(pr): - if pr == 'fp32': + if pr == "fp32": return torch.float - elif pr == 'fp16' or pr == 'half': + elif pr == "fp16" or pr == "half": return torch.half - elif pr == 'int32': + elif pr == "int32": return torch.int32 - elif pr == 'bool': + elif pr == "bool": return torch.bool else: return torch.float32 + def parse_inputs(user_inputs, dtype): - parsed_inputs = user_inputs.split(';') + parsed_inputs = user_inputs.split(";") torchtrt_inputs = [] for input in parsed_inputs: input_shape = [] - input_shape_and_dtype = input.split('@') + input_shape_and_dtype = input.split("@") dtype = precision_to_dtype(input_shape_and_dtype[1]) if len(input_shape_and_dtype) == 2 else dtype - for input_dim in input_shape_and_dtype[0][1:-1].split(','): + for input_dim in input_shape_and_dtype[0][1:-1].split(","): 
input_shape.append(int(input_dim)) torchtrt_inputs.append(torch.randint(0, 5, input_shape, dtype=dtype).cuda()) return torchtrt_inputs + def parse_backends(backends): - return backends.split(',') + return backends.split(",") + def parse_precisions(precisions): - return precisions.split(',') + return precisions.split(",") From 3a14f23afba1ed0696c316ecec81fcaa1ee4eb2e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 19 Aug 2022 17:28:10 -0700 Subject: [PATCH 10/13] chore: rebase and minor changes Signed-off-by: Dheeraj Peri --- tools/perf/README.md | 9 +++++++++ tools/perf/benchmark.sh | 10 +++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/perf/README.md b/tools/perf/README.md index d430d2b234..45630b4f29 100644 --- a/tools/perf/README.md +++ b/tools/perf/README.md @@ -127,3 +127,12 @@ Eg: --backends torch,torch_tensorrt,tensorrt \ --report "vgg_perf_bs1.txt" ``` + +### Example models + +This tool benchmarks any pytorch model or torchscript module. As an example, we provide VGG16, Resnet50, EfficientNet-B0, VIT, HF-BERT models in `hub.py` that we internally test for performance. +The torchscript modules for these models can be generated by running +``` +python hub.py +``` +You can refer to `benchmark.sh` on how we run/benchmark these models. diff --git a/tools/perf/benchmark.sh b/tools/perf/benchmark.sh index cf659cc86a..b84061025d 100644 --- a/tools/perf/benchmark.sh +++ b/tools/perf/benchmark.sh @@ -9,7 +9,7 @@ batch_sizes=(1 2 4 8 16 32 64 128 256) #Benchmark VGG16 model echo "Benchmarking VGG16 model" -for bs in batch_sizes +for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/vgg16_scripted.jit.pt \ --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ @@ -20,7 +20,7 @@ done # Benchmark Resnet50 model echo "Benchmarking Resnet50 model" -for bs in batch_sizes +for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/resnet50_scripted.jit.pt \ --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ @@ -31,7 +31,7 @@ done # Benchmark VIT model echo "Benchmarking VIT model" -for bs in batch_sizes +for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/vit_scripted.jit.pt \ --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ @@ -42,7 +42,7 @@ done # Benchmark EfficientNet-B0 model echo "Benchmarking EfficientNet-B0 model" -for bs in batch_sizes +for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/efficientnet_b0_scripted.jit.pt \ --precision fp32,fp16 --inputs="(${bs}, 3, 224, 224)" \ @@ -53,7 +53,7 @@ done # Benchmark BERT model echo "Benchmarking Huggingface BERT base model" -for bs in batch_sizes +for bs in ${batch_sizes[@]} do python perf_run.py --model ${MODELS_DIR}/bert_base_uncased_traced.jit.pt \ --precision fp32 --inputs="(${bs}, 128)@int32;(${bs}, 128)@int32" \ From d5dbc4d51b2b9bebcbb0b693e3ffbc9d2a1aacd8 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 19 Aug 2022 17:48:03 -0700 Subject: [PATCH 11/13] chore: Fix reporting to a file setting Signed-off-by: Dheeraj Peri --- tools/perf/perf_run.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 254a4a5b88..f82a1a517d 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -15,7 +15,7 @@ # Importing supported Backends import torch import torch_tensorrt as torchtrt -from torch_tensorrt.fx.lower import lower_to_trt +from torch_tensorrt.fx.lower import compile from torch_tensorrt.fx.utils import LowerPrecision import 
tensorrt as trt @@ -89,7 +89,7 @@ def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_an if precision == "int8": compile_settings.update({"calib": params.get("calibration_cache")}) - with torchtrt.logging.errors(): + with torchtrt.logging.debug(): model = torchtrt.compile(model, **compile_settings) iters = params.get("iterations", 20) @@ -123,7 +123,7 @@ def run_fx2trt(model, input_tensors, params, precision, batch_size): model.half() input_tensors = [tensor.half() for tensor in input_tensors] # Run lowering eager mode benchmark - model = lower_to_trt( + model = compile( model, input_tensors, max_batch_size=batch_size, @@ -430,7 +430,8 @@ def load_model(params): print("Model Summary: ", model_name) summary = pd.DataFrame(results) print(summary) - with open(args.report, "w") as file: - file.write("Model Summary: " + model_name + "\n") - file.write(summary.to_string()) - file.close() + if args.report: + with open(args.report, "w") as file: + file.write("Model Summary: " + model_name + "\n") + file.write(summary.to_string()) + file.close() From 2186177fe073dcca47bdce7b4adc80e3a4938f8d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 8 Sep 2022 10:44:51 -0700 Subject: [PATCH 12/13] chore: minor fixes Signed-off-by: Dheeraj Peri --- tools/perf/perf_run.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index f82a1a517d..53008dc67a 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -89,8 +89,7 @@ def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_an if precision == "int8": compile_settings.update({"calib": params.get("calibration_cache")}) - with torchtrt.logging.debug(): - model = torchtrt.compile(model, **compile_settings) + model = torchtrt.compile(model, **compile_settings) iters = params.get("iterations", 20) # Warm up @@ -292,8 +291,8 @@ def recordStats(backend, timings, precision, batch_size=1): "Batch size": batch_size, "Median(FPS)": speed_med, "Mean(FPS)": speed_mean, - "Median-Latency(ms)": time_med, - "Mean-Latency(ms)": time_mean, + "Median-Latency(ms)": time_med*1000, + "Mean-Latency(ms)": time_mean*1000, } results.append(stats) From 77543a02fb6dfba821c0529ed81e3cf1d397e2a7 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 8 Sep 2022 10:53:12 -0700 Subject: [PATCH 13/13] chore: Linter fixes Signed-off-by: Dheeraj Peri --- tools/perf/hub.py | 20 ++++-- tools/perf/perf_run.py | 136 ++++++++++++++++++++++++++++++++++------- tools/perf/utils.py | 21 +++++-- 3 files changed, 147 insertions(+), 30 deletions(-) diff --git a/tools/perf/hub.py b/tools/perf/hub.py index c209c64ecc..e54734f8a1 100644 --- a/tools/perf/hub.py +++ b/tools/perf/hub.py @@ -14,7 +14,9 @@ # Detect case of no GPU before deserialization of models on GPU if not torch.cuda.is_available(): - raise Exception("No GPU found. Please check if installed torch version is compatible with CUDA version") + raise Exception( + "No GPU found. 
Please check if installed torch version is compatible with CUDA version" + ) # Downloads all model files again if manifest file is not present MANIFEST_FILE = "model_manifest.json" @@ -22,8 +24,14 @@ BENCHMARK_MODELS = { "vgg16": {"model": models.vgg16(weights=None), "path": "script"}, "resnet50": {"model": models.resnet50(weights=None), "path": "script"}, - "efficientnet_b0": {"model": timm.create_model("efficientnet_b0", pretrained=True), "path": "script"}, - "vit": {"model": timm.create_model("vit_base_patch16_224", pretrained=True), "path": "script"}, + "efficientnet_b0": { + "model": timm.create_model("efficientnet_b0", pretrained=True), + "path": "script", + }, + "vit": { + "model": timm.create_model("vit_base_patch16_224", pretrained=True), + "path": "script", + }, "bert_base_uncased": {"model": cm.BertModule(), "path": "trace"}, } @@ -66,7 +74,11 @@ def download_models(version_matches, manifest): traced_filename = "models/" + n + "_traced.jit.pt" # Check if model file exists on disk if ( - (m["path"] == "both" and os.path.exists(scripted_filename) and os.path.exists(traced_filename)) + ( + m["path"] == "both" + and os.path.exists(scripted_filename) + and os.path.exists(traced_filename) + ) or (m["path"] == "script" and os.path.exists(scripted_filename)) or (m["path"] == "trace" and os.path.exists(traced_filename)) ): diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py index 53008dc67a..fbdf3b6c40 100644 --- a/tools/perf/perf_run.py +++ b/tools/perf/perf_run.py @@ -19,7 +19,13 @@ from torch_tensorrt.fx.utils import LowerPrecision import tensorrt as trt -from utils import parse_inputs, parse_backends, precision_to_dtype, parse_precisions, BENCHMARK_MODELS +from utils import ( + parse_inputs, + parse_backends, + precision_to_dtype, + parse_precisions, + BENCHMARK_MODELS, +) WARMUP_ITER = 10 results = [] @@ -45,7 +51,8 @@ def get(self, key, default_value=None): if not key in self.params: if not default_value: raise ValueError( - "Key {} is not present and default_value is not configured. Please run it with default value", key + "Key {} is not present and default_value is not configured. 
Please run it with default value", + key, ) self.params[key] = default_value return self.params[key] @@ -77,8 +84,15 @@ def run_torch(model, input_tensors, params, precision, batch_size): # Runs inference using Torch-TensorRT backend -def run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size): - print("Running Torch-TensorRT for precision: ", precision, " batch_size : ", batch_size) +def run_torch_tensorrt( + model, input_tensors, params, precision, truncate_long_and_double, batch_size +): + print( + "Running Torch-TensorRT for precision: ", + precision, + " batch_size : ", + batch_size, + ) # Compiling Torch-TensorRT model compile_settings = { "inputs": input_tensors, @@ -176,7 +190,13 @@ def torch_device_from_trt(device): def run_tensorrt( - model, input_tensors, params, precision, truncate_long_and_double=False, is_trt_engine=False, batch_size=1 + model, + input_tensors, + params, + precision, + truncate_long_and_double=False, + is_trt_engine=False, + batch_size=1, ): engine = None @@ -237,7 +257,14 @@ def run_tensorrt( # Deploys inference run for different backend configurations def run( - model, backends, input_tensors, params, precision, truncate_long_and_double=False, batch_size=1, is_trt_engine=False + model, + backends, + input_tensors, + params, + precision, + truncate_long_and_double=False, + batch_size=1, + is_trt_engine=False, ): for backend in backends: if precision == "int8": @@ -257,20 +284,50 @@ def run( if backend == "all": run_torch(model, input_tensors, params, precision, batch_size) - run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size) - run_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, is_trt_engine, batch_size) + run_torch_tensorrt( + model, + input_tensors, + params, + precision, + truncate_long_and_double, + batch_size, + ) + run_tensorrt( + model, + input_tensors, + params, + precision, + truncate_long_and_double, + is_trt_engine, + batch_size, + ) elif backend == "torch": run_torch(model, input_tensors, params, precision, batch_size) elif backend == "torch_tensorrt": - run_torch_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, batch_size) + run_torch_tensorrt( + model, + input_tensors, + params, + precision, + truncate_long_and_double, + batch_size, + ) elif backend == "fx2trt": run_fx2trt(model, input_tensors, params, precision, batch_size) elif backend == "tensorrt": - run_tensorrt(model, input_tensors, params, precision, truncate_long_and_double, is_trt_engine, batch_size) + run_tensorrt( + model, + input_tensors, + params, + precision, + truncate_long_and_double, + is_trt_engine, + batch_size, + ) # Generate report @@ -291,8 +348,8 @@ def recordStats(backend, timings, precision, batch_size=1): "Batch size": batch_size, "Median(FPS)": speed_med, "Mean(FPS)": speed_mean, - "Median-Latency(ms)": time_med*1000, - "Mean-Latency(ms)": time_mean*1000, + "Median-Latency(ms)": time_med * 1000, + "Mean-Latency(ms)": time_mean * 1000, } results.append(stats) @@ -330,7 +387,9 @@ def load_model(params): ) # The following options are manual user provided settings arg_parser.add_argument( - "--backends", type=str, help="Comma separated string of backends. Eg: torch,torch_tensorrt,fx2trt,tensorrt" + "--backends", + type=str, + help="Comma separated string of backends. 
Eg: torch,torch_tensorrt,fx2trt,tensorrt", ) arg_parser.add_argument("--model", type=str, help="Name of the model file") arg_parser.add_argument( @@ -338,24 +397,34 @@ def load_model(params): type=str, help="List of input shapes. Eg: (1, 3, 224, 224)@fp32 for Resnet or (1, 128)@int32;(1, 128)@int32 for BERT", ) - arg_parser.add_argument("--batch_size", type=int, default=1, help="Batch size to build and run") + arg_parser.add_argument( + "--batch_size", type=int, default=1, help="Batch size to build and run" + ) arg_parser.add_argument( "--precision", default="fp32", type=str, help="Comma separated list of precisions to build TensorRT engine Eg: fp32,fp16", ) - arg_parser.add_argument("--calibration_cache", type=str, help="Name of the calibration cache file") + arg_parser.add_argument( + "--calibration_cache", type=str, help="Name of the calibration cache file" + ) arg_parser.add_argument("--device", type=int, help="device id") arg_parser.add_argument( - "--truncate", action="store_true", help="Truncate long and double weights in the network in Torch-TensorRT" + "--truncate", + action="store_true", + help="Truncate long and double weights in the network in Torch-TensorRT", ) arg_parser.add_argument( "--is_trt_engine", action="store_true", help="Boolean flag to determine if the user provided model is a TRT engine or not", ) - arg_parser.add_argument("--report", type=str, help="Path of the output file where performance summary is written.") + arg_parser.add_argument( + "--report", + type=str, + help="Path of the output file where performance summary is written.", + ) args = arg_parser.parse_args() cudnn.benchmark = True @@ -372,7 +441,9 @@ def load_model(params): torch.cuda.set_device(params.get("runtime").get("device", 0)) num_input = params.get("input").get("num_inputs") - truncate_long_and_double = params.get("runtime").get("truncate_long_and_double", False) + truncate_long_and_double = params.get("runtime").get( + "truncate_long_and_double", False + ) batch_size = params.get("input").get("batch_size", 1) for precision in params.get("runtime").get("precision", "fp32"): input_tensors = [] @@ -380,7 +451,12 @@ def load_model(params): for i in range(num_input): inp_tensor = params.get("input").get("input" + str(i)) input_tensors.append( - torch.randint(0, 2, tuple(d for d in inp_tensor), dtype=precision_to_dtype(precision)).cuda() + torch.randint( + 0, + 2, + tuple(d for d in inp_tensor), + dtype=precision_to_dtype(precision), + ).cuda() ) if is_trt_engine: @@ -395,7 +471,14 @@ def load_model(params): backends = params.get("backend") # Run inference status = run( - model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine + model, + backends, + input_tensors, + params, + precision, + truncate_long_and_double, + batch_size, + is_trt_engine, ) else: params = vars(args) @@ -417,12 +500,21 @@ def load_model(params): precisions = parse_precisions(params["precision"]) for precision in precisions: - input_tensors = parse_inputs(params["inputs"], precision_to_dtype(precision)) + input_tensors = parse_inputs( + params["inputs"], precision_to_dtype(precision) + ) if not is_trt_engine and (precision == "fp16" or precision == "half"): # If model is TensorRT serialized engine then model.half will report failure model = model.half() status = run( - model, backends, input_tensors, params, precision, truncate_long_and_double, batch_size, is_trt_engine + model, + backends, + input_tensors, + params, + precision, + truncate_long_and_double, + batch_size, + 
is_trt_engine, ) # Generate report diff --git a/tools/perf/utils.py b/tools/perf/utils.py index d7870d90fe..3d63dcd4b7 100644 --- a/tools/perf/utils.py +++ b/tools/perf/utils.py @@ -6,9 +6,18 @@ BENCHMARK_MODELS = { "vgg16": {"model": models.vgg16(pretrained=True), "path": "script"}, - "resnet50": {"model": torch.hub.load("pytorch/vision:v0.9.0", "resnet50", pretrained=True), "path": "script"}, - "efficientnet_b0": {"model": timm.create_model("efficientnet_b0", pretrained=True), "path": "script"}, - "vit": {"model": timm.create_model("vit_base_patch16_224", pretrained=True), "path": "script"}, + "resnet50": { + "model": torch.hub.load("pytorch/vision:v0.9.0", "resnet50", pretrained=True), + "path": "script", + }, + "efficientnet_b0": { + "model": timm.create_model("efficientnet_b0", pretrained=True), + "path": "script", + }, + "vit": { + "model": timm.create_model("vit_base_patch16_224", pretrained=True), + "path": "script", + }, "bert_base_uncased": {"model": cm.BertModule(), "path": "trace"}, } @@ -32,7 +41,11 @@ def parse_inputs(user_inputs, dtype): for input in parsed_inputs: input_shape = [] input_shape_and_dtype = input.split("@") - dtype = precision_to_dtype(input_shape_and_dtype[1]) if len(input_shape_and_dtype) == 2 else dtype + dtype = ( + precision_to_dtype(input_shape_and_dtype[1]) + if len(input_shape_and_dtype) == 2 + else dtype + ) for input_dim in input_shape_and_dtype[0][1:-1].split(","): input_shape.append(int(input_dim)) torchtrt_inputs.append(torch.randint(0, 5, input_shape, dtype=dtype).cuda())
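
For reference, a rough usage sketch of the helpers defined in `tools/perf/utils.py` above (illustrative only; importing `utils` also instantiates the models in `BENCHMARK_MODELS`, so it assumes the prerequisites — torch, torch_tensorrt, torchvision, timm, transformers — and a CUDA device are available):

```python
import torch
from utils import parse_inputs, parse_backends, parse_precisions

# "(shape)@dtype" entries separated by ';' become one random CUDA tensor each
tensors = parse_inputs("(1, 128)@int32;(1, 128)@int32", torch.float32)
print([t.shape for t in tensors])  # [torch.Size([1, 128]), torch.Size([1, 128])]
print([t.dtype for t in tensors])  # [torch.int32, torch.int32]

# Comma-separated CLI strings become plain Python lists
print(parse_backends("torch,torch_tensorrt,tensorrt"))  # ['torch', 'torch_tensorrt', 'tensorrt']
print(parse_precisions("fp32,fp16"))                    # ['fp32', 'fp16']
```
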