@@ -78,6 +78,10 @@ def run_torch_tensorrt(model, input_tensors, params, precision):
7878 "inputs" : input_tensors ,
7979 "enabled_precisions" : {precision_to_dtype (precision )}
8080 }
81+
82+ if precision == 'int8' :
83+ compile_settings .update ({"calib" : params .get ('calibration_cache' )})
84+
8185
8286 model = torchtrt .compile (model , ** compile_settings )
8387
@@ -166,26 +170,35 @@ def run_tensorrt(model, input_tensors, params, precision, is_trt_engine=False):
166170 k += 1
167171
168172 timings = []
169- with torch .no_grad ():
170- with engine .create_execution_context () as context :
171- for i in range (WARMUP_ITER ):
172- context .execute_async (batch_size , bindings , torch .cuda .current_stream ().cuda_stream )
173- torch .cuda .synchronize ()
174-
175- for i in range (iters ):
176- start_time = timeit .default_timer ()
177- context .execute_async (batch_size , bindings , torch .cuda .current_stream ().cuda_stream )
178- torch .cuda .synchronize ()
179- end_time = timeit .default_timer ()
180- meas_time = end_time - start_time
181- timings .append (meas_time )
182- print ("Iterations {}: {:.6f} s" .format (i , end_time - start_time ))
173+ with engine .create_execution_context () as context :
174+ for i in range (WARMUP_ITER ):
175+ context .execute_async (batch_size , bindings , torch .cuda .current_stream ().cuda_stream )
176+ torch .cuda .synchronize ()
177+
178+ for i in range (iters ):
179+ start_time = timeit .default_timer ()
180+ context .execute_async (batch_size , bindings , torch .cuda .current_stream ().cuda_stream )
181+ torch .cuda .synchronize ()
182+ end_time = timeit .default_timer ()
183+ meas_time = end_time - start_time
184+ timings .append (meas_time )
185+ print ("Iterations {}: {:.6f} s" .format (i , end_time - start_time ))
183186
184187 printStats ("TensorRT" , timings , precision )
185188
186189# Deploys inference run for different backend configurations
187190def run (model , input_tensors , params , precision , is_trt_engine = False ):
188191 for backend in params .get ('backend' ):
192+
193+ if precision == 'int8' :
194+ if backend == 'all' or backend == 'torch' :
195+ print ("int8 precision is not supported for torch runtime in this script yet" )
196+ return False
197+
198+ if (backend == 'all' or backend == 'torch_tensorrt' ) and params .get ('calibration_cache' , None ) is None :
199+ print ("int8 precision expects calibration cache file for inference" )
200+ return False
201+
189202 if backend == 'all' :
190203 run_torch (model , input_tensors , params , precision )
191204 run_torch_tensorrt (model , input_tensors , params , precision )
@@ -280,20 +293,25 @@ def load_model(params):
280293 # Create random input tensor of certain size
281294 torch .manual_seed (12345 )
282295
283- num_input = params .get ('input' ).get ('num_of_input' )
296+ num_input = params .get ('input' ).get ('num_inputs' )
284297 for precision in params .get ('runtime' ).get ('precision' , 'fp32' ):
285298 input_tensors = []
286- num_input = params .get ('input' ).get ('num_of_input' , 1 )
299+ num_input = params .get ('input' ).get ('num_inputs' , 1 )
287300 for i in range (num_input ):
288301 inp_tensor = params .get ('input' ).get ('input' + str (i ))
289302 input_tensors .append (torch .randint (0 , 2 , tuple (d for d in inp_tensor ), dtype = precision_to_dtype (precision )).cuda ())
290303
304+ if is_trt_engine :
305+ print ("Warning, TensorRT engine file is configured. Please make sure the precision matches with the TRT engine for reliable results" )
306+
291307 if not is_trt_engine and (precision == "fp16" or precision == "half" ):
292308 # If model is TensorRT serialized engine then model.half will report failure
293309 model = model .half ()
294310
295311 # Run inference
296- run (model , input_tensors , params , precision , is_trt_engine )
312+ status = run (model , input_tensors , params , precision , is_trt_engine )
313+ if status == False :
314+ continue
297315
298316 # Generate report
299317 print ('Model Summary:' )
0 commit comments