Merge pull request #2513 from pytorch/cherry_picks_rel_2_1

gs-olive · web-flow · commit 9b13f101e057 · 2023-12-05T16:40:29.000-08:00
cherry-pick: Perf + Bugfix PRs
diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py
@@ -1,10 +1,16 @@
 import math
 from typing import Optional
 
+import numpy as np
+import tensorrt as trt
 from torch.fx.node import Target
 from torch_tensorrt.dynamo._SourceIR import SourceIR
+from torch_tensorrt.dynamo.conversion import impl
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
-from torch_tensorrt.dynamo.conversion.converter_utils import get_positive_dim
+from torch_tensorrt.dynamo.conversion.converter_utils import (
+    get_positive_dim,
+    get_trt_tensor,
+)
 from torch_tensorrt.dynamo.conversion.impl.slice.base import slice
 from torch_tensorrt.fx.converters.converter_utils import (
     has_dynamic_shape,
@@ -96,3 +102,98 @@ def expand(
     layer = ctx.net.add_slice(input_t, start=start, shape=shape, stride=stride)
     set_layer_name(layer, target, name, source_ir)
     return layer.get_output(0)
+
+
+def chunk(
+    ctx: ConversionContext,
+    target: Target,
+    source_ir: Optional[SourceIR],
+    name: str,
+    input: TRTTensor,
+    chunks: int,
+    dim: int,
+) -> TRTTensor:
+    """Split ``input`` along ``dim`` into consecutive slices.
+
+    Every slice spans ``ceil(size / chunks)`` elements of ``dim`` except the
+    last one, which holds whatever remains. Fewer than ``chunks`` slices may
+    therefore be produced.
+
+    Args:
+        ctx: Conversion context, forwarded to ``slice_op``.
+        target: FX target, forwarded to ``slice_op``.
+        source_ir: Source IR, forwarded to ``slice_op``.
+        name: Base name; slice ``i`` is named ``f"{name}_slice_{i}"``.
+        input: Tensor to split.
+        chunks: Requested number of chunks; must be greater than 0.
+        dim: Dimension to split along; normalized with ``get_positive_dim``.
+
+    Returns:
+        A list holding one ``slice_op`` result per chunk, in order. NOTE: the
+        ``-> TRTTensor`` annotation does not reflect that a list is returned.
+
+    Raises:
+        RuntimeError: If ``chunks`` is not positive, if ``dim`` is out of
+            range, or if the size of ``dim`` is dynamic (-1).
+    """
+    if chunks <= 0:
+        raise RuntimeError(
+            f"chunk expects `chunks` to be greater than 0, got: {chunks}"
+        )
+
+    shape = input.shape
+    dim = get_positive_dim(dim, len(shape))
+
+    if dim >= len(shape):
+        raise RuntimeError(
+            f"chunk expects `dim` to be less than the length of input shape, got: {dim}"
+        )
+
+    size_dim = shape[dim]
+    # Raise rather than `assert`: asserts are stripped under `python -O`, and a
+    # size of -1 would then make the loop below silently return an empty list.
+    if size_dim == -1:
+        raise RuntimeError("Can't chunk on dynamic shape dimension!")
+
+    chunk_size = math.ceil(size_dim / chunks)
+    result = []
+    start = 0
+    end = min(start + chunk_size, size_dim)
+
+    # Walk [start, end) windows until the dimension is exhausted; the final
+    # window is clamped to `size_dim` and may be shorter than `chunk_size`.
+    while start < end:
+        result.append(
+            slice_op(
+                ctx,
+                target,
+                source_ir,
+                f"{name}_slice_{len(result)}",
+                input,
+                dim,
+                start,
+                end,
+                1,
+            )
+        )
+        start = end
+        end = min(start + chunk_size, size_dim)
+
+    return result
+
+
+def cumsum(
+    ctx: ConversionContext,
+    target: Target,
+    source_ir: Optional[SourceIR],
+    name: str,
+    input: TRTTensor,
+    dim: int,
+) -> TRTTensor:
+    """Build a TensorRT loop that accumulates ``input`` along ``dim``.
+
+    The loop runs once per element of ``dim``. Each iteration adds the current
+    slice of ``input`` to a recurrence that starts at zero, and the
+    per-iteration sums are concatenated back along ``dim``.
+
+    Args:
+        ctx: Conversion context whose ``net`` receives the loop layers.
+        target: FX target used when naming layers.
+        source_ir: Source IR used when naming layers.
+        name: Base name for the constants and layers created here.
+        input: Tensor to accumulate.
+        dim: Dimension to accumulate along; normalized with
+            ``get_positive_dim``.
+
+    Returns:
+        Output 0 of the loop-output layer.
+    """
+    dims = input.shape
+    scan_dim = get_positive_dim(dim, len(dims))
+    scan_loop = ctx.net.add_loop()
+
+    # The trip count is the extent of the scanned dimension, as a 0-d constant.
+    extent = np.array(dims[scan_dim])
+    trip_count = get_trt_tensor(ctx, extent, f"{name}_trip_limit")
+    scan_loop.add_trip_limit(trip_count, trt.TripLimit.COUNT)
+
+    # The iterator yields one slice of `input` along `scan_dim` per iteration.
+    step_slice = scan_loop.add_iterator(input, scan_dim, reverse=False).get_output(0)
+
+    # The recurrence starts from zeros shaped like a single iterator slice.
+    initial = get_trt_tensor(
+        ctx, np.zeros(tuple(step_slice.shape)), f"{name}_initial_value"
+    )
+    accumulator = scan_loop.add_recurrence(initial)
+    set_layer_name(accumulator, target, f"{name}_running_sum", source_ir)
+    carried = accumulator.get_output(0)
+
+    partial_sum = impl.elementwise.add(
+        ctx,
+        target,
+        source_ir,
+        f"{name}_elementwise_add",
+        step_slice,
+        carried,
+    )
+    # Feed this iteration's sum back in as the next iteration's carried value.
+    accumulator.set_input(1, partial_sum)
+
+    gathered = scan_loop.add_loop_output(
+        partial_sum, trt.LoopOutput.CONCATENATE, scan_dim
+    )
+    set_layer_name(gathered, target, f"{name}_loop_output", source_ir)
+    gathered.set_input(1, trip_count)
+    return gathered.get_output(0)
diff --git a/tests/py/dynamo/conversion/test_cumsum_aten.py b/tests/py/dynamo/conversion/test_cumsum_aten.py
@@ -0,0 +1,69 @@
+import torch
+import torch.nn as nn
+from parameterized import parameterized
+from torch.testing._internal.common_utils import run_tests
+
+from .harness import DispatchTestCase
+
+
+class TestCumsumConverter(DispatchTestCase):
+    """Converter tests for ``aten.cumsum`` on 1D, 2D and 3D random inputs."""
+
+    def _run_cumsum(self, shape, dim):
+        # Shared body: wrap the aten op in a module and hand it, together with
+        # one random input of the given shape, to the harness.
+        class Cumsum(nn.Module):
+            def forward(self, x):
+                return torch.ops.aten.cumsum.default(x, dim)
+
+        self.run_test(Cumsum(), [torch.randn(shape)])
+
+    @parameterized.expand([((1,), 0), ((2,), 0), ((3,), -1)])
+    def test_cumsum_1D(self, shape, dim):
+        self._run_cumsum(shape, dim)
+
+    @parameterized.expand([((3, 1), 0), ((3, 1), 1), ((2, 3), -1), ((2, 3), -2)])
+    def test_cumsum_2D(self, shape, dims):
+        self._run_cumsum(shape, dims)
+
+    @parameterized.expand(
+        [
+            ((4, 2, 3), 0),
+            ((4, 2, 3), 1),
+            ((1, 2, 3), 2),
+            ((1, 2, 3), -1),
+            ((1, 2, 3), -2),
+        ]
+    )
+    def test_cumsum_3D(self, shape, dims):
+        self._run_cumsum(shape, dims)
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/tools/perf/benchmark.sh b/tools/perf/benchmark.sh
@@ -7,8 +7,8 @@ python hub.py
 
 batch_sizes=(1 2 4 8 16 32 64 128 256)
 large_model_batch_sizes=(1 2 4 8 16 32 64)
-backends=("torch" "ts_trt" "dynamo" "torch_compile" "inductor")
-backends_no_torchscript=("torch" "dynamo" "torch_compile" "inductor")
+backends=("torch" "ts_trt" "dynamo" "torch_compile" "inductor" "tensorrt")
+backends_no_torchscript=("torch" "dynamo" "torch_compile" "inductor" "tensorrt")
 
 
 # Benchmark VGG16 model
diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
@@ -293,29 +293,30 @@ def run_tensorrt(
     input_tensors,
     params,
     precision,
-    is_trt_engine=False,
     batch_size=1,
 ):
-    engine = None
-
-    # If the model file is a TensorRT engine then directly deserialize and run inference
-    # else convert the torch module to a TensorRT engine first and then run inference
-    if not is_trt_engine:
-        compile_settings = {
-            "inputs": input_tensors,
-            "enabled_precisions": {precision_to_dtype(precision)},
-            "truncate_long_and_double": params.get("truncate", False),
-        }
-
-        print("Converting method to TensorRT engine...")
-        with torch.no_grad(), torchtrt.logging.errors():
-            model = torchtrt.ts.convert_method_to_trt_engine(
-                model, "forward", **compile_settings
-            )
-
+    # Export an ONNX model and convert to TRT
+    torch.onnx.export(model.eval().cuda(), tuple(input_tensors), "./tmp.onnx")
+    logger = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(logger)
+    network = builder.create_network(
+        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    )
+    parser = trt.OnnxParser(network, logger)
+    success = parser.parse_from_file("./tmp.onnx")
+    if not success:
+        raise ValueError("ONNX conversion failed")
+
+    config = builder.create_builder_config()
+    if precision == "fp16":
+        config.set_flag(trt.BuilderFlag.FP16)
+    start_compile = time.time_ns()
+    serialized_engine = builder.build_serialized_network(network, config)
+    end_compile = time.time_ns()
+    compile_time_s = (end_compile - start_compile) / 1e9
     # Deserialize the TensorRT engine
-    with trt.Logger() as logger, trt.Runtime(logger) as runtime:
-        engine = runtime.deserialize_cuda_engine(model)
+    with trt.Runtime(logger) as runtime:
+        engine = runtime.deserialize_cuda_engine(serialized_engine)
 
     print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size)
     iters = params.get("iterations", 20)
@@ -350,7 +351,7 @@ def run_tensorrt(
             meas_time = end_time - start_time
             timings.append(meas_time)
 
-    recordStats("TensorRT", timings, precision, batch_size)
+    recordStats("TensorRT", timings, precision, batch_size, compile_time_s)
 
 
 # Deploys inference run for different backend configurations
@@ -426,11 +427,10 @@ def run(
             )
         elif backend == "tensorrt":
             run_tensorrt(
-                model,
+                model_torch,
                 input_tensors,
                 params,
                 precision,
-                is_trt_engine,
                 batch_size,
             )
         elif backend == "dynamo":
@@ -439,9 +439,6 @@ def run(
         elif backend == "torch_compile":
             run_torch_compile(model_torch, input_tensors, params, precision, batch_size)
 
-        elif backend == "torch_compile":
-            run_torch_compile(model_torch, input_tensors, params, precision, batch_size)
-
         elif backend == "inductor":
             run_inductor(model_torch, input_tensors, params, precision, batch_size)
 
diff --git a/tools/perf/requirements.txt b/tools/perf/requirements.txt
@@ -1,7 +1,9 @@
 numpy
 argparse
 pyyaml
+onnx
 transformers==4.33.2
 diffusers==0.21.4
 pandas==2.0.1
 timm==0.9.8
+