fix: Add support for torch.int64 inputs in FXTRT

gs-olive · gs-olive · commit 5fe7c2380b61 · 2023-04-27T14:20:24.000-07:00
- Add utilities that let TRTModules accept `int64` inputs, casting them to
`int32` for TRT engine compatibility, to support multiple use cases
- Supported cases include split modules whose internal tensors are
`int64` (generally used for indexing torch Tensors)
- This also supports cases where the user wants to pass `long` tensors
directly as `forward` inputs
- Add test cases to verify functionality and accuracy
- Enable tests for `TRTModuleNext`, which are now fully supported on
`main`
diff --git a/py/torch_tensorrt/fx/test/core/test_trt_module.py b/py/torch_tensorrt/fx/test/core/test_trt_module.py
@@ -10,8 +10,8 @@
 from torch.testing._internal.common_utils import run_tests, TestCase
 from torch_tensorrt.fx import InputTensorSpec, TRTInterpreter, TRTModule
 
-# from torch_tensorrt import TRTModuleNext
-# from torch_tensorrt import Device
+from torch_tensorrt import TRTModuleNext
+from torch_tensorrt import Device
 from torch_tensorrt.fx.utils import LowerPrecision
 
 
@@ -58,89 +58,145 @@ def forward(self, x):
         )
 
 
-# TODO add unittest.skip later
-# class TestTRTModuleNext(TestCase):
-#     def test_save_and_load_trt_module(self):
-#         class TestModule(torch.nn.Module):
-#             def forward(self, x):
-#                 return x + x
-
-#         inputs = [torch.randn(1, 1)]
-#         mod = TestModule().eval()
-#         ref_output = mod(*inputs)
-
-#         mod = acc_tracer.trace(mod, inputs)
-
-#         interp = TRTInterpreter(
-#             mod,
-#             input_specs=InputTensorSpec.from_tensors(inputs),
-#             explicit_batch_dimension=True,
-#         )
-#         interp_res = interp.run(lower_precision=LowerPrecision.FP32)
-
-#         with io.BytesIO() as engine_bytes:
-#             engine_bytes.write(interp_res.engine.serialize())
-#             engine_str = engine_bytes.getvalue()
-
-#         trt_mod = TRTModuleNext(
-#             name="TestModule",
-#             serialized_engine=engine_str,
-#             input_binding_names=interp_res.input_names,
-#             output_binding_names=interp_res.output_names,
-#             target_device=Device(f"cuda:{torch.cuda.current_device()}"),
-#         )
-
-#         torch.save(trt_mod, "trt.pt")
-#         reload_trt_mod = torch.load("trt.pt")
-
-#         torch.testing.assert_allclose(
-#             reload_trt_mod(inputs[0].cuda()).cpu().reshape_as(ref_output),
-#             ref_output,
-#             rtol=1e-04,
-#             atol=1e-04,
-#         )
-#         os.remove(f"{os.getcwd()}/trt.pt")
-
-#     def test_save_and_load_state_dict(self):
-#         class TestModule(torch.nn.Module):
-#             def forward(self, x):
-#                 return x + x
-
-#         inputs = [torch.randn(1, 1)]
-#         mod = TestModule().eval()
-#         ref_output = mod(*inputs)
-
-#         mod = acc_tracer.trace(mod, inputs)
-#         interp = TRTInterpreter(
-#             mod,
-#             input_specs=InputTensorSpec.from_tensors(inputs),
-#             explicit_batch_dimension=True,
-#         )
-#         interp_res = interp.run(lower_precision=LowerPrecision.FP32)
-
-#         with io.BytesIO() as engine_bytes:
-#             engine_bytes.write(interp_res.engine.serialize())
-#             engine_str = engine_bytes.getvalue()
-
-#         trt_mod = TRTModuleNext(
-#             name="TestModule",
-#             serialized_engine=engine_str,
-#             input_binding_names=interp_res.input_names,
-#             output_binding_names=interp_res.output_names,
-#             target_device=Device(f"cuda:{torch.cuda.current_device()}"),
-#         )
-
-#         st = trt_mod.state_dict()
-
-#         new_trt_mod = TRTModuleNext()
-#         new_trt_mod.load_state_dict(st)
-
-#         torch.testing.assert_allclose(
-#             new_trt_mod(inputs[0].cuda()).cpu().reshape_as(ref_output),
-#             ref_output,
-#             rtol=1e-04,
-#             atol=1e-04,
-#         )
+class TestTRTModuleInt64Input(TestCase):
+    def test_save_and_load_trt_module(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                return x + x
+
+        inputs = [torch.randn(5, 5).long()]
+        mod = TestModule().eval()
+        ref_output = mod(*inputs)
+
+        mod = acc_tracer.trace(mod, inputs)
+        interp = TRTInterpreter(
+            mod,
+            input_specs=InputTensorSpec.from_tensors(inputs),
+        )
+        trt_mod = TRTModule(*interp.run(lower_precision=LowerPrecision.FP32))
+        torch.save(trt_mod, "trt.pt")
+        reload_trt_mod = torch.load("trt.pt")
+
+        torch.testing.assert_close(
+            reload_trt_mod(inputs[0].cuda()).cpu(),
+            ref_output,
+            rtol=1e-04,
+            atol=1e-04,
+            check_dtype=False,
+        )
+        os.remove(f"{os.getcwd()}/trt.pt")
+
+    def test_save_and_load_state_dict(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                return x + x
+
+        inputs = [torch.randn(5, 5).long()]
+        mod = TestModule().eval()
+        ref_output = mod(*inputs)
+
+        mod = acc_tracer.trace(mod, inputs)
+        interp = TRTInterpreter(
+            mod,
+            input_specs=InputTensorSpec.from_tensors(inputs),
+        )
+        trt_mod = TRTModule(*interp.run(lower_precision=LowerPrecision.FP32))
+        st = trt_mod.state_dict()
+
+        new_trt_mod = TRTModule()
+        new_trt_mod.load_state_dict(st)
+
+        torch.testing.assert_close(
+            new_trt_mod(inputs[0].cuda()).cpu(),
+            ref_output,
+            rtol=1e-04,
+            atol=1e-04,
+            check_dtype=False,
+        )
+
+
+class TestTRTModuleNext(TestCase):
+    def test_save_and_load_trt_module(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                return x + x
+
+        inputs = [torch.randn(1, 1)]
+        mod = TestModule().eval()
+        ref_output = mod(*inputs)
+
+        mod = acc_tracer.trace(mod, inputs)
+
+        interp = TRTInterpreter(
+            mod,
+            input_specs=InputTensorSpec.from_tensors(inputs),
+            explicit_batch_dimension=True,
+        )
+        interp_res = interp.run(lower_precision=LowerPrecision.FP32)
+
+        with io.BytesIO() as engine_bytes:
+            engine_bytes.write(interp_res.engine.serialize())
+            engine_str = engine_bytes.getvalue()
+
+        trt_mod = TRTModuleNext(
+            name="TestModule",
+            serialized_engine=engine_str,
+            input_binding_names=interp_res.input_names,
+            output_binding_names=interp_res.output_names,
+            target_device=Device(f"cuda:{torch.cuda.current_device()}"),
+        )
+
+        torch.save(trt_mod, "trt.pt")
+        reload_trt_mod = torch.load("trt.pt")
+
+        torch.testing.assert_allclose(
+            reload_trt_mod(inputs[0].cuda()).cpu().reshape_as(ref_output),
+            ref_output,
+            rtol=1e-04,
+            atol=1e-04,
+        )
+        os.remove(f"{os.getcwd()}/trt.pt")
+
+    def test_save_and_load_state_dict(self):
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                return x + x
+
+        inputs = [torch.randn(1, 1)]
+        mod = TestModule().eval()
+        ref_output = mod(*inputs)
+
+        mod = acc_tracer.trace(mod, inputs)
+        interp = TRTInterpreter(
+            mod,
+            input_specs=InputTensorSpec.from_tensors(inputs),
+            explicit_batch_dimension=True,
+        )
+        interp_res = interp.run(lower_precision=LowerPrecision.FP32)
+
+        with io.BytesIO() as engine_bytes:
+            engine_bytes.write(interp_res.engine.serialize())
+            engine_str = engine_bytes.getvalue()
+
+        trt_mod = TRTModuleNext(
+            name="TestModule",
+            serialized_engine=engine_str,
+            input_binding_names=interp_res.input_names,
+            output_binding_names=interp_res.output_names,
+            target_device=Device(f"cuda:{torch.cuda.current_device()}"),
+        )
+
+        st = trt_mod.state_dict()
+
+        new_trt_mod = TRTModuleNext()
+        new_trt_mod.load_state_dict(st)
+
+        torch.testing.assert_allclose(
+            new_trt_mod(inputs[0].cuda()).cpu().reshape_as(ref_output),
+            ref_output,
+            rtol=1e-04,
+            atol=1e-04,
+        )
 
 
 if __name__ == "__main__":
diff --git a/py/torch_tensorrt/fx/trt_module.py b/py/torch_tensorrt/fx/trt_module.py
@@ -137,7 +137,6 @@ def forward(self, *inputs):
 
                 # This is only used when the trt engine is using implicit batch dim.
                 batch_size = inputs[0].shape[0]
-                contiguous_inputs: List[torch.Tensor] = [i.contiguous() for i in inputs]
                 bindings: List[Any] = [None] * (
                     len(self.input_names)
                     + len(self.output_names)
@@ -148,16 +147,27 @@ def forward(self, *inputs):
                     assert inputs[
                         i
                     ].is_cuda, f"{i}th input({input_name}) is not on cuda device."
+
+                    # Intercept int64 inputs to TRT Engines and cast them to int32
+                    if (
+                        inputs[i].dtype == torch.int64
+                        and self.input_dtypes[i] == torch.int32
+                    ):
+                        inputs = (
+                            inputs[:i] + (inputs[i].to(torch.int32),) + inputs[i + 1 :]
+                        )
+
                     assert (
                         inputs[i].dtype == self.input_dtypes[i]
                     ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {inputs[i].dtype}."
 
+                    contiguous_input = inputs[i].contiguous()
                     idx = self.input_binding_indices_in_order[i]
-                    bindings[idx] = contiguous_inputs[i].data_ptr()
+                    bindings[idx] = contiguous_input.data_ptr()
 
                     if not self.engine.has_implicit_batch_dimension:
                         self.context.set_binding_shape(
-                            idx, tuple(contiguous_inputs[i].shape)
+                            idx, tuple(contiguous_input.shape)
                         )
                     else:
                         assert inputs[i].size()[1:] == self.input_shapes[i], (
diff --git a/py/torch_tensorrt/fx/utils.py b/py/torch_tensorrt/fx/utils.py
@@ -5,6 +5,7 @@
 # @manual=//deeplearning/trt/python:py_tensorrt
 import tensorrt as trt
 import torch
+import logging
 from functorch import make_fx
 from functorch.experimental import functionalize
 from torch_tensorrt.fx.passes.lower_basic_pass import (
@@ -15,6 +16,9 @@
 from .types import Shape, TRTDataType
 
 
+_LOGGER: logging.Logger = logging.getLogger(__name__)
+
+
 class LowerPrecision(Enum):
     FP32 = "fp32"
     FP16 = "fp16"
@@ -37,6 +41,11 @@ def torch_dtype_to_trt(dtype: torch.dtype) -> TRTDataType:
         return trt.int8
     elif dtype == torch.int32:
         return trt.int32
+    elif dtype == torch.int64:
+        _LOGGER.warn(
+            "Detected Int64 Input, Casting to Int32 for TRT Engine Compatibility"
+        )
+        return trt.int32
     elif dtype == torch.float16:
         return trt.float16
     elif dtype == torch.float32: