 from __future__ import annotations
 
 import logging
+from contextlib import nullcontext
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import tensorrt as trt
 import torch
 from torch.nn import Module
+from torch_tensorrt._Device import Device
+from torch_tensorrt.dynamo.runtime.tools import _is_switch_required, _select_rt_device
 from torch_tensorrt.fx.utils import Frameworks, unified_dtype_converter
 
+import torch_tensorrt
+
 logger = logging.getLogger(__name__)
 
 
@@ -23,13 +28,22 @@ def __init__(
         engine: trt.ICudaEngine,
         input_names: Optional[List[str]] = None,
         output_names: Optional[List[str]] = None,
+        target_device: Device = Device._current_device(),
+        profiling_enabled: Optional[bool] = None,
     ):
         super(PythonTorchTensorRTModule, self).__init__()
         self._register_state_dict_hook(PythonTorchTensorRTModule._on_state_dict)
         self.engine = engine
         self.input_names = input_names if input_names is not None else []
         self.output_names = output_names if output_names is not None else []
         self.initialized = False
+        self.target_device_id = target_device.gpu_id
+        self.target_device_properties = torch.cuda.get_device_properties(
+            self.target_device_id
+        )
+        self.profiling_enabled = (
+            profiling_enabled if profiling_enabled is not None else False
+        )
         self._initialize()
 
     def _initialize(self) -> None:
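For reference, a minimal sketch of how the new constructor arguments might be passed. Here `engine` and the literal values are hypothetical stand-ins; `Device` is the class imported above:

```python
# Hypothetical sketch: `engine` stands in for an already-deserialized
# trt.ICudaEngine; parameter names follow the constructor in this diff.
from torch_tensorrt._Device import Device

module = PythonTorchTensorRTModule(
    engine,
    input_names=["x"],
    output_names=["output0"],
    target_device=Device(gpu_id=0),  # device the engine is expected to run on
    profiling_enabled=False,         # opt in to the record_function scopes below
)
```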
@@ -141,15 +155,41 @@ def __setstate__(self, state: Dict[str, Any]) -> None:
         if self.engine:
             self.context = self.engine.create_execution_context()
 
-    def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]:
+    def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
         with torch.autograd.profiler.record_function(
             "PythonTorchTensorRTModule:Forward"
-        ):
+        ) if self.profiling_enabled else nullcontext():
             self._check_initialized()
 
+            # If in safe mode, check at each iteration whether a device switch is required
+            if torch_tensorrt._compile.SAFE_MODE:
+                curr_device_id = torch.cuda.current_device()
+                curr_device_properties = torch.cuda.get_device_properties(
+                    curr_device_id
+                )
+                logger.debug(f"Current Device: cuda:{curr_device_id}")
+
+                # If a switch is required, move all inputs to the new device and set it as the active device
+                if _is_switch_required(
+                    curr_device_id,
+                    self.target_device_id,
+                    curr_device_properties,
+                    self.target_device_properties,
+                ):
+                    device_id, _ = _select_rt_device(
+                        curr_device_id,
+                        self.target_device_id,
+                        self.target_device_properties,
+                    )
+                    device = torch.device(device_id)
+                    torch.cuda.set_device(device_id)
+
+                    inputs = tuple(tensor.to(device) for tensor in inputs)
+                    logger.warning(f"Moved all input Tensors to cuda:{device_id}")
+
             with torch.autograd.profiler.record_function(
                 "PythonTorchTensorRTModule:ProcessInputs"
-            ):
+            ) if self.profiling_enabled else nullcontext():
                 assert len(inputs) == len(
                     self.input_names
                 ), f"Wrong number of inputs, expect {len(self.input_names)} get {len(inputs)}."
@@ -162,22 +202,24 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 )
 
                 for i, input_name in enumerate(self.input_names):
-                    if not contiguous_inputs[i].is_cuda:
-                        logger.warning(
-                            f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. "
-                            "This tensor is being moved by the runtime but for performance considerations, "
-                            "ensure your inputs are all on GPU and open an issue here "
-                            "(https://github.com/pytorch/TensorRT/issues) if this warning persists."
-                        )
-                        contiguous_inputs = (
-                            contiguous_inputs[:i]
-                            + [contiguous_inputs[i].cuda()]
-                            + contiguous_inputs[i + 1 :]
-                        )
-
-                    assert (
-                        contiguous_inputs[i].dtype == self.input_dtypes[i]
-                    ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
+                    # Check that the inputs are on cuda and have the correct data type if in safe mode
+                    if torch_tensorrt._compile.SAFE_MODE:
+                        if not contiguous_inputs[i].is_cuda:
+                            logger.warning(
+                                f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. "
+                                "This tensor is being moved by the runtime but for performance considerations, "
+                                "ensure your inputs are all on GPU and open an issue here "
+                                "(https://github.com/pytorch/TensorRT/issues) if this warning persists."
+                            )
+                            contiguous_inputs = (
+                                contiguous_inputs[:i]
+                                + [contiguous_inputs[i].cuda()]
+                                + contiguous_inputs[i + 1 :]
+                            )
+
+                        assert (
+                            contiguous_inputs[i].dtype == self.input_dtypes[i]
+                        ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
 
                     idx = self.input_binding_indices_in_order[i]
                     bindings[idx] = contiguous_inputs[i].data_ptr()
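The checks above read `torch_tensorrt._compile.SAFE_MODE` on every call. Assuming the flag is a plain module-level boolean, as these reads suggest, a caller could opt in like this (a sketch, not a confirmed public API):

```python
# Assumption: SAFE_MODE is a module-level boolean consulted on each forward
# call; setting it enables the per-iteration device checks and input
# validation introduced in this diff.
import torch_tensorrt

torch_tensorrt._compile.SAFE_MODE = True
```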
@@ -188,7 +230,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
             with torch.autograd.profiler.record_function(
                 "PythonTorchTensorRTModule:ProcessOutputs"
-            ):
+            ) if self.profiling_enabled else nullcontext():
                 # create output tensors
                 outputs: List[torch.Tensor] = []
 
@@ -215,7 +257,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]:
 
             with torch.autograd.profiler.record_function(
                 "PythonTorchTensorRTModule:TensorRTRuntime"
-            ):
+            ) if self.profiling_enabled else nullcontext():
                 self.context.execute_async_v2(
                     bindings, torch.cuda.current_stream().cuda_stream
                 )
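The `record_function(...) if self.profiling_enabled else nullcontext()` expression used in each block above swaps in a no-op context manager when profiling is off, so the hot path pays no profiler overhead. A standalone sketch of the same idiom:

```python
from contextlib import nullcontext

import torch

profiling_enabled = False  # toggled at runtime, as in the module above

# With profiling off, nullcontext() is entered instead of the profiler scope;
# with it on, the block is annotated in torch profiler traces.
with torch.autograd.profiler.record_function(
    "Example:Scope"
) if profiling_enabled else nullcontext():
    result = torch.ones(2, 2) @ torch.ones(2, 2)  # stand-in workload
```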
@@ -235,6 +277,8 @@ def enable_profiling(self, profiler: "trt.IProfiler" = None) -> None:
         if not self.context.profiler:
             self.context.profiler = trt.Profiler() if profiler is None else profiler
 
+        self.profiling_enabled = True
+
     def disable_profiling(self) -> None:
         """
         Disable TensorRT profiling.
@@ -244,6 +288,7 @@ def disable_profiling(self) -> None:
         torch.cuda.synchronize()
         del self.context
         self.context = self.engine.create_execution_context()
+        self.profiling_enabled = False
 
     def get_layer_info(self) -> str:
         """
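Tying the toggles together, a hedged usage sketch in which `module` and `x` are hypothetical stand-ins for an initialized `PythonTorchTensorRTModule` and a matching CUDA input tensor:

```python
# Hypothetical names: `module` is an initialized PythonTorchTensorRTModule,
# `x` a CUDA tensor matching the engine's expected input shape and dtype.
module.enable_profiling()   # attaches a trt.Profiler and sets profiling_enabled
out = module(x)             # forward pass now runs inside record_function scopes
module.disable_profiling()  # rebuilds the execution context and clears the flag
```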