 import torch
 from torch.nn import Module
 from torch_tensorrt._Device import Device
-from torch_tensorrt.dynamo.runtime.tools import _is_switch_required, _select_rt_device
+from torch_tensorrt.dynamo.runtime.tools import multi_gpu_device_check
 from torch_tensorrt.fx.utils import Frameworks, unified_dtype_converter

-import torch_tensorrt
-
 logger = logging.getLogger(__name__)


@@ -33,6 +31,10 @@ def __init__(
     ):
         super(PythonTorchTensorRTModule, self).__init__()
         self._register_state_dict_hook(PythonTorchTensorRTModule._on_state_dict)
+
+        # Run multi-gpu device check to validate engine instantiation
+        multi_gpu_device_check()
+
         self.engine = engine
         self.input_names = input_names if input_names is not None else []
         self.output_names = output_names if output_names is not None else []
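The implementation of multi_gpu_device_check is not part of this diff; it lives in torch_tensorrt.dynamo.runtime.tools. A minimal sketch of what such a check plausibly does, assuming it only inspects the visible CUDA devices and warns (the real helper may differ):

    import logging

    import torch

    logger = logging.getLogger(__name__)


    def multi_gpu_device_check() -> None:
        # Hypothetical sketch of the imported helper: warn when more than
        # one CUDA device is visible, since the engine is bound to whichever
        # device is active when it is built or deserialized.
        if torch.cuda.device_count() > 1:
            logger.warning(
                "Multiple GPUs detected; the TensorRT engine will target "
                f"cuda:{torch.cuda.current_device()}. Ensure the same device "
                "is active whenever the engine is used."
            )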
@@ -133,6 +135,9 @@ def _load_from_state_dict(
     ) -> None:
         engine_bytes = state_dict[prefix + "engine"]
 
+        # Run multi-gpu device check to validate engine instantiation
+        multi_gpu_device_check()
+
         logger = trt.Logger()
         runtime = trt.Runtime(logger)
         self.engine = runtime.deserialize_cuda_engine(engine_bytes)
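Because the engine is held as serialized bytes in the state dict, the module round-trips through the standard PyTorch save/load path, and the device check above re-runs on every reload. A usage sketch (the module variable and file name are hypothetical):

    import torch

    # trt_module is a compiled PythonTorchTensorRTModule (hypothetical name)
    torch.save(trt_module.state_dict(), "trt_module.pt")

    # Reloading invokes _load_from_state_dict, which re-runs the multi-GPU
    # check and deserializes the engine via trt.Runtime
    trt_module.load_state_dict(torch.load("trt_module.pt"))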
@@ -161,32 +166,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
         ) if self.profiling_enabled else nullcontext():
             self._check_initialized()
 
-            # If in safe mode, check at each iteration whether a switch is required
-            if torch_tensorrt._compile.SAFE_MODE:
-                curr_device_id = torch.cuda.current_device()
-                curr_device_properties = torch.cuda.get_device_properties(
-                    curr_device_id
-                )
-                logger.debug(f"Current Device: cuda:{curr_device_id}")
-
-                # If a switch is required, move all inputs to new device and set as active device
-                if _is_switch_required(
-                    curr_device_id,
-                    self.target_device_id,
-                    curr_device_properties,
-                    self.target_device_properties,
-                ):
-                    device_id, _ = _select_rt_device(
-                        curr_device_id,
-                        self.target_device_id,
-                        self.target_device_properties,
-                    )
-                    device = torch.device(device_id)
-                    torch.cuda.set_device(device_id)
-
-                    inputs = tuple([tensor.to(device) for tensor in inputs])
-                    logger.warning(f"Moved all input Tensors to cuda:{device_id}")
-
             with torch.autograd.profiler.record_function(
                 "PythonTorchTensorRTModule:ProcessInputs"
             ) if self.profiling_enabled else nullcontext():
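With this removal, forward no longer switches devices on the caller's behalf in safe mode; the caller is expected to run on the device the engine targets. A usage sketch (trt_module and the device index are hypothetical):

    import torch

    # Hypothetical: trt_module is a PythonTorchTensorRTModule built on cuda:0.
    # Since forward no longer re-selects a device, pin it explicitly and
    # place inputs there before calling the module.
    torch.cuda.set_device(0)
    x = torch.randn(1, 3, 224, 224, device="cuda:0")
    y = trt_module(x)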
@@ -202,24 +181,22 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, ...]:
                 )
 
                 for i, input_name in enumerate(self.input_names):
-                    # Check that the inputs are on cuda and have the correct data type if in safe mode
-                    if torch_tensorrt._compile.SAFE_MODE:
-                        if not contiguous_inputs[i].is_cuda:
-                            logger.warning(
-                                f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. "
-                                "This tensor is being moved by the runtime but for performance considerations, "
-                                "ensure your inputs are all on GPU and open an issue here "
-                                "(https://github.com/pytorch/TensorRT/issues) if this warning persists."
-                            )
-                            contiguous_inputs = (
-                                contiguous_inputs[:i]
-                                + [contiguous_inputs[i].cuda()]
-                                + contiguous_inputs[i + 1 :]
-                            )
-
-                        assert (
-                            contiguous_inputs[i].dtype == self.input_dtypes[i]
-                        ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
+                    if not contiguous_inputs[i].is_cuda:
+                        logger.warning(
+                            f"Detected input {input_name} of engine {self.engine.name} is not on a cuda device. "
+                            "This tensor is being moved by the runtime but for performance considerations, "
+                            "ensure your inputs are all on GPU and open an issue here "
+                            "(https://github.com/pytorch/TensorRT/issues) if this warning persists."
+                        )
+                        contiguous_inputs = (
+                            contiguous_inputs[:i]
+                            + [contiguous_inputs[i].cuda()]
+                            + contiguous_inputs[i + 1 :]
+                        )
+
+                    assert (
+                        contiguous_inputs[i].dtype == self.input_dtypes[i]
+                    ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
 
                     idx = self.input_binding_indices_in_order[i]
                     bindings[idx] = contiguous_inputs[i].data_ptr()
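These checks now run unconditionally rather than only in safe mode: CPU inputs are moved to the GPU with a warning, while a dtype mismatch fails the assert. A usage sketch (trt_module and the shapes are hypothetical):

    import torch

    # Hypothetical: trt_module was compiled for a single float32 input.
    x_cpu = torch.randn(1, 3, 224, 224)
    y = trt_module(x_cpu)  # accepted, but moved to GPU with a warning

    x_f64 = torch.randn(1, 3, 224, 224, device="cuda", dtype=torch.float64)
    # trt_module(x_f64) would fail: "Dtype mismatch for 0th input..."
    y = trt_module(x_f64.to(torch.float32))  # cast to the compiled dtype first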