From 47e5da2b54d7199bc00510e60bb292ea7926dc2f Mon Sep 17 00:00:00 2001
From: Naren Dasan <naren@narendasan.com>
Date: Tue, 29 Jul 2025 22:56:34 +0000
Subject: [PATCH 1/2] Allowing the allocation mode to be set at build time,
 some fixes for the mode switching

---
 core/runtime/TRTEngine.cpp                    | 21 ++++---
 core/runtime/TRTEngine.h                      |  6 +-
 core/runtime/register_jit_hooks.cpp           | 18 +-----
 examples/dynamo/dynamic_memory_allocation.py  | 16 +++--
 py/torch_tensorrt/dynamo/_compiler.py         |  6 ++
 py/torch_tensorrt/dynamo/_defaults.py         |  1 +
 py/torch_tensorrt/dynamo/_settings.py         |  4 ++
 .../dynamo/runtime/_ResourceAllocator.py      | 12 ++--
 .../dynamo/runtime/_TorchTensorRTModule.py    | 11 ++--
 py/torch_tensorrt/dynamo/runtime/__init__.py  |  2 +-
 uv.lock                                       | 58 +++++++++----------
 11 files changed, 83 insertions(+), 72 deletions(-)
diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
index aabb40c6dd..253738b434 100644
--- a/core/runtime/TRTEngine.cpp
+++ b/core/runtime/TRTEngine.cpp
@@ -62,7 +62,7 @@ TRTEngine::TRTEngine(
     bool hardware_compatible,
     bool requires_output_allocator,
     const std::string& serialized_metadata,
-    const ResourceAllocationStrategy& resource_allocation_strategy)
+    const ResourceAllocationStrategy resource_allocation_strategy)
     : TRTEngine(
           "deserialized_trt",
           serialized_engine,
@@ -86,7 +86,7 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
           static_cast<bool>(std::stoi(serialized_info[HW_COMPATIBLE_IDX])),
           static_cast<bool>(std::stoi(serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX])),
           serialized_info[SERIALIZED_METADATA_IDX],
-          resource_allocation_strategy_from_string(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX])) {}
+          (static_cast<bool>(std::stoi(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX])) ? ResourceAllocationStrategy::kDynamic : ResourceAllocationStrategy::kStatic)) {}
 
 TRTEngine::TRTEngine(
     const std::string& mod_name,
@@ -98,7 +98,7 @@ TRTEngine::TRTEngine(
     bool hardware_compatible,
     bool requires_output_allocator,
     const std::string& serialized_metadata,
-    const ResourceAllocationStrategy& resource_allocation_strategy) {
+    const ResourceAllocationStrategy resource_allocation_strategy) {
   TORCHTRT_CHECK(
       is_supported_on_current_platform(target_platform),
       "This engine was not built to run on this platform (built for: " << target_platform << ", current platform: "
@@ -128,9 +128,11 @@ TRTEngine::TRTEngine(
     cuda_engine->setWeightStreamingBudgetV2(budget_bytes);
   }
 
+  this->resource_allocation_strategy = resource_allocation_strategy;
+  LOG_DEBUG("Resource allocation strategy: " << (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static"));
   if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) {
     this->exec_ctx =
-        make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE));
+        make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
   } else {
     this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
   }
@@ -402,6 +404,7 @@ std::string TRTEngine::to_str() const {
   ss << "  Device: " << device_info << std::endl;
   ss << "  Hardware Compatibility: " << (hardware_compatible ? "Enabled" : "Disabled") << std::endl;
   ss << "  Target Platform: " << target_platform << std::endl;
+  ss << "  Resource Allocation Strategy: " << (resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "Dynamic" : "Static") << std::endl;
   // clang-format on
   return ss.str();
 }
@@ -469,8 +472,7 @@ std::vector<std::string> TRTEngine::serialize() {
   serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = this->requires_output_allocator ? "1" : "0";
   serialized_info[SERIALIZED_METADATA_IDX] = this->serialized_metadata;
   serialized_info[TARGET_PLATFORM_IDX] = this->target_platform.serialize();
-  serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX] =
-      resource_allocation_strategy_to_string(this->resource_allocation_strategy);
+  serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX] = this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic ? "1" : "0";
 
   return serialized_info;
 }
@@ -483,11 +485,12 @@ void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationSt
   if (new_strategy != this->resource_allocation_strategy) {
     this->resource_allocation_strategy = new_strategy;
     if (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
-      std::cout << "Setting resource allocation strategy to dynamic" << std::endl;
-      this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
+      LOG_DEBUG("Setting resource allocation strategy to dynamic");
+      this->exec_ctx = make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
     } else {
+      LOG_DEBUG("Setting resource allocation strategy to static");
       this->exec_ctx = make_trt(
-          cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE));
+          cuda_engine->createExecutionContext());
     }
   }
 }
diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
index 9c77ab325a..2ed07f0bcc 100644
--- a/core/runtime/TRTEngine.h
+++ b/core/runtime/TRTEngine.h
@@ -100,7 +100,7 @@ class DynamicOutputAllocator : public nvinfer1::IOutputAllocator {
 
 struct TRTEngine : torch::CustomClassHolder {
   // Resource Allocation Strategy
-  enum ResourceAllocationStrategy { kStatic, kDynamic };
+  typedef enum { kStatic = 0, kDynamic } ResourceAllocationStrategy;
   // Each engine needs it's own runtime object
   std::shared_ptr<nvinfer1::IRuntime> rt;
   std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
@@ -132,7 +132,7 @@ struct TRTEngine : torch::CustomClassHolder {
       bool hardware_compatible = false,
       bool requires_output_allocator = false,
       const std::string& serialized_metadata = "",
-      const TRTEngine::ResourceAllocationStrategy& resource_allocation_strategy =
+      const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
           TRTEngine::ResourceAllocationStrategy::kStatic);
 
   TRTEngine(std::vector<std::string> serialized_info);
@@ -147,7 +147,7 @@ struct TRTEngine : torch::CustomClassHolder {
       bool hardware_compatible = false,
       bool requires_output_allocator = false,
       const std::string& serialized_metadata = "",
-      const TRTEngine::ResourceAllocationStrategy& resource_allocation_strategy =
+      const TRTEngine::ResourceAllocationStrategy resource_allocation_strategy =
           TRTEngine::ResourceAllocationStrategy::kStatic);
 
   TRTEngine& operator=(const TRTEngine& other);
diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp
index 99633a4e47..6d15bd8ae8 100644
--- a/core/runtime/register_jit_hooks.cpp
+++ b/core/runtime/register_jit_hooks.cpp
@@ -22,21 +22,6 @@ std::string serialize_bindings(const std::vector<std::string>& bindings) {
   return serialized_binding_info;
 }
 
-std::string resource_allocation_strategy_to_string(TRTEngine::ResourceAllocationStrategy strategy) {
-  if (strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
-    return std::string("kDynamic");
-  } else {
-    return std::string("kStatic");
-  }
-}
-
-TRTEngine::ResourceAllocationStrategy resource_allocation_strategy_from_string(const std::string& str) {
-  if (str == "kDynamic")
-    return TRTEngine::ResourceAllocationStrategy::kDynamic;
-  else
-    return TRTEngine::ResourceAllocationStrategy::kStatic;
-}
-
 static const std::string sym_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; //=
 std::string base64_encode(const std::string& in) {
   std::string out;
@@ -106,7 +91,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("infer_outputs", &TRTEngine::infer_outputs)
         .def("reset_captured_graph", &TRTEngine::reset_captured_graph)
         .def(
-            "_use_dynamically_allocated_resources",
+            "use_dynamically_allocated_resources",
             [](const c10::intrusive_ptr<TRTEngine>& self, bool dynamic) -> void {
               self->set_resource_allocation_strategy(
                   dynamic ? TRTEngine::ResourceAllocationStrategy::kDynamic
@@ -124,6 +109,7 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
             [](const c10::intrusive_ptr<TRTEngine>& self) -> std::vector<std::string> { return self->serialize(); },
             [](std::vector<std::string> serialized_info) -> c10::intrusive_ptr<TRTEngine> {
               serialized_info[ENGINE_IDX] = base64_decode(serialized_info[ENGINE_IDX]);
+              LOG_DEBUG("Deserialized resource allocation strategy: " << (static_cast<bool>(std::stoi(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX])) ? "Dynamic" : "Static"));
               TRTEngine::verify_serialization_fmt(serialized_info);
               return c10::make_intrusive<TRTEngine>(serialized_info);
             });
diff --git a/examples/dynamo/dynamic_memory_allocation.py b/examples/dynamo/dynamic_memory_allocation.py
index be7bc7e1bd..d609a83045 100644
--- a/examples/dynamo/dynamic_memory_allocation.py
+++ b/examples/dynamo/dynamic_memory_allocation.py
@@ -3,7 +3,8 @@
 import torch
 import torch_tensorrt as torch_trt
 import torchvision.models as models
-from diffusers import DiffusionPipeline
+import time
+import gc
 
 np.random.seed(5)
 torch.manual_seed(5)
@@ -14,6 +15,9 @@
     "use_python_runtime": False,
     "enabled_precisions": {torch.float32},
     "immutable_weights": False,
+    "lazy_engine_init": True,
+    "dynamically_allocate_resources": True
+
 }
 
 model = models.resnet152(pretrained=True).eval().to("cuda")
@@ -21,16 +25,18 @@
 print((torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3)
 compiled_module(*inputs)
 
-breakpoint()
-with torch_trt.dynamo.runtime.ResourceAllocatorContext(compiled_module):
+time.sleep(30)
+with torch_trt.dynamo.runtime.ResourceAllocationStrategy(compiled_module, dynamically_allocate_resources=False):
     print(
         "Memory used (GB):",
         (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3,
     )
-    breakpoint()
     compiled_module(*inputs)
+    gc.collect()
+    torch.cuda.empty_cache()
+    time.sleep(30)
     print(
         "Memory used (GB):",
         (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3,
     )
-    breakpoint()
+    compiled_module(*inputs)
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 74cab980c4..cebbb88273 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -103,6 +103,7 @@ def cross_compile_for_windows(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -177,6 +178,7 @@ def cross_compile_for_windows(
         enable_weight_streaming (bool): Enable weight streaming.
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
+        dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -340,6 +342,7 @@ def cross_compile_for_windows(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
+        "dynamically_allocate_resources": dynamically_allocate_resources,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -440,6 +443,7 @@ def compile(
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
+    dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -517,6 +521,7 @@ def compile(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
+        dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -690,6 +695,7 @@ def compile(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
+        "dynamically_allocate_resources": dynamically_allocate_resources,
     }
 
     settings = CompilationSettings(**compilation_options)
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
index de970ecd81..b58d0a528b 100644
--- a/py/torch_tensorrt/dynamo/_defaults.py
+++ b/py/torch_tensorrt/dynamo/_defaults.py
@@ -57,6 +57,7 @@
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
 OFFLOAD_MODULE_TO_CPU = False
+DYNAMICALLY_ALLOCATE_RESOURCES = False
 
 if platform.system() == "Linux":
     import pwd
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
index d8f6809eae..5b09c5750c 100644
--- a/py/torch_tensorrt/dynamo/_settings.py
+++ b/py/torch_tensorrt/dynamo/_settings.py
@@ -11,6 +11,7 @@
     DLA_GLOBAL_DRAM_SIZE,
     DLA_LOCAL_DRAM_SIZE,
     DLA_SRAM_SIZE,
+    DYNAMICALLY_ALLOCATE_RESOURCES,
     DRYRUN,
     ENABLE_CROSS_COMPILE_FOR_WINDOWS,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
@@ -97,6 +98,8 @@ class CompilationSettings:
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        offload_to_cpu (bool): Offload the model to CPU to reduce memory footprint during compilation
+        dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines
     """
 
     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -140,6 +143,7 @@ class CompilationSettings:
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
     offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
+    dynamically_allocate_resources: bool = DYNAMICALLY_ALLOCATE_RESOURCES
 
     def __getstate__(self) -> dict[str, Any]:
         from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
diff --git a/py/torch_tensorrt/dynamo/runtime/_ResourceAllocator.py b/py/torch_tensorrt/dynamo/runtime/_ResourceAllocator.py
index 5c72d4e180..f843cedcec 100644
--- a/py/torch_tensorrt/dynamo/runtime/_ResourceAllocator.py
+++ b/py/torch_tensorrt/dynamo/runtime/_ResourceAllocator.py
@@ -3,9 +3,9 @@
 import torch
 
 
-class ResourceAllocatorContext(torch.nn.Module):  # type: ignore[misc]
+class ResourceAllocationStrategy(torch.nn.Module):  # type: ignore[misc]
     """
-    ResourceAllocatorContext is a context manager module that temporarily enables dynamic resource allocation
+    ResourceAllocationStrategy is a context manager module that temporarily enables dynamic resource allocation
     for all TRT submodules of the given compiled_module. When entering the context,
     it sets these submodules to use dynamically allocated resources. Upon exiting, it restores them to their
     original (static) resource allocation mode.
@@ -14,17 +14,19 @@ class ResourceAllocatorContext(torch.nn.Module):  # type: ignore[misc]
     def __init__(
         self,
         compiled_module: torch.nn.Module,
+        dynamically_allocate_resources: bool = True
     ) -> None:
-        super(ResourceAllocatorContext, self).__init__()
+        super(ResourceAllocationStrategy, self).__init__()
         self.compiled_module = compiled_module
+        self.dynamically_allocate_resources = dynamically_allocate_resources
 
     def __enter__(self) -> None:
         print("Entering resource allocator context")
         for name, submodule in self.compiled_module.named_modules():
             if "_run_on_acc" in name:
-                submodule.use_dynamically_allocated_resources(dynamic=True)
+                submodule.use_dynamically_allocated_resources(dynamically_allocate_resources=self.dynamically_allocate_resources)
 
     def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None:
         for name, submodule in self.compiled_module.named_modules():
             if "_run_on_acc" in name:
-                submodule.use_dynamically_allocated_resources(dynamic=False)
+                submodule.use_dynamically_allocated_resources(dynamically_allocate_resources=self.dynamically_allocate_resources)
diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
index c5929c16a7..9c279396d7 100644
--- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
+++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py
@@ -142,7 +142,7 @@ def __init__(
         self.serialized_engine = serialized_engine
         self.engine = None
         self.requires_output_allocator = requires_output_allocator
-        self.resource_allocation_strategy = 0  # Default to static allocation TODO: Make this configurable with the context manager
+        self.dynamically_allocate_resources = settings.dynamically_allocate_resources
 
         if (
             serialized_engine
@@ -188,9 +188,11 @@ def _pack_engine_info(self) -> List[str | bytes]:
         engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = str(
             int(self.requires_output_allocator)
         )
+        print(f"PROVIDED RESOURCE ALLOCATION STRATEGY: {self.dynamically_allocate_resources}")
         engine_info[RESOURCE_ALLOCATION_STRATEGY_IDX] = str(
-            int(self.resource_allocation_strategy)
+            int(self.dynamically_allocate_resources)
         )
+        print(engine_info[RESOURCE_ALLOCATION_STRATEGY_IDX])
 
         return engine_info
 
@@ -219,8 +221,9 @@ def set_device_memory_budget(self, budget_bytes: int) -> int:
     def _reset_captured_graph(self) -> None:
         self.engine.reset_captured_graph()
 
-    def use_dynamically_allocated_resources(self, dynamic: bool = False) -> None:
-        self.engine._use_dynamically_allocated_resources(dynamic)
+    def use_dynamically_allocated_resources(self, dynamically_allocate_resources: bool = False) -> None:
+        self.dynamically_allocate_resources = dynamically_allocate_resources
+        self.engine.use_dynamically_allocated_resources(self.dynamically_allocate_resources)
 
     def setup_engine(self) -> None:
         """
diff --git a/py/torch_tensorrt/dynamo/runtime/__init__.py b/py/torch_tensorrt/dynamo/runtime/__init__.py
index 19843a0a54..0eb66b24b0 100644
--- a/py/torch_tensorrt/dynamo/runtime/__init__.py
+++ b/py/torch_tensorrt/dynamo/runtime/__init__.py
@@ -3,7 +3,7 @@
     PythonTorchTensorRTModule,
 )
 from torch_tensorrt.dynamo.runtime._ResourceAllocator import (  # noqa: F401
-    ResourceAllocatorContext,
+    ResourceAllocationStrategy,
 )
 from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import (  # noqa: F401
     TorchTensorRTModule,
diff --git a/uv.lock b/uv.lock
index 18b5f3d7ed..79621781be 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2526,7 +2526,7 @@ sdist = { url = "https://pypi.nvidia.com/tensorrt/tensorrt-10.3.0.tar.gz", hash
 
 [[package]]
 name = "tensorrt"
-version = "10.11.0.33"
+version = "10.12.0.36"
 source = { registry = "https://pypi.nvidia.com/" }
 resolution-markers = [
     "python_full_version >= '3.12' and platform_machine != 'aarch64' and 'tegra' not in platform_release and sys_platform == 'linux'",
@@ -2551,9 +2551,9 @@ resolution-markers = [
     "python_full_version < '3.10' and platform_machine != 'aarch64' and 'tegra' in platform_release and sys_platform == 'windows'",
 ]
 dependencies = [
-    { name = "tensorrt-cu12", version = "10.11.0.33", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
+    { name = "tensorrt-cu12", version = "10.12.0.36", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
 ]
-sdist = { url = "https://pypi.nvidia.com/tensorrt/tensorrt-10.11.0.33.tar.gz", hash = "sha256:a3d6048f86e11ea5202d473646194d3be866c0c8d578ac0b7eeb91d923f65d0b" }
+sdist = { url = "https://pypi.nvidia.com/tensorrt/tensorrt-10.12.0.36.tar.gz", hash = "sha256:b246a830c26713e097b73151917e101cfb81aa0e7274c3c3b4c1f9f8b886be2e" }
 
 [[package]]
 name = "tensorrt-cu12"
@@ -2573,7 +2573,7 @@ sdist = { url = "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.3.0.tar.
 
 [[package]]
 name = "tensorrt-cu12"
-version = "10.11.0.33"
+version = "10.12.0.36"
 source = { registry = "https://pypi.nvidia.com/" }
 resolution-markers = [
     "python_full_version >= '3.12' and platform_machine != 'aarch64' and 'tegra' not in platform_release and sys_platform == 'linux'",
@@ -2598,10 +2598,10 @@ resolution-markers = [
     "python_full_version < '3.10' and platform_machine != 'aarch64' and 'tegra' in platform_release and sys_platform == 'windows'",
 ]
 dependencies = [
-    { name = "tensorrt-cu12-bindings", version = "10.11.0.33", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
-    { name = "tensorrt-cu12-libs", version = "10.11.0.33", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
+    { name = "tensorrt-cu12-bindings", version = "10.12.0.36", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
+    { name = "tensorrt-cu12-libs", version = "10.12.0.36", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
 ]
-sdist = { url = "https://pypi.nvidia.com/tensorrt-cu12/tensorrt_cu12-10.11.0.33.tar.gz", hash = "sha256:7e29c8b16771c025320035ba9609c2a074767d9a8c05696a30c9d5c0fdfb37df" }
+sdist = { url = "https://pypi.nvidia.com/tensorrt-cu12/tensorrt_cu12-10.12.0.36.tar.gz", hash = "sha256:aedeee0195c042592ac6b0536b19bc8cdbb1a548f35e09d24fbe78e1c76217c5" }
 
 [[package]]
 name = "tensorrt-cu12-bindings"
@@ -2620,7 +2620,7 @@ resolution-markers = [
 
 [[package]]
 name = "tensorrt-cu12-bindings"
-version = "10.11.0.33"
+version = "10.12.0.36"
 source = { registry = "https://pypi.nvidia.com/" }
 resolution-markers = [
     "python_full_version >= '3.12' and platform_machine != 'aarch64' and 'tegra' not in platform_release and sys_platform == 'linux'",
@@ -2645,16 +2645,16 @@ resolution-markers = [
     "python_full_version < '3.10' and platform_machine != 'aarch64' and 'tegra' in platform_release and sys_platform == 'windows'",
 ]
 wheels = [
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp310-none-manylinux_2_28_x86_64.whl", hash = "sha256:a2d27745575be5d7f06caa9565230025b8e41a8915ee6a5dc735d41c3faf206d" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp310-none-manylinux_2_31_aarch64.whl", hash = "sha256:546c7ee976366dc9cb76ffefbde555dec4feddcfb508b4c99ee626447b8c72de" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp311-none-manylinux_2_28_x86_64.whl", hash = "sha256:e7b7a5b80174f8c4ddd8a63bc9fa97cad3320409eafad79428bc2b1e15884068" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp311-none-manylinux_2_31_aarch64.whl", hash = "sha256:492e3e91d7c1083bff1f7c15fdd8f5fb09a782dcfa6d1d0f8d9034b2e3b38cad" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp312-none-manylinux_2_28_x86_64.whl", hash = "sha256:a8f374f6d752ce4b0d4a8303d29c3ba9904eb29da0dc95b4db6b75c501997e4a" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp312-none-manylinux_2_31_aarch64.whl", hash = "sha256:6a3b768cea69b153ed0c2eb50130d150406d5c1498fdb0bf6c8a1be160137a6a" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp313-none-manylinux_2_28_x86_64.whl", hash = "sha256:1ceda290d1ed79b6107b0eb29eeb178f569d007c1506b72caae8248975d57662" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp313-none-manylinux_2_31_aarch64.whl", hash = "sha256:3c27e0d6e36a3b1f06e1dc8b735e34f04f5b8aac3e7d9b21762b8264496e825f" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp39-none-manylinux_2_28_x86_64.whl", hash = "sha256:9a801886f389b75f92e69fc6be40308392ec7746dbf4de4a2b76585d591960f0" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.11.0.33-cp39-none-manylinux_2_31_aarch64.whl", hash = "sha256:42e9b3cc2e3c6bcc0785c9c96b4dd25cd7043ff95e4fd09c8d35331f63ce9634" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp310-none-manylinux_2_28_x86_64.whl", hash = "sha256:7ecdb6fc2555caed7d4fbbd8158ed7ced64e230c125484f62a5369c40dcc70e5" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp310-none-manylinux_2_31_aarch64.whl", hash = "sha256:d8548ab5976ca5c91279c68ee77f4c892e03460709cfa3fbd2a22aa8123cb731" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp311-none-manylinux_2_28_x86_64.whl", hash = "sha256:58cf45605bb330e86f8ad49bc8997ed68cfdf5b09da229534fb7f84aa3fe5bf4" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp311-none-manylinux_2_31_aarch64.whl", hash = "sha256:ae0866a89caaeada1c16776de85413a523f78f53b1fd83f1b903c39eed264d82" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp312-none-manylinux_2_28_x86_64.whl", hash = "sha256:fb3a2ce96c7472a46bbee2030ce6a54fd6a32deda401c1c67d9de057550e0171" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp312-none-manylinux_2_31_aarch64.whl", hash = "sha256:f5128b8b2a379e65c09745ba97df58abf3a418cbfd6508d37f76121d9bdd3bc8" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp313-none-manylinux_2_28_x86_64.whl", hash = "sha256:0eb8d3e41279b1d0d329b85372d5d720c8d2ff1228f6273142d717b44d75935b" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp313-none-manylinux_2_31_aarch64.whl", hash = "sha256:a850992cad842340e6fed41fe74f529064064ff61881d50ef5a2be1816526f9b" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp39-none-manylinux_2_28_x86_64.whl", hash = "sha256:986cb86202ef9541279b59d4e254743aff43bae1def87d14dd06e02369107c8b" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.12.0.36-cp39-none-manylinux_2_31_aarch64.whl", hash = "sha256:c5b86638ae5e3a2101755d469ac2ce831d4bdece1d20fa2bd546c05c554b5952" },
 ]
 
 [[package]]
@@ -2677,7 +2677,7 @@ dependencies = [
 
 [[package]]
 name = "tensorrt-cu12-libs"
-version = "10.11.0.33"
+version = "10.12.0.36"
 source = { registry = "https://pypi.nvidia.com/" }
 resolution-markers = [
     "python_full_version >= '3.12' and platform_machine != 'aarch64' and 'tegra' not in platform_release and sys_platform == 'linux'",
@@ -2706,8 +2706,8 @@ dependencies = [
     { name = "nvidia-cuda-runtime-cu12", version = "12.9.79", source = { registry = "https://download.pytorch.org/whl/nightly/cu129" }, marker = "platform_machine != 'aarch64' and sys_platform == 'linux'" },
 ]
 wheels = [
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.11.0.33-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:81ace8d3284fdbef0804c444a4d7555343ee079370e79c93cb328c7d9b08f968" },
-    { url = "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.11.0.33-py2.py3-none-manylinux_2_31_aarch64.whl", hash = "sha256:b6846dbc32d717a5031d9757f16293dd9e25de8a1c4aae8c00701d52351ef173" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.12.0.36-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3910039e1d49de0edfdc8bf273e40ad4b85a9d57c7c383fe0e22f75417df9610" },
+    { url = "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.12.0.36-py2.py3-none-manylinux_2_31_aarch64.whl", hash = "sha256:1c117effa7318b65508457e9a11e67941859c8e5c346b59fd0090f66be28f2f4" },
 ]
 
 [[package]]
@@ -2886,13 +2886,13 @@ dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and 'tegra' not in platform_release and sys_platform == 'linux') or (python_full_version >= '3.10' and 'tegra' not in platform_release and sys_platform == 'windows')" },
     { name = "packaging", marker = "sys_platform == 'linux' or sys_platform == 'windows'" },
     { name = "tensorrt", version = "10.3.0", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'linux') or (platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'windows')" },
-    { name = "tensorrt", version = "10.11.0.33", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
+    { name = "tensorrt", version = "10.12.0.36", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
     { name = "tensorrt-cu12", version = "10.3.0", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'linux') or (platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'windows')" },
-    { name = "tensorrt-cu12", version = "10.11.0.33", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
+    { name = "tensorrt-cu12", version = "10.12.0.36", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
     { name = "tensorrt-cu12-bindings", version = "10.3.0", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'linux') or (platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'windows')" },
-    { name = "tensorrt-cu12-bindings", version = "10.11.0.33", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
+    { name = "tensorrt-cu12-bindings", version = "10.12.0.36", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
     { name = "tensorrt-cu12-libs", version = "10.3.0", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'linux') or (platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'windows')" },
-    { name = "tensorrt-cu12-libs", version = "10.11.0.33", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
+    { name = "tensorrt-cu12-libs", version = "10.12.0.36", source = { registry = "https://pypi.nvidia.com/" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
     { name = "torch", version = "2.7.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'linux') or (platform_machine == 'aarch64' and 'tegra' in platform_release and sys_platform == 'windows')" },
     { name = "torch", version = "2.9.0.dev20250701+cu129", source = { registry = "https://download.pytorch.org/whl/nightly/cu129" }, marker = "(platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'windows') or ('tegra' not in platform_release and sys_platform == 'linux') or ('tegra' not in platform_release and sys_platform == 'windows')" },
     { name = "typing-extensions", marker = "sys_platform == 'linux' or sys_platform == 'windows'" },
@@ -2940,13 +2940,13 @@ requires-dist = [
     { name = "numpy", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release", specifier = "<2.0.0" },
     { name = "nvidia-modelopt", extras = ["all"], marker = "extra == 'quantization'", specifier = ">=0.27.1" },
     { name = "packaging", specifier = ">=23" },
-    { name = "tensorrt", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=10.11.0,<10.12.0" },
+    { name = "tensorrt", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=10.12.0,<10.13.0" },
     { name = "tensorrt", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release", specifier = ">=10.3.0,<10.4.0" },
-    { name = "tensorrt-cu12", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=10.11.0,<10.12.0" },
+    { name = "tensorrt-cu12", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=10.12.0,<10.13.0" },
     { name = "tensorrt-cu12", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release", specifier = ">=10.3.0,<10.4.0" },
-    { name = "tensorrt-cu12-bindings", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=10.11.0,<10.12.0" },
+    { name = "tensorrt-cu12-bindings", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=10.12.0,<10.13.0" },
     { name = "tensorrt-cu12-bindings", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release", specifier = ">=10.3.0,<10.4.0" },
-    { name = "tensorrt-cu12-libs", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=10.11.0,<10.12.0" },
+    { name = "tensorrt-cu12-libs", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=10.12.0,<10.13.0" },
     { name = "tensorrt-cu12-libs", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release", specifier = ">=10.3.0,<10.4.0" },
     { name = "torch", marker = "platform_machine != 'aarch64' or 'tegra' not in platform_release", specifier = ">=2.9.0.dev0,<2.10.0", index = "https://download.pytorch.org/whl/nightly/cu129" },
     { name = "torch", marker = "platform_machine == 'aarch64' and 'tegra' in platform_release", specifier = ">=2.7.0,<2.8.0" },

From 83dbf3faac7ef22a845548b618c1ed68d6abd62b Mon Sep 17 00:00:00 2001
From: Adrian Wang <123616592+cehongwang@users.noreply.github.com>
Date: Tue, 29 Jul 2025 16:09:33 -0700
Subject: [PATCH 2/2] Update _settings.py

---
 py/torch_tensorrt/dynamo/_settings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
index 5b09c5750c..e9f5174e2c 100644
--- a/py/torch_tensorrt/dynamo/_settings.py
+++ b/py/torch_tensorrt/dynamo/_settings.py
@@ -98,7 +98,7 @@ class CompilationSettings:
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool):  Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
-        offload_to_cpu (bool): Offload the model to CPU to reduce memory footprint during compilation
+        offload_module_to_cpu (bool): Offload the model to CPU to reduce memory footprint during compilation
         dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines
     """