
Commit 85107c3

Delete one copy

Parent: 1c00f0f

3 files changed: +18 −15 lines

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -702,7 +702,8 @@ def compile(
 
     # Move the weights in the state_dict to CPU
     if offload_module_to_cpu:
-        deallocate_module(exported_program.module(), delete_module=False)
+        deallocate_module(gm, delete_module=False)
+        # deallocate_module(exported_program.module(), delete_module=False)
         logger.info(
             "The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False"
         )
```
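The point of this change, and the likely reading of the commit title "Delete one copy": `ExportedProgram.module()` unlifts the exported graph into a fresh `GraphModule` on every call, so offloading that freshly built copy leaves the lowered `gm` (the module the compiler actually holds) untouched on the GPU. A minimal sketch of the behavior, using a made-up toy model rather than code from this repository:

```python
import torch
from torch.export import export


class Tiny(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear(x)


ep = export(Tiny(), (torch.randn(1, 4),))

# module() builds a new GraphModule each time it is called, so the two
# results below are distinct objects. Offloading one of them says nothing
# about the other, which is why the patch now passes the lowered `gm`
# to deallocate_module instead of ep.module().
m1 = ep.module()
m2 = ep.module()
assert m1 is not m2
```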

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 13 additions & 13 deletions

```diff
@@ -400,7 +400,7 @@ def _construct_trt_network_def(self) -> None:
     @staticmethod
     def find_weight(
         weight_name: str,
-        np_map: dict[str, Any],
+        weight_refit_map: dict[str, Any],
         state_dict: dict[str, Any],
         device: torch.device,
     ) -> str:
@@ -413,7 +413,7 @@ def find_weight(
             state_dict: state of the graph module
         """
         with unset_fake_temporarily():
-            network_weight = torch.from_numpy(np_map[weight_name]).to(device)
+            network_weight = weight_refit_map[weight_name].to(device)
             for sd_w_name, sd_weight in state_dict.items():
                 if TRTInterpreter.check_weight_equal(sd_weight, network_weight, device):
                     del state_dict[sd_w_name]
@@ -427,8 +427,8 @@ def check_weight_equal(
         device: torch.device,
     ) -> Any:
         with unset_fake_temporarily():
-            if not isinstance(network_weight, torch.Tensor):
-                network_weight = torch.from_numpy(network_weight).to(device)
+            if network_weight.device != device:
+                network_weight = network_weight.to(device)
             try:
                 return sd_weight.shape == network_weight.shape and torch.all(
                     torch.abs(sd_weight - network_weight) < 0.01
```
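These hunks all follow from one change: `ctx.weight_refit_map` now yields `torch.Tensor`s rather than numpy arrays, so the `torch.from_numpy` round-trips disappear and only a device check remains. A standalone sketch of the patched comparison; the function name and the small harness below are illustrative, not part of the source:

```python
import torch


def weights_match(sd_weight: torch.Tensor,
                  network_weight: torch.Tensor,
                  device: torch.device) -> bool:
    # The refit map already holds torch.Tensors, so no numpy conversion
    # is needed; just make sure both operands live on the same device.
    if network_weight.device != device:
        network_weight = network_weight.to(device)
    if sd_weight.shape != network_weight.shape:
        return False
    # Same elementwise tolerance as check_weight_equal: abs diff < 0.01.
    return bool(torch.all(torch.abs(sd_weight - network_weight) < 0.01))


w = torch.randn(8, 8)
assert weights_match(w, w.clone(), torch.device("cpu"))
```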
```diff
@@ -494,11 +494,10 @@ def _save_weight_mapping(self) -> None:
         _LOGGER.info("Building weight name mapping...")
         # Stage 1: Name mapping
         torch_device = to_torch_device(self.compilation_settings.device)
-        self.module.to(torch_device)
-        sd = self.module.state_dict()
+        sd = {k: v.to(torch_device) for k, v in self.module.state_dict().items()}
         weight_name_map: dict[str, Any] = {}
-        np_map = self.ctx.weight_refit_map
-        constant_mapping = {k: v for k, v in np_map.items() if v.size == 1}
+        weight_refit_map = self.ctx.weight_refit_map
+        constant_mapping = {k: v for k, v in weight_refit_map.items() if v.size == 1}
         net = self.ctx.net
         for i in range(net.num_layers):
             layer = net[i]
@@ -540,7 +539,7 @@ def _save_weight_mapping(self) -> None:
             else:
                 sd_weight_name = f"{sd_weight_name}.{torch_attr}"
 
-            if engine_weight_name in np_map:
+            if engine_weight_name in weight_refit_map:
                 weight_name_map[engine_weight_name] = sd_weight_name
 
         # Stage 2: Value mapping
@@ -549,10 +548,10 @@ def _save_weight_mapping(self) -> None:
                 # There is no direct connection in batch_norm layer. So skip it
                 pass
             elif sd_weight_name not in sd or not TRTInterpreter.check_weight_equal(
-                sd[sd_weight_name], np_map[engine_weight_name], torch_device
+                sd[sd_weight_name], weight_refit_map[engine_weight_name], torch_device
             ):
                 weight_name_map[engine_weight_name] = TRTInterpreter.find_weight(
-                    engine_weight_name, np_map, sd, torch_device
+                    engine_weight_name, weight_refit_map, sd, torch_device
                 )
                 if (
                     weight_name_map[engine_weight_name] != ""
@@ -563,12 +562,13 @@ def _save_weight_mapping(self) -> None:
 
             weight_name_map[engine_weight_name] = [
                 weight_name_map[engine_weight_name],
-                np_map[engine_weight_name].dtype,
+                weight_refit_map[engine_weight_name].dtype,
             ]
 
         weight_name_map["constant_mapping"] = constant_mapping
         self.weight_name_map = weight_name_map
-        del np_map, sd
+
+        del weight_refit_map, sd
         gc.collect()
         torch.cuda.empty_cache()
```

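Stage 1 also stops calling `self.module.to(torch_device)`, which would migrate the whole module in place and undo the CPU offload performed in `_compiler.py`; instead each weight is copied into a temporary device-resident dict that is explicitly dropped afterwards. A minimal sketch of that pattern under the assumption of a toy module; none of these names come from the patch:

```python
import gc

import torch


def snapshot_state_dict(module: torch.nn.Module,
                        device: torch.device) -> dict[str, torch.Tensor]:
    # Copy each weight to the target device without module.to(device):
    # the module itself stays wherever the caller put it (e.g. offloaded
    # to the CPU), and only this temporary dict holds device copies.
    return {k: v.to(device) for k, v in module.state_dict().items()}


model = torch.nn.Linear(4, 4)  # stays on the CPU throughout
sd = snapshot_state_dict(model, torch.device("cpu"))
assert model.weight.device.type == "cpu"

# Mirror the cleanup at the end of _save_weight_mapping: drop the
# copies, collect, and release any cached CUDA memory (a no-op here).
del sd
gc.collect()
torch.cuda.empty_cache()
```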
py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -37,7 +37,9 @@ def constant_fold(
     # For TRT INetwork construction the constants are moved to CPU in get_attr call.
     for node, constant in cf.node_replacements.items():
         replace_node_with_constant(
-            gm, node, torch.nn.Parameter(constant, requires_grad=False)
+            gm,
+            node,
+            torch.nn.Parameter(constant.cpu().contiguous(), requires_grad=False),
         )
 
     erased_params = []
```
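The added `.cpu().contiguous()` pins each folded constant to host memory as a dense copy before it is wrapped in a `Parameter`, instead of leaving the original (possibly non-contiguous, possibly CUDA-resident) tensor in the graph. A small illustration of what those two calls guarantee; the tensor here is made up for the example:

```python
import torch

# A non-contiguous view, e.g. what constant folding might produce
# from a transpose.
folded = torch.randn(4, 2).t()
assert not folded.is_contiguous()

# What the patched call stores: a contiguous, CPU-resident copy.
param = torch.nn.Parameter(folded.cpu().contiguous(), requires_grad=False)
assert param.is_contiguous() and param.device.type == "cpu"
```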
