Skip to content

Commit 8ec37d2

Browse files
authored
Fix async_with_pinned_mem doesn't set staging correctly (#1783)
#1287 refactored async_with_pinned_mem to use DCP's implementation, but that PR didn't set staging correctly. This PR fixes it. Fixes #1773
1 parent 177b050 commit 8ec37d2

File tree

2 files changed

+71
-1
lines changed

2 files changed

+71
-1
lines changed

tests/unit_tests/test_checkpoint.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,20 @@ def __init__(self):
7575
self.result = mock.Mock()
7676

7777

78+
class DummyAsyncResult:
    """Stand-in for the object ``dcp.async_save`` returns in pinned-memory mode.

    The real return value exposes two futures — one for the device-to-host
    staging copy and one for the background upload — so this mock provides
    both as attributes, each backed by a ``DummyFuture``.
    """

    def __init__(self):
        # Separate futures mirror the real API: staging completes first,
        # upload completes later. Both are dummies here.
        self.staging_completion = DummyFuture()
        self.upload_completion = DummyFuture()
84+
85+
7886
def fake_async_save(*args, **kwargs):
    """Fake replacement for ``dcp.async_save`` used via ``side_effect``.

    The pinned-memory code path is distinguished by the presence of the
    ``async_stager`` keyword argument; that path gets a ``DummyAsyncResult``
    (two futures), while the plain async path gets a single ``DummyFuture``.
    """
    pinned_mem_mode = "async_stager" in kwargs
    return DummyAsyncResult() if pinned_mem_mode else DummyFuture()
8092

8193

8294
class DummyJobConfig:
@@ -410,6 +422,62 @@ def test_last_save_model_only_and_initial_load_model_only(
410422
manager1.close()
411423
manager2.close()
412424

425+
@mock.patch("torch.distributed.get_rank", return_value=0)
@mock.patch("torch.cuda.Stream")
@mock.patch("torchtitan.components.checkpoint.DefaultStager")
@mock.patch("torchtitan.components.checkpoint.dist.new_group")
@mock.patch(
    "torchtitan.components.checkpoint.dcp.async_save", side_effect=fake_async_save
)
def test_async_save_with_pinned_mem_sets_staging_flag(
    self,
    mock_async_save,
    mock_new_group,
    mock_default_stager,
    mock_cuda_stream,
    mock_rank,
):
    """
    Test that AsyncMode.ASYNC_WITH_PINNED_MEM correctly sets staging flag.

    This test verifies the bug fix where self.staging was not being set to True
    when using ASYNC_WITH_PINNED_MEM mode, which caused maybe_wait_for_staging()
    to not wait properly for staging completion.
    """
    # Build a job config whose checkpoint section requests pinned-memory async.
    cfg = DummyJobConfig(job=self.job_config.job)
    ckpt_cfg = cfg.checkpoint
    ckpt_cfg.async_mode = "async_with_pinned_mem"

    mgr = CheckpointManager(
        dataloader=self.data_loader,
        model_parts=self.model_parts,
        optimizers=self.optimizers,
        lr_schedulers=self.lr_schedulers,
        states=self.states,
        checkpoint_config=ckpt_cfg,
        sd_adapter=None,
        base_folder=self.job_config.job.dump_folder,
        ft_manager=self.ft_manager,
    )

    # Before any save, no staging is in flight.
    self.assertFalse(mgr.staging)

    # A save in pinned-memory mode must flip the staging flag on...
    mgr.save(curr_step=1, last_step=False)
    self.assertTrue(mgr.staging)

    # ...and record the future that staging completion is tracked by.
    self.assertIsNotNone(mgr.staging_future)

    # Waiting on staging should consume the future and reset the flag.
    mgr.maybe_wait_for_staging()
    self.assertFalse(mgr.staging)

    mgr.close()
480+
413481
@mock.patch("torchtitan.components.checkpoint.dist.new_group")
414482
@mock.patch(
415483
"torchtitan.components.checkpoint.dcp.async_save", side_effect=fake_async_save

torchtitan/components/checkpoint.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ def save(self, curr_step: int, last_step: bool = False) -> None:
494494
)
495495
self.save_future = result.upload_completion
496496
self.staging_future = result.staging_completion
497+
self.staging = True
497498
elif self.async_mode == AsyncMode.ASYNC:
498499
GarbageCollection.collect("GC collection invoked by checkpointer.")
499500
self.save_future = self.dcp_save(
@@ -615,6 +616,7 @@ def maybe_wait_for_staging(self) -> None:
615616
"""
616617
if self.enable_staging and self.staging:
617618
self.staging_future.result()
619+
self.staging = False
618620

619621
def _find_load_step(self, folder: str = "") -> int:
620622
"""Find the step to load the checkpoint for.

0 commit comments

Comments
 (0)