64 | 64 | TEST_WITH_DEV_DBG_ASAN,
65 | 65 | TEST_WITH_ROCM,
66 | 66 | TestCase,
| 67 | + is_arch, |
| 68 | + NAVI_ARCH, |
67 | 69 | )
68 | 70 |
69 | 71 |
@@ -422,6 +424,104 @@ def test_nan_check(self):
422 | 424 | # reset env
423 | 425 | os.environ["TORCH_NCCL_NAN_CHECK"] = "0"
424 | 426 |
| 429 | + def _helper_test_extra_cuda_context_by_nvml(self): |
| 430 | + """ |
| 431 | + A helper for `test_extra_cuda_context`, if pynvml is available. |
| 432 | + pynvml provides Python bindings for NVIDIA's NVML library. |
| 433 | + Here we are interested in: nvmlDeviceGetComputeRunningProcesses. |
| 434 | + """ |
| 435 | + import pynvml |
| 436 | + |
| 437 | + pynvml.nvmlInit() |
| 438 | + |
| 439 | + device = torch.device("cuda:%d" % self.rank) |
| 440 | + x = torch.empty((1,), device=device) |
| 441 | + work = c10d.all_reduce(x, async_op=True) |
| 442 | + |
| 443 | + # Wait for non-0 ranks to garbage collect Work -- this is the latest |
| 444 | + # point where extra CUDA context can be created |
| 445 | + if self.rank == 0: |
| 446 | + time.sleep(5) |
| 447 | + del work |
| 448 | + handle = pynvml.nvmlDeviceGetHandleByIndex(self.rank) |
| 449 | + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) |
| 450 | + nprocs = len(processes) |
| 451 | + |
| 452 | + # A barrier for non-0 ranks |
| 453 | + c10d.all_reduce(x) |
| 454 | + torch.cuda.synchronize(device) |
| 455 | + c10d.destroy_process_group() |
| 456 | + self.assertLessEqual( |
| 457 | + nprocs, |
| 458 | + 1, |
| 459 | + f"Found {nprocs} processes creating contexts on {device}, expecting 1 at most", |
| 460 | + ) |
| 461 | + |
| 462 | + def _helper_test_extra_cuda_context_by_memory(self): |
| 463 | + """ |
| 464 | + A helper for `test_extra_cuda_context`, if pynvml is NOT available. |
| 465 | + If an extra context is created, it will show up in device 0's memory usage. |
| 466 | + """ |
| 467 | + device = torch.device("cuda:%d" % self.rank) |
| 468 | + x = torch.empty((1,), device=device) |
| 469 | + # Rank 0 takes a snapshot before collective -- this snapshot should have |
| 470 | + # included rank 0's own context. |
| 471 | + if self.rank == 0: |
| 472 | + # We need this extra sleep for NAVI_ARCH because rccl_init inside init_process_group |
| 473 | + # happens in a separate process and takes longer to finish on NAVI_ARCH. |
| 474 | + # Sleeping here ensures that the init has completed successfully and mem_get_info |
| 475 | + # can report stable numbers. |
| 476 | + if is_arch(NAVI_ARCH): |
| 477 | + time.sleep(5) |
| 478 | + free, total = torch.cuda.mem_get_info(device) |
| 479 | + used_before = float(total - free) |
| 480 | + |
| 481 | + work = c10d.all_reduce(x, async_op=True) |
| 482 | + |
| 483 | + # Wait for non-0 ranks to garbage collect Work -- this is the latest |
| 484 | + # point where extra CUDA context can be created |
| 485 | + if self.rank == 0: |
| 486 | + time.sleep(5) |
| 487 | + free, total = torch.cuda.mem_get_info(device) |
| 488 | + used_after = float(total - free) |
| 489 | + del work |
| 490 | + |
| 491 | + # A barrier for non-0 ranks |
| 492 | + c10d.all_reduce(x) |
| 493 | + torch.cuda.synchronize(device) |
| 494 | + c10d.destroy_process_group() |
| 495 | + if self.rank == 0: |
| 496 | + # If a non-0 rank creates a context on device 0, this assert would |
| 497 | + # fail because one context takes about 1 GB -- much more than the |
| 498 | + # tensor size created in this test. |
| 499 | + self.assertTrue( |
| 500 | + used_after < used_before * 1.5, |
| 501 | + f"{device} used {used_after} bytes after collective, " |
| 502 | + f"50% more than the status before ({used_before} bytes). " |
| 503 | + f"Extra CUDA context may have been created.", |
| 504 | + ) |
| 505 | + |
| 506 | + @requires_nccl() |
| 507 | + @skip_if_lt_x_gpu(2) |
| 508 | + def test_extra_cuda_context(self): |
| 509 | + # Check that non-0 ranks do not create an extra CUDA context on device 0 |
| 510 | + store = c10d.FileStore(self.file_name, self.world_size) |
| 511 | + device = torch.device("cuda:%d" % self.rank) |
| 512 | + c10d.init_process_group( |
| 513 | + backend="nccl", |
| 514 | + store=store, |
| 515 | + rank=self.rank, |
| 516 | + world_size=self.world_size, |
| 517 | + device_id=device, |
| 518 | + ) |
| 519 | + try: |
| 520 | + self._helper_test_extra_cuda_context_by_nvml() |
| 521 | + except ModuleNotFoundError: |
| 522 | + self._helper_test_extra_cuda_context_by_memory() |
| 523 | + |
425 | 525 | @requires_nccl()
426 | 526 | @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
427 | 527 | def test_destruct_before_terminate_pg(self):
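
For reference, here is a minimal standalone sketch of the NVML query that the new `_helper_test_extra_cuda_context_by_nvml` helper relies on. It assumes the `pynvml` package (e.g. from the `nvidia-ml-py` distribution) is installed, and uses device index 0 purely as an example:

```python
import pynvml

pynvml.nvmlInit()
try:
    # Query which processes currently hold a compute (CUDA) context on GPU 0.
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        # Each entry reports the owning PID and its GPU memory usage in bytes.
        print(f"pid={p.pid} usedGpuMemory={p.usedGpuMemory}")
    print(f"{len(procs)} process(es) hold a context on GPU 0")
finally:
    pynvml.nvmlShutdown()
```

In the test above, rank 0 expects this count to be at most 1 for its own device; a higher count would indicate that another rank created an extra context on it.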