Commit 6733ae6

akashveramd (AMD) authored and committed

Cherry-picked commit with merge conflict

1 parent 627200a · commit 6733ae6

3 files changed: +106 −5 lines changed


test/distributed/test_c10d_nccl.py

Lines changed: 100 additions & 0 deletions
@@ -64,6 +64,8 @@
     TEST_WITH_DEV_DBG_ASAN,
     TEST_WITH_ROCM,
     TestCase,
+    is_arch,
+    NAVI_ARCH,
 )


@@ -422,6 +424,104 @@ def test_nan_check(self):
         # reset env
         os.environ["TORCH_NCCL_NAN_CHECK"] = "0"

+<<<<<<< HEAD
+=======
+    def _helper_test_extra_cuda_context_by_nvml(self):
+        """
+        A helper for `test_extra_cuda_context`, if pynvml is available.
+        pynvml provides Python bindings for NVIDIA NVML functionality.
+        Here we are interested in: nvmlDeviceGetComputeRunningProcesses
+        """
+        import pynvml
+
+        pynvml.nvmlInit()
+
+        device = torch.device("cuda:%d" % self.rank)
+        x = torch.empty((1,), device=device)
+        work = c10d.all_reduce(x, async_op=True)
+
+        # Wait for non-0 ranks to garbage collect Work -- this is the latest
+        # point where an extra CUDA context can be created
+        if self.rank == 0:
+            time.sleep(5)
+        del work
+        handle = pynvml.nvmlDeviceGetHandleByIndex(self.rank)
+        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+        nprocs = len(processes)
+
+        # A barrier for non-0 ranks
+        c10d.all_reduce(x)
+        torch.cuda.synchronize(device)
+        c10d.destroy_process_group()
+        self.assertLessEqual(
+            nprocs,
+            1,
+            f"Found {nprocs} processes creating contexts on {device}, expecting 1 at most",
+        )
+
+    def _helper_test_extra_cuda_context_by_memory(self):
+        """
+        A helper for `test_extra_cuda_context`, if pynvml is NOT available.
+        If an extra context is created, it will manifest in device 0's memory usage.
+        """
+        device = torch.device("cuda:%d" % self.rank)
+        x = torch.empty((1,), device=device)
+        # Rank 0 takes a snapshot before the collective -- this snapshot should
+        # already include rank 0's own context.
+        if self.rank == 0:
+            # We need this extra sleep for NAVI_ARCH because rccl_init inside
+            # init_process_group happens in a separate process and takes longer
+            # to finish on NAVI_ARCH. Sleeping here ensures that the init has
+            # completed successfully and mem_get_info can return stable numbers.
+            if is_arch(NAVI_ARCH):
+                time.sleep(5)
+            free, total = torch.cuda.mem_get_info(device)
+            used_before = float(total - free)
+
+        work = c10d.all_reduce(x, async_op=True)
+
+        # Wait for non-0 ranks to garbage collect Work -- this is the latest
+        # point where an extra CUDA context can be created
+        if self.rank == 0:
+            time.sleep(5)
+            free, total = torch.cuda.mem_get_info(device)
+            used_after = float(total - free)
+        del work
+
+        # A barrier for non-0 ranks
+        c10d.all_reduce(x)
+        torch.cuda.synchronize(device)
+        c10d.destroy_process_group()
+        if self.rank == 0:
+            # If a non-0 rank creates a context on device 0, this assert would
+            # fail because one context takes about 1 GB -- much more than the
+            # tensor size created in this test.
+            self.assertTrue(
+                used_after < used_before * 1.5,
+                f"{device} used {used_after} bytes after collective, "
+                f"50% more than the status before ({used_before} bytes). "
+                f"Extra CUDA context may have been created.",
+            )
+
+    @requires_nccl()
+    @skip_if_lt_x_gpu(2)
+    def test_extra_cuda_context(self):
+        # Check if non-0 ranks would create an extra CUDA context on device 0
+        store = c10d.FileStore(self.file_name, self.world_size)
+        device = torch.device("cuda:%d" % self.rank)
+        c10d.init_process_group(
+            backend="nccl",
+            store=store,
+            rank=self.rank,
+            world_size=self.world_size,
+            device_id=device,
+        )
+        try:
+            self._helper_test_extra_cuda_context_by_nvml()
+        except ModuleNotFoundError:
+            self._helper_test_extra_cuda_context_by_memory()
+
+>>>>>>> 71a21d9cde ([rocm6.4_internal_testing][SWDEV-535305] Fixed `test_extra_cuda_context` in `test_c10d_nccl.py` and refactored is_navi3_arch function (#2341))
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
     def test_destruct_before_terminate_pg(self):
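
For reference, a minimal standalone sketch of the NVML query that the new NVML-based helper relies on, assuming the pynvml package is installed (pynvml is an optional dependency and not bundled with PyTorch):

import pynvml

pynvml.nvmlInit()
try:
    for idx in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
        # One entry per process holding a compute (CUDA) context on this
        # device; an unexpected extra entry on device 0 is exactly what the
        # new test_extra_cuda_context guards against.
        procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        print(f"device {idx}: {len(procs)} compute process(es)")
finally:
    pynvml.nvmlShutdown()

The memory-based fallback measures the same thing indirectly: torch.cuda.mem_get_info(device) returns (free, total) in bytes, and a jump of roughly 1 GB on device 0 indicates that another rank created a context there.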

test/inductor/test_decompose_mem_bound_mm.py

Lines changed: 4 additions & 3 deletions
@@ -10,7 +10,8 @@
 from torch.testing import FileCheck
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
-    is_navi3_arch,
+    NAVI3_ARCH,
+    is_arch,
     parametrize,
 )
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CUDA
@@ -47,8 +48,8 @@ def forward(self, input1, input2):

 # We have to increase tolerance for navi3 because all fp16, bf16
 # GEMM operations have an accuracy issue caused by a hardware limitation
-default_atol = 3e-3 if is_navi3_arch() else 1e-3
-default_rtol = 4e-3 if is_navi3_arch() else 1e-3
+default_atol = 3e-3 if is_arch(NAVI3_ARCH) else 1e-3
+default_rtol = 4e-3 if is_arch(NAVI3_ARCH) else 1e-3


 @requires_gpu

torch/testing/_internal/common_utils.py

Lines changed: 2 additions & 2 deletions
@@ -109,11 +109,11 @@
 NAVI3_ARCH = ("gfx1100", "gfx1101")
 NAVI4_ARCH = ("gfx1200", "gfx1201")

-def is_navi3_arch():
+def is_arch(arch_list):
     if torch.cuda.is_available():
         prop = torch.cuda.get_device_properties(0)
         gfx_arch = prop.gcnArchName.split(":")[0]
-        if gfx_arch in NAVI3_ARCH:
+        if gfx_arch in arch_list:
             return True
     return False

