64 | 64 | TEST_WITH_DEV_DBG_ASAN,
65 | 65 | TEST_WITH_ROCM,
66 | 66 | TestCase,
| 67 | + is_arch, |
| 68 | + NAVI_ARCH, |
67 | 69 | )
68 | 70 |
69 | 71 |
@@ -422,6 +424,104 @@ def test_nan_check(self):
422 | 424 | # reset env
423 | 425 | os.environ["TORCH_NCCL_NAN_CHECK"] = "0"
424 | 426 |
| 429 | + def _helper_test_extra_cuda_context_by_nvml(self): |
| 430 | + """ |
| 431 | + A helper for `test_extra_cuda_context`, if pynvml is available. |
| 432 | + pynvml provides Python bindings for NVIDIA's NVML library. |
| 433 | + Here we are interested in: nvmlDeviceGetComputeRunningProcesses. |
| 434 | + """ |
| 435 | + import pynvml |
| 436 | + |
| 437 | + pynvml.nvmlInit() |
| 438 | + |
| 439 | + device = torch.device("cuda:%d" % self.rank) |
| 440 | + x = torch.empty((1,), device=device) |
| 441 | + work = c10d.all_reduce(x, async_op=True) |
| 442 | + |
| 443 | + # Wait for non-0 ranks to garbage collect Work -- this is the latest |
| 444 | + # point where extra CUDA context can be created |
| 445 | + if self.rank == 0: |
| 446 | + time.sleep(5) |
| 447 | + del work |
| 448 | + handle = pynvml.nvmlDeviceGetHandleByIndex(self.rank) |
| 449 | + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) |
| 450 | + nprocs = len(processes) |
| 451 | + |
| 452 | + # A barrier for non-0 ranks |
| 453 | + c10d.all_reduce(x) |
| 454 | + torch.cuda.synchronize(device) |
| 455 | + c10d.destroy_process_group() |
| 456 | + self.assertLessEqual( |
| 457 | + nprocs, |
| 458 | + 1, |
| 459 | + f"Found {nprocs} processes creating contexts on {device}, expecting 1 at most", |
| 460 | + ) |
| 461 | + |
| 462 | + def _helper_test_extra_cuda_context_by_memory(self): |
| 463 | + """ |
| 464 | + A helper for `test_extra_cuda_context`, if pynvml is NOT available. |
| 465 | + If an extra context is created, it will show up in device 0's memory usage. |
| 466 | + """ |
| 467 | + device = torch.device("cuda:%d" % self.rank) |
| 468 | + x = torch.empty((1,), device=device) |
| 469 | + # Rank 0 takes a snapshot before collective -- this snapshot should have |
| 470 | + # included rank 0's own context. |
| 471 | + if self.rank == 0: |
| 472 | + # We need this extra sleep for NAVI_ARCH because rccl_init inside init_process_group |
| 473 | + # happens in a separate process and takes longer to finish on NAVI_ARCH. |
| 474 | + # Sleeping here ensures that the init has completed successfully and mem_get_info |
| 475 | + # can report stable numbers. |
| 476 | + if is_arch(NAVI_ARCH): |
| 477 | + time.sleep(5) |
| 478 | + free, total = torch.cuda.mem_get_info(device) |
| 479 | + used_before = float(total - free) |
| 480 | + |
| 481 | + work = c10d.all_reduce(x, async_op=True) |
| 482 | + |
| 483 | + # Wait for non-0 ranks to garbage collect Work -- this is the latest |
| 484 | + # point where extra CUDA context can be created |
| 485 | + if self.rank == 0: |
| 486 | + time.sleep(5) |
| 487 | + free, total = torch.cuda.mem_get_info(device) |
| 488 | + used_after = float(total - free) |
| 489 | + del work |
| 490 | + |
| 491 | + # A barrier for non-0 ranks |
| 492 | + c10d.all_reduce(x) |
| 493 | + torch.cuda.synchronize(device) |
| 494 | + c10d.destroy_process_group() |
| 495 | + if self.rank == 0: |
| 496 | + # If a non-0 rank creates a context on device 0, this assert would |
| 497 | + # fail because one context takes about 1 GB -- much more than the |
| 498 | + # tensor size created in this test. |
| 499 | + self.assertTrue( |
| 500 | + used_after < used_before * 1.5, |
| 501 | + f"{device} used {used_after} bytes after collective, " |
| 502 | + f"50% more than the status before ({used_before} bytes). " |
| 503 | + f"Extra CUDA context may have been created.", |
| 504 | + ) |
| 505 | + |
| 506 | + @requires_nccl() |
| 507 | + @skip_if_lt_x_gpu(2) |
| 508 | + def test_extra_cuda_context(self): |
| 509 | + # Check that non-0 ranks do not create an extra CUDA context on device 0 |
| 510 | + store = c10d.FileStore(self.file_name, self.world_size) |
| 511 | + device = torch.device("cuda:%d" % self.rank) |
| 512 | + c10d.init_process_group( |
| 513 | + backend="nccl", |
| 514 | + store=store, |
| 515 | + rank=self.rank, |
| 516 | + world_size=self.world_size, |
| 517 | + device_id=device, |
| 518 | + ) |
| 519 | + try: |
| 520 | + self._helper_test_extra_cuda_context_by_nvml() |
| 521 | + except ModuleNotFoundError: |
| 522 | + self._helper_test_extra_cuda_context_by_memory() |
| 523 | + |
425 | 525 | @requires_nccl()
426 | 526 | @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
427 | 527 | def test_destruct_before_terminate_pg(self):
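
For reference, here is a minimal standalone sketch of the NVML query that the new `_helper_test_extra_cuda_context_by_nvml` helper relies on. It assumes the `pynvml` package (e.g. from the `nvidia-ml-py` distribution) is installed, and uses device index 0 purely as an example:

```python
import pynvml

pynvml.nvmlInit()
try:
    # Query which processes currently hold a compute (CUDA) context on GPU 0.
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        # Each entry reports the owning PID and its GPU memory usage in bytes.
        print(f"pid={p.pid} usedGpuMemory={p.usedGpuMemory}")
    print(f"{len(procs)} process(es) hold a context on GPU 0")
finally:
    pynvml.nvmlShutdown()
```

In the test above, rank 0 expects this count to be at most 1 for its own device; a higher count would indicate that another rank created an extra context on it.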