4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_distributed_checkpoint.py
@@ -89,7 +89,7 @@ def test_distributed_checkpoint(self, state_dict_type) -> None:
# TODO: add resharding test case.


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestDistributedCheckpoint, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestDistributedCheckpoint, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
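The change above is the pattern repeated across the FSDP test files below: add "xpu" to the device list and pass allow_xpu=True so the XPU variants are actually generated. A minimal, self-contained sketch of how that instantiation machinery is used (the test class and assertions are illustrative, not taken from the PR):

```python
# Illustrative sketch of PyTorch's per-device test generation; class and test
# names are made up for this example.
import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests, TestCase


class ExampleDeviceTest(TestCase):
    def test_ones_sum(self, device):
        # `device` is filled in per generated class, e.g. "cuda:0", "hpu:0", "xpu:0".
        t = torch.ones(2, 2, device=device)
        self.assertEqual(t.sum().item(), 4.0)


# only_for restricts generation to these backends; allow_xpu=True opts the XPU
# variants in, matching what this PR adds at each call site.
devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(ExampleDeviceTest, globals(), only_for=devices, allow_xpu=True)

if __name__ == "__main__":
    run_tests()
```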
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_apply.py
@@ -113,7 +113,7 @@ def test_apply_in_summon_raises_error(self):
transformer.apply(self._init_linear_weights)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestApply, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestApply, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_checkpoint.py
@@ -334,7 +334,7 @@ def test_checkpoint_submodule(self, device, use_reentrant: bool):
self.assertTrue(p1.grad.allclose(p2.grad))


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestFSDPCheckpointSubmodule, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestFSDPCheckpointSubmodule, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -338,7 +338,7 @@ def _test_no_gradients(self, device, use_orig_params: bool):
self.assertEqual(total_norm, torch.tensor(0.0, device=self.device_type))


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestClipGradNorm, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestClipGradNorm, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
2 changes: 1 addition & 1 deletion test/distributed/fsdp/test_fsdp_comm.py
@@ -382,7 +382,7 @@ def forward(self, x: torch.Tensor):
model.module.mlps._wait_unshard_streams_on_current_stream()


-devices = ("cuda", "hpu")
+devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(TestCommunication, globals(), only_for=devices)
instantiate_device_type_tests(TestExplicitUnshard, globals(), only_for=devices)
if __name__ == "__main__":
12 changes: 6 additions & 6 deletions test/distributed/fsdp/test_fsdp_core.py
@@ -512,11 +512,11 @@ def _patch_use_unsharded_views(self, new_use_unsharded_views: Callable):
FlatParamHandle._use_unsharded_views = orig_use_unsharded_views


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestHooks, globals(), only_for=devices)
-instantiate_device_type_tests(TestParityWithDDP, globals(), only_for=devices)
-instantiate_device_type_tests(TestNoGrad, globals(), only_for=devices)
-instantiate_device_type_tests(TestParamInit, globals(), only_for=devices)
-instantiate_device_type_tests(TestAutograd, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestHooks, globals(), only_for=devices, allow_xpu=True)
+instantiate_device_type_tests(TestParityWithDDP, globals(), only_for=devices, allow_xpu=True)
+instantiate_device_type_tests(TestNoGrad, globals(), only_for=devices, allow_xpu=True)
+instantiate_device_type_tests(TestParamInit, globals(), only_for=devices, allow_xpu=True)
+instantiate_device_type_tests(TestAutograd, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_dtensor_state_dict.py
@@ -285,9 +285,9 @@ def test_raises_warning_or_errors(self):
FSDP.optim_state_dict(model, optim)


-devices = ("cuda", "hpu")
+devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(
-    TestFSDPWithDeviceMeshAndDTensor, globals(), only_for=devices
+    TestFSDPWithDeviceMeshAndDTensor, globals(), only_for=devices, allow_xpu=True
)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_exec_order.py
@@ -211,7 +211,7 @@ def test_train_eval(self, device, sharding_strategy: ShardingStrategy):
# an `AssertionError` will be raised above for both sharding strategies


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestFSDPExecOrder, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestFSDPExecOrder, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_fine_tune.py
@@ -404,7 +404,7 @@ def _test_parity_with_non_frozen_fsdp(
self.assertEqual(param, ref_param)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestFSDPFineTune, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestFSDPFineTune, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_fx.py
@@ -113,7 +113,7 @@ def test_symbolic_tracing_outputs(self):
self.assertEqual(exec_info.visited_params, set(exec_info.param_forward_order))


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestSymbolicTracing, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestSymbolicTracing, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_input.py
@@ -70,7 +70,7 @@ def forward(self, input):
optim.zero_grad()


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestInput, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestInput, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
2 changes: 1 addition & 1 deletion test/distributed/fsdp/test_fsdp_multiple_forward.py
@@ -73,7 +73,7 @@ def test_multi_forward(self):
self.assertEqual(ddp_state, fsdp_state)


-devices = ("cpu", "hpu")
+devices = ("cpu", "hpu", "xpu")
instantiate_device_type_tests(TestMultiForward, globals(), only_for=devices)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_multiple_wrapping.py
@@ -61,7 +61,7 @@ def test_multiple_wrapping(self, device):
self.assertEqual(output, rewrapped_output)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestMultipleWrapping, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestMultipleWrapping, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_overlap.py
@@ -256,9 +256,9 @@ def world_size(self):
return 2


-devices = ("cuda", "hpu")
+devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(
-    TestForwardOverlapWorldSizeOne, globals(), only_for=devices
+    TestForwardOverlapWorldSizeOne, globals(), only_for=devices, allow_xpu=True
)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_pure_fp16.py
@@ -151,7 +151,7 @@ def _test_fp16_dtypes(
self.assertEqual(param.grad.dtype, torch.float16)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestPureFP16, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestPureFP16, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_traversal.py
@@ -61,7 +61,7 @@ def test_fsdp_modules(self):
)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestTraversal, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestTraversal, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_uneven.py
@@ -68,7 +68,7 @@ def test_one_iteration(self, device):
self.assertEqual(ref_weight_out, weight_out)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestUnevenParamShard, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestUnevenParamShard, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_hsdp_dtensor_state_dict.py
@@ -324,9 +324,9 @@ def forward(self, x):
self.assertIsInstance(state["exp_avg_sq"], torch.Tensor)


-devices = ("cuda", "hpu")
+devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(
-    TestHSDPWithDeviceMeshAndDTensor, globals(), only_for=devices
+    TestHSDPWithDeviceMeshAndDTensor, globals(), only_for=devices, allow_xpu=True
)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_utils.py
@@ -129,7 +129,7 @@ def fill_fn(x):
self.assertEqual(torch.sum(x), 0)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestUtils, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestUtils, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
10 changes: 9 additions & 1 deletion test/distributed/tensor/test_dtensor_compile.py
@@ -43,6 +43,7 @@
skipIfTorchDynamo,
TEST_CUDA,
TEST_HPU,
+TEST_XPU,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
@@ -108,7 +109,14 @@ def tearDown(self):

@property
def device_type(self) -> str:
-return "cuda" if TEST_CUDA else "hpu" if TEST_HPU else "cpu"
+if TEST_CUDA:
+    return "cuda"
+elif TEST_HPU:
+    return "hpu"
+elif TEST_XPU:
+    return "xpu"
+else:
+    return "cpu"

@property
def world_size(self) -> int:
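The branching above mirrors the existing TEST_CUDA/TEST_HPU checks, with the CPU fallback preserved. In recent PyTorch builds the same selection can be written once against the generic accelerator API; a hedged sketch of that alternative (not what the PR does, and the helper name is made up):

```python
# Hedged alternative sketch: pick the device type via torch.accelerator
# (available in recent PyTorch releases) instead of per-backend TEST_* flags.
import torch


def detect_device_type() -> str:  # illustrative helper, not part of the PR
    acc = torch.accelerator.current_accelerator()  # e.g. device("cuda"), device("xpu"), or None
    return acc.type if acc is not None else "cpu"
```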
10 changes: 7 additions & 3 deletions test/distributed/tensor/test_random_ops.py
@@ -19,16 +19,20 @@
)
from torch.distributed.tensor.debug import CommDebugMode
from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module
-from torch.testing._internal.common_utils import run_tests, TEST_HPU
+from torch.testing._internal.common_utils import run_tests, TEST_HPU, TEST_XPU
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
skip_if_lt_x_gpu,
skip_unless_torch_gpu,
with_comms,
)


-TYPE_DEVICE = "hpu" if TEST_HPU else "cuda"
+if TEST_XPU:
+    TYPE_DEVICE = "xpu"
+elif TEST_HPU:
+    TYPE_DEVICE = "hpu"
+else:
+    TYPE_DEVICE = "cuda"


class DistTensorRandomInitTest(DTensorTestBase):
4 changes: 2 additions & 2 deletions test/distributed/tensor/test_redistribute.py
@@ -9,7 +9,7 @@
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor._collective_utils import shard_dim_alltoall
from torch.distributed.tensor.debug import CommDebugMode
-from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU
+from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU, TEST_XPU
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms,
@@ -366,7 +366,7 @@ def test_redistribute_shard_dim_change(self):
local_out_dt = out_dt.to_local()
local_expected_dt = expected_dt.to_local()
self.assertEqual(out_dt.to_local(), expected_dt.to_local())
-if TEST_HPU or TEST_CUDA:
+if TEST_HPU or TEST_CUDA or TEST_XPU:
self.assertEqual(
comm_mode.get_comm_counts()[
torch.ops._dtensor.shard_dim_alltoall
2 changes: 1 addition & 1 deletion test/distributed/test_backends.py
@@ -44,7 +44,7 @@ def test_create_pg(self, device) -> None:
dist.destroy_process_group()


-devices = ["cpu", "cuda", "hpu"]
+devices = ["cpu", "cuda", "hpu", "xpu"]
instantiate_device_type_tests(TestMiscCollectiveUtils, globals(), only_for=devices)

if __name__ == "__main__":
9 changes: 9 additions & 0 deletions test/distributed/test_functional_api.py
@@ -34,6 +34,7 @@
skipIfHpu,
TEST_CUDA,
TEST_HPU,
+TEST_XPU,
TestCase,
)

@@ -66,6 +67,9 @@
DEVICE = "hpu"
elif TEST_CUDA:
devices.append("cuda")
+elif TEST_XPU:
+    devices.append("xpu")
+    DEVICE = "xpu"


def new_subgroups(group_size: int, pg_tag=None):
@@ -474,6 +478,8 @@ def allred_mesh_dim(input):
# And then set the BACKEND variable appropriately.
if TEST_HPU:
BACKEND = dist.Backend.HCCL
+elif TEST_XPU:
+    BACKEND = dist.Backend.XCCL


# allows you to check for multiple accelerator irrespective of device type
@@ -486,6 +492,9 @@ def exit_if_lt_x_accelerators(x):
elif TEST_HPU:
if torch.hpu.device_count() < x:
sys.exit(TEST_SKIPS[f"multi-hpu-{x}"].exit_code)
+elif TEST_XPU:
+    if torch.xpu.device_count() < x:
+        sys.exit(TEST_SKIPS[f"multi-xpu-{x}"].exit_code)


def with_comms(func=None):
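The guard above branches per backend inside exit_if_lt_x_accelerators. A hedged sketch of how such a count check could be written once against the generic accelerator API (shown for illustration only; the helper name and exit code are assumptions, not the PR's code):

```python
# Hedged sketch: backend-agnostic accelerator-count guard, an alternative to
# branching on TEST_CUDA / TEST_HPU / TEST_XPU; names here are illustrative.
import sys

import torch


def exit_if_fewer_accelerators_than(x: int, exit_code: int = 0) -> None:
    if torch.accelerator.is_available() and torch.accelerator.device_count() < x:
        sys.exit(exit_code)
```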
6 changes: 6 additions & 0 deletions torch/testing/_internal/common_device_type.py
@@ -1586,6 +1586,10 @@ class dtypesIfCUDA(dtypes):
def __init__(self, *args):
super().__init__(*args, device_type="cuda")

+# Overrides specified dtypes on XPU.
+class dtypesIfXPU(dtypes):
+    def __init__(self, *args):
+        super().__init__(*args, device_type="xpu")

class dtypesIfMPS(dtypes):
def __init__(self, *args):
@@ -1951,6 +1955,8 @@ def skipMPS(fn):
def skipHPU(fn):
return skipHPUIf(True, "test doesn't work on HPU backend")(fn)

+def skipXPU(fn):
+    return skipXPUIf(True, "test doesn't work on XPU backend")(fn)

def skipPRIVATEUSE1(fn):
return skipPRIVATEUSE1If(True, "test doesn't work on privateuse1 backend")(fn)
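For context, a hedged usage sketch of the two helpers added above, dtypesIfXPU and skipXPU, in a device-type test (the test class and ops are illustrative, not part of the PR):

```python
# Illustrative use of the new XPU decorators alongside the existing machinery.
import torch
from torch.testing._internal.common_device_type import (
    dtypes,
    dtypesIfXPU,
    instantiate_device_type_tests,
    skipXPU,
)
from torch.testing._internal.common_utils import run_tests, TestCase


class ExampleOpTest(TestCase):
    @dtypes(torch.float32, torch.float64)
    @dtypesIfXPU(torch.float32)  # override: only the fp32 variant runs on XPU
    def test_add(self, device, dtype):
        a = torch.ones(4, device=device, dtype=dtype)
        self.assertEqual((a + a).sum().item(), 8.0)

    @skipXPU  # skipped when instantiated for the XPU device type
    def test_not_on_xpu(self, device):
        pass


instantiate_device_type_tests(ExampleOpTest, globals(), allow_xpu=True)

if __name__ == "__main__":
    run_tests()
```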
16 changes: 12 additions & 4 deletions torch/testing/_internal/common_distributed.py
@@ -44,6 +44,7 @@
TestCase,
run_tests,
TEST_HPU,
+TEST_XPU,
)
from torch.testing._internal.distributed.multi_threaded_pg import (
_install_threaded_pg,
@@ -105,6 +106,8 @@ class DistTestCases:
backend_feature["plugin"] = set()
if TEST_HPU:
backend_feature["hpu"] = {"hccl"}
+if TEST_XPU:
+    backend_feature["xpu"] = {"xccl"}


def skip_if_no_gpu(func):
@@ -120,6 +123,8 @@ def wrapper(*args, **kwargs):
sys.exit(TEST_SKIPS[f"multi-gpu-{world_size}"].exit_code)
if TEST_HPU and torch.hpu.device_count < world_size:
sys.exit(TEST_SKIPS[f"multi-gpu-{world_size}"].exit_code)
+if TEST_XPU and torch.xpu.device_count() < world_size:
+    sys.exit(TEST_SKIPS[f"multi-xpu-{world_size}"].exit_code)

return func(*args, **kwargs)

@@ -199,6 +204,8 @@ def wrapper(*args, **kwargs):
return func(*args, **kwargs)
if TEST_HPU and torch.hpu.device_count() >= x:
return func(*args, **kwargs)
+if TEST_XPU and torch.xpu.device_count() >= x:
+    return func(*args, **kwargs)
sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)

return wrapper
@@ -510,7 +517,8 @@ def init_multigpu_helper(world_size: int, backend: str):
nGPUs = torch.cuda.device_count()
if TEST_HPU:
nGPUs = torch.hpu.device_count()

+if TEST_XPU:
+    nGPUs = torch.xpu.device_count()
visible_devices = range(nGPUs)

# If rank is less than or equal to number of available GPU's
@@ -953,8 +961,8 @@ def create_pg(self, device):
rank=self.rank,
store=store
)
-if "nccl" in self.backend(device):
-    torch.cuda.set_device(self.rank)
+if "nccl" in self.backend(device) or "xccl" in self.backend(device):
+    torch.accelerator.set_device_index(self.rank)
return torch.distributed.distributed_c10d._get_default_group()

def rank_to_device(self, device):
Expand Down Expand Up @@ -1347,7 +1355,7 @@ def _dynamo_dist_per_rank_init(rank, world_size, init_pg=True, fake_pg=False):
# To avoid multiple inheritance from _dynamo.test_case.TestCase and MultiProcessTestCase,
# Just manually implement the most important part of the dynamo behavior to reset/clear.
if not fake_pg:
-torch.cuda.set_device(rank)
+torch.accelerator.set_device_index(rank)
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '6789'
if init_pg:
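The create_pg and _dynamo_dist_per_rank_init changes above route device binding through torch.accelerator.set_device_index so the same code serves CUDA and XPU ranks. A hedged, standalone sketch of that pattern (backend names, the port, and the helper are assumptions for illustration, not taken from the PR):

```python
# Hedged sketch: bind each rank to its accelerator and pick a matching backend.
# Assumes a recent PyTorch build where the NCCL (CUDA) or XCCL (XPU) backend is available.
import os

import torch
import torch.distributed as dist


def init_pg_for_rank(rank: int, world_size: int) -> None:  # illustrative helper
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")

    acc = torch.accelerator.current_accelerator()
    if acc is None:
        backend = "gloo"
    else:
        # Same call the PR uses in place of torch.cuda.set_device(rank).
        torch.accelerator.set_device_index(rank)
        backend = {"cuda": "nccl", "xpu": "xccl"}.get(acc.type, "gloo")

    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
```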