4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_distributed_checkpoint.py
@@ -89,7 +89,7 @@ def test_distributed_checkpoint(self, state_dict_type) -> None:
# TODO: add resharding test case.


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestDistributedCheckpoint, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestDistributedCheckpoint, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
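The change above is the pattern repeated across the FSDP test files below: add "xpu" to the device list and pass allow_xpu=True so the XPU variants are actually generated. A minimal, self-contained sketch of how that instantiation machinery is used (the test class and assertions are illustrative, not taken from the PR):

```python
# Illustrative sketch of PyTorch's per-device test generation; class and test
# names are made up for this example.
import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests, TestCase


class ExampleDeviceTest(TestCase):
    def test_ones_sum(self, device):
        # `device` is filled in per generated class, e.g. "cuda:0", "hpu:0", "xpu:0".
        t = torch.ones(2, 2, device=device)
        self.assertEqual(t.sum().item(), 4.0)


# only_for restricts generation to these backends; allow_xpu=True opts the XPU
# variants in, matching what this PR adds at each call site.
devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(ExampleDeviceTest, globals(), only_for=devices, allow_xpu=True)

if __name__ == "__main__":
    run_tests()
```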
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_apply.py
@@ -113,7 +113,7 @@ def test_apply_in_summon_raises_error(self):
transformer.apply(self._init_linear_weights)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestApply, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestApply, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_checkpoint.py
@@ -334,7 +334,7 @@ def test_checkpoint_submodule(self, device, use_reentrant: bool):
self.assertTrue(p1.grad.allclose(p2.grad))


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestFSDPCheckpointSubmodule, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestFSDPCheckpointSubmodule, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_clip_grad_norm.py
@@ -338,7 +338,7 @@ def _test_no_gradients(self, device, use_orig_params: bool):
self.assertEqual(total_norm, torch.tensor(0.0, device=self.device_type))


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestClipGradNorm, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestClipGradNorm, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
2 changes: 1 addition & 1 deletion test/distributed/fsdp/test_fsdp_comm.py
@@ -382,7 +382,7 @@ def forward(self, x: torch.Tensor):
model.module.mlps._wait_unshard_streams_on_current_stream()


-devices = ("cuda", "hpu")
+devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(TestCommunication, globals(), only_for=devices)
instantiate_device_type_tests(TestExplicitUnshard, globals(), only_for=devices)
if __name__ == "__main__":
12 changes: 6 additions & 6 deletions test/distributed/fsdp/test_fsdp_core.py
@@ -512,11 +512,11 @@ def _patch_use_unsharded_views(self, new_use_unsharded_views: Callable):
FlatParamHandle._use_unsharded_views = orig_use_unsharded_views


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestHooks, globals(), only_for=devices)
-instantiate_device_type_tests(TestParityWithDDP, globals(), only_for=devices)
-instantiate_device_type_tests(TestNoGrad, globals(), only_for=devices)
-instantiate_device_type_tests(TestParamInit, globals(), only_for=devices)
-instantiate_device_type_tests(TestAutograd, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestHooks, globals(), only_for=devices, allow_xpu=True)
+instantiate_device_type_tests(TestParityWithDDP, globals(), only_for=devices, allow_xpu=True)
+instantiate_device_type_tests(TestNoGrad, globals(), only_for=devices, allow_xpu=True)
+instantiate_device_type_tests(TestParamInit, globals(), only_for=devices, allow_xpu=True)
+instantiate_device_type_tests(TestAutograd, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_dtensor_state_dict.py
@@ -285,9 +285,9 @@ def test_raises_warning_or_errors(self):
FSDP.optim_state_dict(model, optim)


-devices = ("cuda", "hpu")
+devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(
-    TestFSDPWithDeviceMeshAndDTensor, globals(), only_for=devices
+    TestFSDPWithDeviceMeshAndDTensor, globals(), only_for=devices, allow_xpu=True
)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_exec_order.py
@@ -211,7 +211,7 @@ def test_train_eval(self, device, sharding_strategy: ShardingStrategy):
# an `AssertionError` will be raised above for both sharding strategies


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestFSDPExecOrder, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestFSDPExecOrder, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_fine_tune.py
@@ -404,7 +404,7 @@ def _test_parity_with_non_frozen_fsdp(
self.assertEqual(param, ref_param)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestFSDPFineTune, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestFSDPFineTune, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_fx.py
@@ -113,7 +113,7 @@ def test_symbolic_tracing_outputs(self):
self.assertEqual(exec_info.visited_params, set(exec_info.param_forward_order))


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestSymbolicTracing, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestSymbolicTracing, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_input.py
@@ -70,7 +70,7 @@ def forward(self, input):
optim.zero_grad()


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestInput, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestInput, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
2 changes: 1 addition & 1 deletion test/distributed/fsdp/test_fsdp_multiple_forward.py
@@ -73,7 +73,7 @@ def test_multi_forward(self):
self.assertEqual(ddp_state, fsdp_state)


-devices = ("cpu", "hpu")
+devices = ("cpu", "hpu", "xpu")
instantiate_device_type_tests(TestMultiForward, globals(), only_for=devices)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_multiple_wrapping.py
@@ -61,7 +61,7 @@ def test_multiple_wrapping(self, device):
self.assertEqual(output, rewrapped_output)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestMultipleWrapping, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestMultipleWrapping, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_overlap.py
@@ -256,9 +256,9 @@ def world_size(self):
return 2


-devices = ("cuda", "hpu")
+devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(
-    TestForwardOverlapWorldSizeOne, globals(), only_for=devices
+    TestForwardOverlapWorldSizeOne, globals(), only_for=devices, allow_xpu=True
)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_pure_fp16.py
@@ -151,7 +151,7 @@ def _test_fp16_dtypes(
self.assertEqual(param.grad.dtype, torch.float16)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestPureFP16, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestPureFP16, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_traversal.py
@@ -61,7 +61,7 @@ def test_fsdp_modules(self):
)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestTraversal, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestTraversal, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_fsdp_uneven.py
@@ -68,7 +68,7 @@ def test_one_iteration(self, device):
self.assertEqual(ref_weight_out, weight_out)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestUnevenParamShard, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestUnevenParamShard, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_hsdp_dtensor_state_dict.py
@@ -324,9 +324,9 @@ def forward(self, x):
self.assertIsInstance(state["exp_avg_sq"], torch.Tensor)


-devices = ("cuda", "hpu")
+devices = ("cuda", "hpu", "xpu")
instantiate_device_type_tests(
-    TestHSDPWithDeviceMeshAndDTensor, globals(), only_for=devices
+    TestHSDPWithDeviceMeshAndDTensor, globals(), only_for=devices, allow_xpu=True
)
if __name__ == "__main__":
run_tests()
4 changes: 2 additions & 2 deletions test/distributed/fsdp/test_utils.py
@@ -129,7 +129,7 @@ def fill_fn(x):
self.assertEqual(torch.sum(x), 0)


-devices = ("cuda", "hpu")
-instantiate_device_type_tests(TestUtils, globals(), only_for=devices)
+devices = ("cuda", "hpu", "xpu")
+instantiate_device_type_tests(TestUtils, globals(), only_for=devices, allow_xpu=True)
if __name__ == "__main__":
run_tests()
10 changes: 9 additions & 1 deletion test/distributed/tensor/test_dtensor_compile.py
@@ -43,6 +43,7 @@
skipIfTorchDynamo,
TEST_CUDA,
TEST_HPU,
+TEST_XPU,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
@@ -108,7 +109,14 @@ def tearDown(self):

@property
def device_type(self) -> str:
-return "cuda" if TEST_CUDA else "hpu" if TEST_HPU else "cpu"
+if TEST_CUDA:
+    return "cuda"
+elif TEST_HPU:
+    return "hpu"
+elif TEST_XPU:
+    return "xpu"
+else:
+    return "cpu"

@property
def world_size(self) -> int:
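The branching above mirrors the existing TEST_CUDA/TEST_HPU checks, with the CPU fallback preserved. In recent PyTorch builds the same selection can be written once against the generic accelerator API; a hedged sketch of that alternative (not what the PR does, and the helper name is made up):

```python
# Hedged alternative sketch: pick the device type via torch.accelerator
# (available in recent PyTorch releases) instead of per-backend TEST_* flags.
import torch


def detect_device_type() -> str:  # illustrative helper, not part of the PR
    acc = torch.accelerator.current_accelerator()  # e.g. device("cuda"), device("xpu"), or None
    return acc.type if acc is not None else "cpu"
```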
10 changes: 7 additions & 3 deletions test/distributed/tensor/test_random_ops.py
@@ -19,16 +19,20 @@
)
from torch.distributed.tensor.debug import CommDebugMode
from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module
-from torch.testing._internal.common_utils import run_tests, TEST_HPU
+from torch.testing._internal.common_utils import run_tests, TEST_HPU, TEST_XPU
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
skip_if_lt_x_gpu,
skip_unless_torch_gpu,
with_comms,
)


-TYPE_DEVICE = "hpu" if TEST_HPU else "cuda"
+if TEST_XPU:
+    TYPE_DEVICE = "xpu"
+elif TEST_HPU:
+    TYPE_DEVICE = "hpu"
+else:
+    TYPE_DEVICE = "cuda"


class DistTensorRandomInitTest(DTensorTestBase):
4 changes: 2 additions & 2 deletions test/distributed/tensor/test_redistribute.py
@@ -9,7 +9,7 @@
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor._collective_utils import shard_dim_alltoall
from torch.distributed.tensor.debug import CommDebugMode
-from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU
+from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU, TEST_XPU
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms,
@@ -366,7 +366,7 @@ def test_redistribute_shard_dim_change(self):
local_out_dt = out_dt.to_local()
local_expected_dt = expected_dt.to_local()
self.assertEqual(out_dt.to_local(), expected_dt.to_local())
-if TEST_HPU or TEST_CUDA:
+if TEST_HPU or TEST_CUDA or TEST_XPU:
self.assertEqual(
comm_mode.get_comm_counts()[
torch.ops._dtensor.shard_dim_alltoall
2 changes: 1 addition & 1 deletion test/distributed/test_backends.py
@@ -44,7 +44,7 @@ def test_create_pg(self, device) -> None:
dist.destroy_process_group()


-devices = ["cpu", "cuda", "hpu"]
+devices = ["cpu", "cuda", "hpu", "xpu"]
instantiate_device_type_tests(TestMiscCollectiveUtils, globals(), only_for=devices)

if __name__ == "__main__":
9 changes: 9 additions & 0 deletions test/distributed/test_functional_api.py
@@ -34,6 +34,7 @@
skipIfHpu,
TEST_CUDA,
TEST_HPU,
+TEST_XPU,
TestCase,
)

@@ -66,6 +67,9 @@
DEVICE = "hpu"
elif TEST_CUDA:
devices.append("cuda")
+elif TEST_XPU:
+    devices.append("xpu")
+    DEVICE = "xpu"


def new_subgroups(group_size: int, pg_tag=None):
@@ -474,6 +478,8 @@ def allred_mesh_dim(input):
# And then set the BACKEND variable appropriately.
if TEST_HPU:
BACKEND = dist.Backend.HCCL
+elif TEST_XPU:
+    BACKEND = dist.Backend.XCCL


# allows you to check for multiple accelerator irrespective of device type
@@ -486,6 +492,9 @@ def exit_if_lt_x_accelerators(x):
elif TEST_HPU:
if torch.hpu.device_count() < x:
sys.exit(TEST_SKIPS[f"multi-hpu-{x}"].exit_code)
+elif TEST_XPU:
+    if torch.xpu.device_count() < x:
+        sys.exit(TEST_SKIPS[f"multi-xpu-{x}"].exit_code)


def with_comms(func=None):
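The guard above branches per backend inside exit_if_lt_x_accelerators. A hedged sketch of how such a count check could be written once against the generic accelerator API (shown for illustration only; the helper name and exit code are assumptions, not the PR's code):

```python
# Hedged sketch: backend-agnostic accelerator-count guard, an alternative to
# branching on TEST_CUDA / TEST_HPU / TEST_XPU; names here are illustrative.
import sys

import torch


def exit_if_fewer_accelerators_than(x: int, exit_code: int = 0) -> None:
    if torch.accelerator.is_available() and torch.accelerator.device_count() < x:
        sys.exit(exit_code)
```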
6 changes: 6 additions & 0 deletions torch/testing/_internal/common_device_type.py
@@ -1586,6 +1586,10 @@ class dtypesIfCUDA(dtypes):
def __init__(self, *args):
super().__init__(*args, device_type="cuda")

+# Overrides specified dtypes on XPU.
+class dtypesIfXPU(dtypes):
+    def __init__(self, *args):
+        super().__init__(*args, device_type="xpu")

class dtypesIfMPS(dtypes):
def __init__(self, *args):
@@ -1951,6 +1955,8 @@ def skipMPS(fn):
def skipHPU(fn):
return skipHPUIf(True, "test doesn't work on HPU backend")(fn)

+def skipXPU(fn):
+    return skipXPUIf(True, "test doesn't work on XPU backend")(fn)

def skipPRIVATEUSE1(fn):
return skipPRIVATEUSE1If(True, "test doesn't work on privateuse1 backend")(fn)
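For context, a hedged usage sketch of the two helpers added above, dtypesIfXPU and skipXPU, in a device-type test (the test class and ops are illustrative, not part of the PR):

```python
# Illustrative use of the new XPU decorators alongside the existing machinery.
import torch
from torch.testing._internal.common_device_type import (
    dtypes,
    dtypesIfXPU,
    instantiate_device_type_tests,
    skipXPU,
)
from torch.testing._internal.common_utils import run_tests, TestCase


class ExampleOpTest(TestCase):
    @dtypes(torch.float32, torch.float64)
    @dtypesIfXPU(torch.float32)  # override: only the fp32 variant runs on XPU
    def test_add(self, device, dtype):
        a = torch.ones(4, device=device, dtype=dtype)
        self.assertEqual((a + a).sum().item(), 8.0)

    @skipXPU  # skipped when instantiated for the XPU device type
    def test_not_on_xpu(self, device):
        pass


instantiate_device_type_tests(ExampleOpTest, globals(), allow_xpu=True)

if __name__ == "__main__":
    run_tests()
```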
16 changes: 12 additions & 4 deletions torch/testing/_internal/common_distributed.py
@@ -44,6 +44,7 @@
TestCase,
run_tests,
TEST_HPU,
+TEST_XPU,
)
from torch.testing._internal.distributed.multi_threaded_pg import (
_install_threaded_pg,
@@ -105,6 +106,8 @@ class DistTestCases:
backend_feature["plugin"] = set()
if TEST_HPU:
backend_feature["hpu"] = {"hccl"}
+if TEST_XPU:
+    backend_feature["xpu"] = {"xccl"}


def skip_if_no_gpu(func):
@@ -120,6 +123,8 @@ def wrapper(*args, **kwargs):
sys.exit(TEST_SKIPS[f"multi-gpu-{world_size}"].exit_code)
if TEST_HPU and torch.hpu.device_count < world_size:
sys.exit(TEST_SKIPS[f"multi-gpu-{world_size}"].exit_code)
+if TEST_XPU and torch.xpu.device_count() < world_size:
+    sys.exit(TEST_SKIPS[f"multi-xpu-{world_size}"].exit_code)

return func(*args, **kwargs)

@@ -199,6 +204,8 @@ def wrapper(*args, **kwargs):
return func(*args, **kwargs)
if TEST_HPU and torch.hpu.device_count() >= x:
return func(*args, **kwargs)
+if TEST_XPU and torch.xpu.device_count() >= x:
+    return func(*args, **kwargs)
sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)

return wrapper
@@ -510,7 +517,8 @@ def init_multigpu_helper(world_size: int, backend: str):
nGPUs = torch.cuda.device_count()
if TEST_HPU:
nGPUs = torch.hpu.device_count()

+if TEST_XPU:
+    nGPUs = torch.xpu.device_count()
visible_devices = range(nGPUs)

# If rank is less than or equal to number of available GPU's
@@ -953,8 +961,8 @@ def create_pg(self, device):
rank=self.rank,
store=store
)
-if "nccl" in self.backend(device):
-    torch.cuda.set_device(self.rank)
+if "nccl" in self.backend(device) or "xccl" in self.backend(device):
+    torch.accelerator.set_device_index(self.rank)
return torch.distributed.distributed_c10d._get_default_group()

def rank_to_device(self, device):
Expand Down Expand Up @@ -1347,7 +1355,7 @@ def _dynamo_dist_per_rank_init(rank, world_size, init_pg=True, fake_pg=False):
# To avoid multiple inheritance from _dynamo.test_case.TestCase and MultiProcessTestCase,
# Just manually implement the most important part of the dynamo behavior to reset/clear.
if not fake_pg:
-torch.cuda.set_device(rank)
+torch.accelerator.set_device_index(rank)
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '6789'
if init_pg:
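The create_pg and _dynamo_dist_per_rank_init changes above route device binding through torch.accelerator.set_device_index so the same code serves CUDA and XPU ranks. A hedged, standalone sketch of that pattern (backend names, the port, and the helper are assumptions for illustration, not taken from the PR):

```python
# Hedged sketch: bind each rank to its accelerator and pick a matching backend.
# Assumes a recent PyTorch build where the NCCL (CUDA) or XCCL (XPU) backend is available.
import os

import torch
import torch.distributed as dist


def init_pg_for_rank(rank: int, world_size: int) -> None:  # illustrative helper
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")

    acc = torch.accelerator.current_accelerator()
    if acc is None:
        backend = "gloo"
    else:
        # Same call the PR uses in place of torch.cuda.set_device(rank).
        torch.accelerator.set_device_index(rank)
        backend = {"cuda": "nccl", "xpu": "xccl"}.get(acc.type, "gloo")

    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
```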