Add float4_e2m1fn_x2 support for concat (#2315)

CuiYifeng · web-flow · commit 993ab70da750 · 2025-11-18T02:20:40.000Z
This PR adds support for the `float4_e2m1fn_x2` data type to the `cat`
(concatenate) kernel on XPU devices.
diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp
@@ -395,7 +395,8 @@ void cat_out_kernel(
         kBool,
         kBFloat16,
         AT_EXPAND(AT_FLOAT8_TYPES),
-        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+        kFloat4_e2m1fn_x2);
   } else {
     offset = 0;
     for (j = 0; j < numInputs; j++) {
diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py
@@ -7,6 +7,9 @@
 from torch.testing._internal.common_dtype import float8_types_and
 from torch.testing._internal.common_utils import run_tests, TestCase
 
+cpu_device = torch.device("cpu")
+xpu_device = torch.device("xpu")
+
 
 class TestTorchMethod(TestCase):
     def _create_input_tensors(self, shape, dtype, memory_format=None):
@@ -61,6 +64,21 @@ def test_cat_simple(self, dtype):
 
         self._test_cat_float8_core(tensors, dim, dtype)
 
+    def _float4_dummy_tensor(self, shape, device):  # helper: build a float4_e2m1fn_x2 tensor of `shape` on `device`
+        data = torch.ones(shape, dtype=torch.uint8, device=device)  # uint8 buffer with every element set to 1
+        return data.view(torch.float4_e2m1fn_x2)  # reinterpret the uint8 tensor as float4_e2m1fn_x2 via a dtype view
+
+    def test_cat_float4_simple(self):  # joining float4_e2m1fn_x2 tensors on XPU must give the same bytes as on CPU
+        input_cpu1 = self._float4_dummy_tensor([2, 2, 6], device=cpu_device)
+        input_cpu2 = self._float4_dummy_tensor([2, 2, 6], device=cpu_device)
+        output_cpu = torch.stack([input_cpu1, input_cpu2]).view(torch.uint8)  # CPU reference, viewed as uint8 for comparison
+        # NOTE(review): this calls torch.stack, not torch.cat; presumably stack reaches the cat kernel internally - confirm.
+        input_xpu1 = self._float4_dummy_tensor([2, 2, 6], device=xpu_device)
+        input_xpu2 = self._float4_dummy_tensor([2, 2, 6], device=xpu_device)
+        output_xpu = torch.stack([input_xpu1, input_xpu2]).view(torch.uint8)  # same operation on XPU, viewed as uint8
+        # NOTE(review): both inputs come from the same all-ones helper, so element ordering in the result is not checked.
+        self.assertEqual(output_xpu, output_cpu)
+
     def test_cat_8d(self, dtype=torch.float):
         input1 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype)
         input2 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype)