
Commit 3a325f3

Support PLAIN_INT32 for AWQ on Intel GPU

1 parent fabebb2 · commit 3a325f3

File tree

test/prototype/test_awq.py
test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py

2 files changed: +36 −0 lines changed

test/prototype/test_awq.py

Lines changed: 16 additions & 0 deletions
@@ -51,6 +51,10 @@ def forward(self, x):
     devices.append("cuda")


+if torch.xpu.is_available():
+    devices.append("xpu")
+
+
 class TestAWQ(TestCase):
     def test_awq_config(self):
         base_config = Int4WeightOnlyConfig()
@@ -79,6 +83,10 @@ def test_awq_functionality(self, device):
         # baseline quantization
         if device == "cuda":
             base_config = Int4WeightOnlyConfig(group_size=group_size)
+        elif device == "xpu":
+            base_config = Int4WeightOnlyConfig(
+                group_size=group_size, int4_packing_format="plain_int32"
+            )
         elif device == "cpu":
             base_config = Int4WeightOnlyConfig(
                 group_size=group_size, int4_packing_format="opaque"
@@ -137,6 +145,10 @@ def test_awq_loading(self, device):
         # calibrate
         if device == "cuda":
             base_config = Int4WeightOnlyConfig(group_size=group_size)
+        elif device == "xpu":
+            base_config = Int4WeightOnlyConfig(
+                group_size=group_size, int4_packing_format="plain_int32"
+            )
         elif device == "cpu":
             base_config = Int4WeightOnlyConfig(
                 group_size=group_size, int4_packing_format="opaque"
@@ -198,6 +210,10 @@ def test_awq_loading_vllm(self, device):
         # calibrate
         if device == "cuda":
             base_config = Int4WeightOnlyConfig(group_size=group_size)
+        elif device == "xpu":
+            base_config = Int4WeightOnlyConfig(
+                group_size=group_size, int4_packing_format="plain_int32"
+            )
         elif device == "cpu":
             base_config = Int4WeightOnlyConfig(
                 group_size=group_size, int4_packing_format="opaque"
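
For context on the branch added above: on an Intel GPU (XPU) device, the AWQ baseline config now packs int4 weights in the plain_int32 format, while CUDA keeps the default packing and CPU keeps "opaque". A minimal usage sketch built only from APIs visible in this diff; the helper name make_base_config and the layer shapes are illustrative, not part of the commit:

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Hypothetical helper mirroring the device dispatch this commit adds to the tests.
def make_base_config(device: str, group_size: int = 128) -> Int4WeightOnlyConfig:
    if device == "cuda":
        return Int4WeightOnlyConfig(group_size=group_size)
    elif device == "xpu":
        # New in this commit: plain int32 packing for Intel GPU
        return Int4WeightOnlyConfig(
            group_size=group_size, int4_packing_format="plain_int32"
        )
    elif device == "cpu":
        return Int4WeightOnlyConfig(
            group_size=group_size, int4_packing_format="opaque"
        )
    raise ValueError(f"unsupported device: {device}")

if torch.xpu.is_available():
    linear = torch.nn.Linear(128, 256, bias=False, dtype=torch.bfloat16, device="xpu")
    quantize_(linear, make_base_config("xpu"))  # weight is quantized in-place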

test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py

Lines changed: 20 additions & 0 deletions
@@ -19,6 +19,7 @@
     Int4WeightOnlyConfig,
     quantize_,
 )
+from torchao.quantization.quantize_.common import SupportsActivationPreScaling
 from torchao.quantization.utils import compute_error
 from torchao.utils import (
     torch_version_at_least,
@@ -77,6 +78,25 @@ def test_module_path(self, dtype):
             "<class 'torchao.quantization.Int4PlainInt32Tensor'>",
         )

+    def test_activation_prescaling(self):
+        dtype = torch.bfloat16
+        device = "xpu"
+        input = torch.randn(1, 128, dtype=dtype, device=device)
+        linear = torch.nn.Linear(128, 256, bias=False, dtype=dtype, device=device)
+        original = linear(input)
+        quantize_(linear, get_config(128))
+        qw = linear.weight
+        assert isinstance(qw, SupportsActivationPreScaling), (
+            "Expected int4 tensor supports activation prescaling"
+        )
+        assert qw.act_pre_scale is None, "Default `act_pre_scale` is None"
+        _ACT_PRE_SCALE = 2
+        qw.act_pre_scale = _ACT_PRE_SCALE
+        quantized = linear(input)
+
+        # making sure activation pre scaling is successfully applied to the activation
+        self.assertTrue(compute_error(original * _ACT_PRE_SCALE, quantized) > 20)
+

 instantiate_parametrized_tests(Int4PlainInt32Tensor)

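Why the test's final assertion holds: with bias=False the layer is homogeneous, so pre-scaling the activation by s scales the output by exactly s, and the quantized output should track original * _ACT_PRE_SCALE up to quantization noise (compute_error reports an SQNR in dB; above 20 dB the outputs match closely). A standalone sketch of that identity with no torchao dependency; shapes and names are illustrative:

import torch

# Homogeneity of a bias-free linear layer: W @ (s * x) == s * (W @ x).
# This is what lets the test compare `quantized` against `original * s`.
s = 2.0
linear = torch.nn.Linear(128, 256, bias=False)
x = torch.randn(1, 128)
assert torch.allclose(linear(s * x), s * linear(x), atol=1e-5)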