
Commit 7a2a7b3

Add NPU (Ascend) backend support for INT4 weight-only quantization workflow (#3172)
* Add NPU (Ascend) backend support for INT4 weight-only quantization workflow
* Use torch.ops.npu prefix and drop redundant torch_npu import
* Modify test file and update comments
* Merge NPU (Ascend) backend logic into the Int4PlainInt32Tensor subclass
* Ruff format cleanup, replace error types, add torch version check
* Add torch_npu version assertion and show downstream testing result
* Add downstream testing result
* Unify NPU and XPU test cases into a single class
* Move CI display to the quantization README and update test file
1 parent dbc89d3 commit 7a2a7b3
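
For context, here is a minimal usage sketch of the flow this commit enables. It is an illustration under assumptions, not taken from the diff: it assumes torch_npu ≥ 2.7.1 with an Ascend device exposed as `"npu"`, and that the test's `get_config` helper corresponds to an `Int4WeightOnlyConfig` using the plain-int32 packing format.

```python
# Hedged sketch of the NPU int4 weight-only workflow exercised by the new tests.
# Assumptions: torch_npu >= 2.7.1 is installed, an Ascend device is visible as
# "npu", and int4_packing_format="plain_int32" is the format that backs
# Int4PlainInt32Tensor (not confirmed by this diff).
import torch
import torch_npu  # noqa: F401  registers the Ascend "npu" device with PyTorch

from torchao.quantization import Int4WeightOnlyConfig, quantize_

device = "npu"
dtype = torch.bfloat16

linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)
# group_size=64: the new test skips group_size == K (128) on NPU.
config = Int4WeightOnlyConfig(group_size=64, int4_packing_format="plain_int32")

quantize_(linear, config)  # linear.weight becomes a torchao Int4PlainInt32Tensor
out = linear(torch.randn(1, 128, dtype=dtype, device=device))
```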

File tree

3 files changed (+292, -78 lines)


test/quantization/quantize_/workflows/int4/test_int4_plain_int32_tensor.py

Lines changed: 52 additions & 21 deletions
```diff
@@ -5,12 +5,12 @@
 # LICENSE file in the root directory of this source tree.

 import tempfile
-import unittest

+import pytest
 import torch
+from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_utils import (
     TestCase,
-    instantiate_parametrized_tests,
     parametrize,
     run_tests,
 )
@@ -33,9 +33,19 @@ def get_config(group_size):
 )


-@unittest.skipIf(not torch_version_at_least("2.8.0"), "Need pytorch 2.8+")
-@unittest.skipIf(not torch.xpu.is_available(), "XPU not available")
 class Int4PlainInt32Tensor(TestCase):
+    _MIN_VER = {
+        "xpu": "2.8.0",
+        "npu": "2.7.1",
+    }
+
+    def setUp(self):
+        min_req = type(self)._MIN_VER.get(self.device_type)
+        if not torch_version_at_least(min_req):
+            self.skipTest(
+                f"{self.device_type} requires torch >= {min_req}, current {torch.__version__}"
+            )
+
     @parametrize(
         "sizes",
         [
@@ -46,24 +56,35 @@ class Int4PlainInt32Tensor(TestCase):
     )
     @parametrize("dtype", [torch.bfloat16, torch.half])
     @parametrize("group_size", [32, 64, 128])
-    def test_linear(self, sizes, dtype, group_size):
-        device = "xpu"
+    @parametrize("thresholds", [{"xpu": 20, "npu": 10}])
+    def test_linear(self, device, sizes, dtype, group_size, thresholds):
         M, N, K = sizes
+        if "npu" in device and group_size == K:
+            pytest.skip(
+                f"{device} does not support group_size equal to K dimension ({group_size} == {K})"
+            )
+        threshold = thresholds.get(device.split(":")[0])
+
         input = torch.randn(*M, K, dtype=dtype, device=device)
         linear = torch.nn.Linear(K, N, dtype=dtype, device=device)
         original = linear(input)
         quantize_(linear, get_config(group_size))
         quantized = linear(input)
-        self.assertTrue(compute_error(original, quantized) > 20)
+        self.assertTrue(compute_error(original, quantized) > threshold)

-        compiled_linear = torch.compile(linear)
-        quantized_and_compiled = compiled_linear(input)
-        self.assertTrue(compute_error(original, quantized_and_compiled) > 20)
+        if "xpu" in device:
+            compiled_linear = torch.compile(linear)
+            quantized_and_compiled = compiled_linear(input)
+            self.assertTrue(compute_error(original, quantized_and_compiled) > threshold)

     @parametrize("dtype", [torch.bfloat16, torch.half])
-    def test_module_path(self, dtype):
-        linear = torch.nn.Linear(128, 256, dtype=dtype, device="xpu")
-        quantize_(linear, get_config(group_size=128))
+    def test_module_path(self, device, dtype):
+        K, N, group_size = 128, 256, 128
+        if "npu" in device:
+            group_size = 64
+
+        linear = torch.nn.Linear(K, N, dtype=dtype, device=device)
+        quantize_(linear, get_config(group_size))
         self.assertEqual(
             str(type(linear.weight)),
             "<class 'torchao.quantization.Int4PlainInt32Tensor'>",
@@ -78,13 +99,21 @@ def test_module_path(self, dtype):
             "<class 'torchao.quantization.Int4PlainInt32Tensor'>",
         )

-    def test_activation_prescaling(self):
-        dtype = torch.bfloat16
-        device = "xpu"
-        input = torch.randn(1, 128, dtype=dtype, device=device)
-        linear = torch.nn.Linear(128, 256, bias=False, dtype=dtype, device=device)
+    @parametrize("dtype", [torch.float16, torch.bfloat16])
+    @parametrize("thresholds", [{"xpu": 20, "npu": 10}])
+    def test_activation_prescaling(self, device, dtype, thresholds):
+        if "xpu" in device and dtype == torch.float16:
+            pytest.skip(f"{device} test_activation_prescaling don't test {dtype}")
+
+        threshold = thresholds.get(device.split(":")[0])
+        K, N, group_size = 128, 256, 128
+        if "npu" in device:
+            group_size = 64
+
+        input = torch.randn(1, K, dtype=dtype, device=device)
+        linear = torch.nn.Linear(K, N, bias=False, dtype=dtype, device=device)
         original = linear(input)
-        quantize_(linear, get_config(128))
+        quantize_(linear, get_config(group_size))
         qw = linear.weight
         assert isinstance(qw, SupportsActivationPreScaling), (
             "Expected int4 tensor supports activation prescaling"
@@ -95,10 +124,12 @@ def test_activation_prescaling(self):
         quantized = linear(input)

         # making sure activation pre scaling is successfully applied to the activation
-        self.assertTrue(compute_error(original * _ACT_PRE_SCALE, quantized) > 20)
+        self.assertTrue(compute_error(original * _ACT_PRE_SCALE, quantized) > threshold)


-instantiate_parametrized_tests(Int4PlainInt32Tensor)
+instantiate_device_type_tests(
+    Int4PlainInt32Tensor, globals(), only_for=("xpu", "npu"), allow_xpu=True
+)


 if __name__ == "__main__":
```
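
One note on the harness change visible above: `instantiate_parametrized_tests` is replaced by `instantiate_device_type_tests`, which generates one test class per backend and passes every test a concrete `device` string; that is why the test bodies now branch on `"npu" in device` / `"xpu" in device` instead of hard-coding `device = "xpu"`. A self-contained sketch of that mechanism (restricted to CPU so it runs anywhere; the real test restricts to `("xpu", "npu")` and sets `allow_xpu=True`):

```python
# Minimal illustration (assumed behavior, not part of this commit) of how
# instantiate_device_type_tests fans a generic TestCase out per backend and
# injects a `device` argument into each test.
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import TestCase, run_tests


class _Demo(TestCase):
    def test_device_is_injected(self, device):
        # `device` arrives as a concrete string such as "cpu", "xpu:0", or "npu:0",
        # so a single test body can branch per backend the way test_linear does.
        self.assertIsInstance(device, str)


# Generates a _DemoCPU class in this module; the real test uses only_for=("xpu", "npu").
instantiate_device_type_tests(_Demo, globals(), only_for=("cpu",))

if __name__ == "__main__":
    run_tests()
```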

torchao/quantization/README.md

Lines changed: 6 additions & 2 deletions
````diff
@@ -71,8 +71,12 @@ use_hqq = False
 quantize_(model, Int4WeightOnlyConfig(group_size=group_size, int4_packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq"))
 ```

-Note: The quantization error incurred by applying int4 quantization to your model can be fairly significant, so using external techniques like GPTQ may be necessary to obtain a usable model.
-
+Note:
+- The quantization error incurred by applying int4 quantization to your model can be fairly significant, so using external techniques like GPTQ may be necessary to obtain a usable model.
+- Third-party backend CI status:
+  - Ascend NPU (requires torch_npu ≥ 2.7.1)
+    [![Ascend NPU](https://github.com/Ascend/Ascend-CI/actions/workflows/torchao.yml/badge.svg)](https://github.com/Ascend/Ascend-CI/actions/workflows/torchao.yml)
+
 #### A16W8 Int8 WeightOnly Quantization

 ```python
````
