
Commit b878ee5

Add Int4TensorCoreTilePackedTensor for tensor core tiled int4 quantization
This commit introduces Int4TensorCoreTilePackedTensor, a new tensor subclass for int4 weight-only quantization using the tensor core tiled packing format.

Key features:
- Implements tensor core tiled packing for efficient computation on tensor cores
- Uses the tinygemm quantization path instead of HQQ for consistency
- Supports PackingFormat.TENSOR_CORE_TILE_PACKED in Int4WeightOnlyConfig version 2
- Optimized for the tinygemm int4mm kernel (_weight_int4pack_mm)
- Includes a comprehensive test suite

The implementation follows the same pattern as the other int4 tensor subclasses but uses a specialized packing format optimized for tensor core matrix multiplication performance.

Changes:
- Add the Int4TensorCoreTilePackedTensor implementation
- Update Int4WeightOnlyConfig version 2 to support the TENSOR_CORE_TILE_PACKED packing format
- Add TENSOR_CORE_TILE_PACKED to the PackingFormat enum
- Replace HQQ quantization with _quantize_affine_tinygemm for consistency
- Add comprehensive tests covering serialization, different group sizes, and error conditions
- Update __init__.py files to export the new tensor class

Test: python test/quantization/quantize_/workflows/int4/test_int4_tensor_core_tile_packed_tensor.py
1 parent af2cf1e commit b878ee5
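
For orientation, end-to-end usage of the new packing format mirrors the test suite added below. A minimal sketch (the layer shape, group size, and input are illustrative values taken from the tests; a CUDA device is assumed):

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.quantization.quantize_.common.packing_format import PackingFormat

# A bf16 linear layer on CUDA; tensor core tiled packing targets the
# tinygemm _weight_int4pack_mm kernel, which operates on bf16 activations.
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")

config = Int4WeightOnlyConfig(
    group_size=128,
    packing_format=PackingFormat.TENSOR_CORE_TILE_PACKED,
    version=2,  # version 2 routes to the new tensor-subclass workflow
)
quantize_(linear, config)  # linear.weight is now an Int4TensorCoreTilePackedTensor

x = torch.randn(32, 128, dtype=torch.bfloat16, device="cuda")
y = linear(x)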

File tree

7 files changed: +563 -62 lines changed

test/quantization/quantize_/workflows/int4/test_int4_tensor_core_tile_packed_tensor.py

Lines changed: 156 additions & 0 deletions

@@ -0,0 +1,156 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import tempfile
import unittest

import torch
from torch.testing._internal.common_utils import (
    TestCase,
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
)

from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.quantization.quantize_.common.packing_format import PackingFormat
from torchao.quantization.quantize_.workflows.int4.int4_tensor_core_tile_packed_tensor import (
    Int4TensorCoreTilePackedTensor,
)
from torchao.quantization.utils import compute_error
from torchao.utils import TORCH_VERSION_AT_LEAST_2_4

TENSOR_CORE_TILED_CONFIG = Int4WeightOnlyConfig(
    group_size=128,
    packing_format=PackingFormat.TENSOR_CORE_TILE_PACKED,
    version=2,
)


@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Need pytorch 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
class TestInt4TensorCoreTilePackedTensor(TestCase):
    def setUp(self):
        self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []

    @parametrize(
        "sizes",
        [
            ((128,), 256, 128),
            ((32, 128), 512, 128),
            ((2, 32, 128), 256, 128),
        ],
    )
    def test_linear(self, sizes):
        config = TENSOR_CORE_TILED_CONFIG
        dtype = torch.bfloat16
        device = "cuda"

        M, N, K = sizes
        input = torch.randn(*M, K, dtype=dtype, device=device)
        linear = torch.nn.Linear(K, N, dtype=dtype, device=device)

        original = linear(input)
        quantize_(linear, config)
        quantized = linear(input)
        self.assertTrue(compute_error(original, quantized) > 1)

        compiled_linear = torch.compile(linear)
        quantized_and_compiled = compiled_linear(input)
        self.assertTrue(compute_error(original, quantized_and_compiled) > 1)

    def test_to_device(self):
        config = TENSOR_CORE_TILED_CONFIG
        for device in self.GPU_DEVICES:
            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
            quantize_(linear.cuda(), config)
            linear.to(device)

            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
            quantize_(linear.cuda(), config)
            linear.to(device=device)

    def test_module_path(self):
        config = TENSOR_CORE_TILED_CONFIG
        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
        quantize_(linear.cuda(), config)
        self.assertEqual(
            str(type(linear.weight)),
            "<class 'torchao.quantization.Int4TensorCoreTilePackedTensor'>",
        )

    def test_serialization(self):
        """Test saving and loading the tensor directly and via state_dict"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(128, 256, dtype=dtype, device=device)
        block_size = (1, 64)

        tensor = Int4TensorCoreTilePackedTensor.from_hp(hp_tensor, block_size)

        # Test direct tensor serialization
        with tempfile.NamedTemporaryFile() as f:
            torch.save(tensor, f)
            f.seek(0)
            loaded_tensor = torch.load(f)

        self.assertEqual(loaded_tensor.shape, tensor.shape)
        self.assertEqual(loaded_tensor.block_size, tensor.block_size)
        self.assertEqual(
            str(type(loaded_tensor)),
            "<class 'torchao.quantization.Int4TensorCoreTilePackedTensor'>",
        )

        # Test state_dict serialization
        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
        quantize_(linear.cuda(), TENSOR_CORE_TILED_CONFIG)

        with tempfile.NamedTemporaryFile() as f:
            torch.save(linear.state_dict(), f)
            f.seek(0)
            state_dict = torch.load(f)
        self.assertEqual(
            str(type(state_dict["weight"])),
            "<class 'torchao.quantization.Int4TensorCoreTilePackedTensor'>",
        )

    @parametrize("group_size", [32, 64, 128])
    def test_different_group_sizes(self, group_size):
        """Test with different group sizes"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(256, 512, dtype=dtype, device=device)
        block_size = (1, group_size)

        tensor = Int4TensorCoreTilePackedTensor.from_hp(hp_tensor, block_size)

        self.assertEqual(tensor.shape, hp_tensor.shape)
        self.assertEqual(tensor.block_size, block_size)

    def test_error_conditions(self):
        """Test various error conditions"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(128, 256, dtype=dtype, device=device)

        # Test invalid block_size length
        with self.assertRaises(AssertionError):
            Int4TensorCoreTilePackedTensor.from_hp(
                hp_tensor, (64,)
            )  # block_size length mismatch

        # Test non-groupwise quantization
        with self.assertRaises(AssertionError):
            Int4TensorCoreTilePackedTensor.from_hp(
                hp_tensor, (2, 64)
            )  # first element should be 1


instantiate_parametrized_tests(TestInt4TensorCoreTilePackedTensor)


if __name__ == "__main__":
    run_tests()
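
The accuracy checks above gate on compute_error, torchao's signal-to-quantization-noise-ratio (SQNR) helper. As an assumption about its semantics (a sketch of the quantity being asserted on, not a copy of the torchao implementation):

import torch

def sqnr(original: torch.Tensor, quantized: torch.Tensor) -> torch.Tensor:
    # SQNR in decibels: higher means the quantized output tracks the
    # high-precision output more closely.
    return 20 * torch.log10(
        torch.linalg.norm(original) / torch.linalg.norm(original - quantized)
    )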

torchao/quantization/__init__.py

Lines changed: 7 additions & 23 deletions
@@ -1,7 +1,4 @@
-from torchao.kernel import (
-    int_scaled_matmul,
-    safe_int_mm,
-)
+from torchao.kernel import int_scaled_matmul, safe_int_mm
 
 from .autoquant import (
     ALL_AUTOQUANT_CLASS_LIST,
@@ -13,18 +10,8 @@
     OTHER_AUTOQUANT_CLASS_LIST,
     autoquant,
 )
-from .GPTQ import (
-    Int4WeightOnlyGPTQQuantizer,
-    MultiTensor,
-    MultiTensorInputRecorder,
-)
-from .granularity import (
-    PerAxis,
-    PerGroup,
-    PerRow,
-    PerTensor,
-    PerToken,
-)
+from .GPTQ import Int4WeightOnlyGPTQQuantizer, MultiTensor, MultiTensorInputRecorder
+from .granularity import PerAxis, PerGroup, PerRow, PerTensor, PerToken
 from .linear_activation_quantized_tensor import (
     LinearActivationQuantizedTensor,
     to_linear_activation_quantized,
@@ -37,10 +24,7 @@
     Int8DynActInt4WeightLinear,
     Int8DynActInt4WeightQuantizer,
 )
-from .observer import (
-    AffineQuantizedMinMaxObserver,
-    AffineQuantizedObserverBase,
-)
+from .observer import AffineQuantizedMinMaxObserver, AffineQuantizedObserverBase
 from .quant_api import (
     CutlassInt4PackedLayout,
     FbgemmConfig,
@@ -94,6 +78,7 @@
     Int4PreshuffledTensor,
     Int4Tensor,
     IntxUnpackedTensor,
+    Int4TensorCoreTilePackedTensor,
 )
 from .smoothquant import (
     SmoothFakeDynamicallyQuantizedLinear,
@@ -106,9 +91,7 @@
 from .subclass import *  # noqa: F403
 from .transform_module import register_quantize_module_handler
 from .unified import Quantizer, TwoStepQuantizer
-from .utils import (
-    compute_error,
-)
+from .utils import compute_error
 from .weight_only import WeightOnlyInt8QuantLinear
 
 # TODO: remove after migration of APIs are done
@@ -163,6 +146,7 @@
     "Int4PreshuffledTensor",
     "Int4MarlinSparseTensor",
     "IntxUnpackedTensor",
+    "Int4TensorCoreTilePackedTensor",
     "Float8Tensor",
     # smooth quant - subject to change
     "get_scale",

torchao/quantization/quant_api.py

Lines changed: 22 additions & 32 deletions
@@ -66,16 +66,14 @@
     LinearActivationWeightObservedTensor,
 )
 from torchao.quantization.observer import AffineQuantizedObserverBase, get_block_size
-from torchao.quantization.quantize_.common import (
-    KernelPreference,
-    PackingFormat,
-)
+from torchao.quantization.quantize_.common import KernelPreference, PackingFormat
 from torchao.quantization.quantize_.workflows import (
     Float8Tensor,
     Int4MarlinSparseTensor,
     Int4PreshuffledTensor,
     Int4Tensor,
     IntxUnpackedTensor,
+    Int4TensorCoreTilePackedTensor,
     QuantizeTensorToFloat8Kwargs,
 )
 from torchao.quantization.transform_module import (
@@ -93,35 +91,16 @@
 )
 
 from .autoquant import AutoQuantizableLinearWeight, autoquant
-from .GPTQ import (
-    Int4WeightOnlyGPTQQuantizer,
-)
-from .granularity import (
-    Granularity,
-    PerAxis,
-    PerGroup,
-    PerRow,
-    PerTensor,
-)
+from .GPTQ import Int4WeightOnlyGPTQQuantizer
+from .granularity import Granularity, PerAxis, PerGroup, PerRow, PerTensor
 from .linear_activation_quantized_tensor import (
     LinearActivationQuantizedTensor,
     to_linear_activation_quantized,
 )
-from .linear_quant_modules import (
-    Int4WeightOnlyQuantizer,
-    Int8DynActInt4WeightQuantizer,
-)
-from .qat import (
-    intx_quantization_aware_training,
-)
-from .quant_primitives import (
-    _DTYPE_TO_QVALUE_BOUNDS,
-    MappingType,
-    ZeroPointDomain,
-)
-from .subclass import (
-    QuantizedLinearWeightBase,
-)
+from .linear_quant_modules import Int4WeightOnlyQuantizer, Int8DynActInt4WeightQuantizer
+from .qat import intx_quantization_aware_training
+from .quant_primitives import _DTYPE_TO_QVALUE_BOUNDS, MappingType, ZeroPointDomain
+from .subclass import QuantizedLinearWeightBase
 from .unified import Quantizer, TwoStepQuantizer
 from .utils import _get_per_token_block_size
@@ -1080,6 +1059,12 @@ def _int4_weight_only_quantize_tensor(weight, config):
             block_size,
         )
         return new_weight
+    elif packing_format == PackingFormat.TENSOR_CORE_TILE_PACKED:
+        new_weight = Int4TensorCoreTilePackedTensor.from_hp(
+            weight,
+            block_size,
+        )
+        return new_weight
     else:
         raise ValueError(f"Unsupported packing format: {packing_format}")
@@ -1454,10 +1439,12 @@ def int8_dynamic_activation_int8_semi_sparse_weight():
     Applies int8 dnynamic symmetric per-token activation and int8 per-channel weight
     quantization + 2:4 sparsity to linear layers.
     """
-    warnings.warn("""int8_dyanmic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout kwarg in int8_dynamic_activation_int8_weight instead.
+    warnings.warn(
+        """int8_dyanmic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout kwarg in int8_dynamic_activation_int8_weight instead.
 
     from torchao.dtypes import SemiSparseLayout
-    int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()""")
+    int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()"""
+    )
 
     return int8_dynamic_activation_int8_weight(layout=SemiSparseLayout())
@@ -2007,7 +1994,10 @@ def __post_init__(self):
             assert self.granularity.axis == 0, (
                 f"axis must be 0 with PerAxis, but got {self.granularity.axis}"
             )
-        assert self.mapping_type in [MappingType.ASYMMETRIC, MappingType.SYMMETRIC], (
+        assert self.mapping_type in [
+            MappingType.ASYMMETRIC,
+            MappingType.SYMMETRIC,
+        ], (
             f"mapping_type must be MappingType.ASYMMETRIC or MappingType.SYMMETRIC, but got {self.mapping_type}"
         )
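
The new branch above routes PackingFormat.TENSOR_CORE_TILE_PACKED to Int4TensorCoreTilePackedTensor.from_hp. Called directly, the constructor takes the high-precision weight and a block_size with one entry per weight dimension; a sketch mirroring test_different_group_sizes (values are illustrative, CUDA assumed):

import torch
from torchao.quantization.quantize_.workflows.int4.int4_tensor_core_tile_packed_tensor import (
    Int4TensorCoreTilePackedTensor,
)

hp_weight = torch.randn(256, 512, dtype=torch.bfloat16, device="cuda")

# (1, 64): each output channel is quantized independently, with groups of
# 64 along the input-channel dimension. Per the error-condition tests, a
# leading entry other than 1 is rejected, as is a block_size whose length
# does not match the weight's number of dimensions.
qweight = Int4TensorCoreTilePackedTensor.from_hp(hp_weight, (1, 64))
assert qweight.shape == hp_weight.shape
assert qweight.block_size == (1, 64)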

torchao/quantization/quantize_/common/packing_format.py

Lines changed: 5 additions & 0 deletions
@@ -40,3 +40,8 @@ class PackingFormat(str, Enum):
     Unpacked means the subbyte quantized data is stored as int8
     """
     UNPACKED_TO_INT8 = "unpacked_to_int8"
+
+    """
+    tensor_core_tile_packed is referring to the format used by tensor core tiled kernels for int4 quantization
+    """
+    TENSOR_CORE_TILE_PACKED = "tensor_core_tile_packed"
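
Because PackingFormat subclasses str (visible in the hunk header above), the new member compares equal to, and round-trips through, its plain string value, which keeps serialized configs human-readable; a quick illustration of standard str-Enum behavior:

from torchao.quantization.quantize_.common.packing_format import PackingFormat

# str-valued enum members equal their underlying string and can be
# reconstructed from it, e.g. when a config is loaded from disk.
assert PackingFormat.TENSOR_CORE_TILE_PACKED == "tensor_core_tile_packed"
assert PackingFormat("tensor_core_tile_packed") is PackingFormat.TENSOR_CORE_TILE_PACKED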

torchao/quantization/quantize_/workflows/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -14,11 +14,13 @@
 from .intx.intx_unpacked_tensor import (
     IntxUnpackedTensor,
 )
+from .int4.int4_tensor_core_tile_packed_tensor import Int4TensorCoreTilePackedTensor
 
 __all__ = [
     "Int4Tensor",
     "Int4PreshuffledTensor",
     "Int4MarlinSparseTensor",
+    "Int4TensorCoreTilePackedTensor",
     "Float8Tensor",
     "QuantizeTensorToFloat8Kwargs",
     "IntxUnpackedTensor",

torchao/quantization/quantize_/workflows/int4/__init__.py

Lines changed: 0 additions & 7 deletions
This file was deleted.
