
Commit 1922aaf

Add Int4TensorCoreTilePackedTensor for tensor core tiled int4 quantization
This commit introduces Int4TensorCoreTilePackedTensor, a new tensor subclass for int4 weight-only quantization using tensor core tiled packing format.

Key features:
- Implements tensor core tiled packing for efficient computation on tensor cores
- Uses tinygemm quantization path instead of HQQ for consistency
- Supports PackingFormat.TENSOR_CORE_TILE_PACKED in Int4WeightOnlyConfig version 2
- Optimized for tinygemm int4mm kernel (_weight_int4pack_mm)
- Includes comprehensive test suite

The implementation follows the same pattern as other int4 tensor subclasses but uses a specialized packing format optimized for tensor core matrix multiplication performance.

Changes:
- Add Int4TensorCoreTilePackedTensor implementation
- Update Int4WeightOnlyConfig version 2 to support TENSOR_CORE_TILE_PACKED packing format
- Add TENSOR_CORE_TILE_PACKED to PackingFormat enum
- Replace HQQ quantization with _quantize_affine_tinygemm for consistency
- Add comprehensive tests including serialization, different group sizes, and error conditions
- Update __init__.py files to export new tensor class

Test: python test/quantization/quantize_/workflows/int4/test_int4_tensor_core_tile_packed_tensor.py
1 parent af2cf1e commit 1922aaf
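
Note: the end-to-end usage exercised by the new test suite looks roughly like the sketch below. The config mirrors TENSOR_CORE_TILED_CONFIG from the test file; the layer and input sizes are illustrative only.

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.quantization.quantize_.common.packing_format import PackingFormat

# Int4 weight-only quantization with the tensor-core-tiled packing format;
# version=2 selects the tensor-subclass based flow.
config = Int4WeightOnlyConfig(
    group_size=128,
    packing_format=PackingFormat.TENSOR_CORE_TILE_PACKED,
    version=2,
)

# bfloat16 on CUDA, as in the tests.
linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
x = torch.randn(32, 128, dtype=torch.bfloat16, device="cuda")

quantize_(linear, config)  # linear.weight becomes an Int4TensorCoreTilePackedTensor
out = linear(x)  # intended to hit the tinygemm int4mm kernel (_weight_int4pack_mm), per the commit message
out_compiled = torch.compile(linear)(x)  # the tests also cover torch.compile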

File tree

8 files changed (+600, -73 lines)
test/quantization/quantize_/workflows/int4/test_int4_tensor_core_tile_packed_tensor.py

Lines changed: 222 additions & 0 deletions
@@ -0,0 +1,222 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import tempfile
import unittest

import torch
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
)

from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.quantization.quantize_.common.packing_format import PackingFormat
from torchao.quantization.quantize_.workflows.int4.int4_tensor_core_tile_packed_tensor import (
    Int4TensorCoreTilePackedTensor,
)
from torchao.quantization.utils import compute_error
from torchao.testing.utils import TorchAOIntegrationTestCase
from torchao.utils import is_sm_at_least_90

TENSOR_CORE_TILED_CONFIG = Int4WeightOnlyConfig(
    group_size=128,
    packing_format=PackingFormat.TENSOR_CORE_TILE_PACKED,
    version=2,
)


@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@unittest.skipIf(not is_sm_at_least_90(), "Need sm90+")
class TestInt4TensorCoreTilePackedTensor(TorchAOIntegrationTestCase):
    def setUp(self):
        self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []

    @parametrize(
        "sizes",
        [
            ((128,), 256, 128),
            ((32, 128), 512, 128),
            ((2, 32, 128), 256, 128),
        ],
    )
    def test_linear(self, sizes):
        config = TENSOR_CORE_TILED_CONFIG
        dtype = torch.bfloat16
        device = "cuda"

        M, N, K = sizes
        input = torch.randn(*M, K, dtype=dtype, device=device)
        linear = torch.nn.Linear(K, N, dtype=dtype, device=device)

        original = linear(input)
        quantize_(linear, config)
        quantized = linear(input)
        self.assertTrue(compute_error(original, quantized) > 20)

        compiled_linear = torch.compile(linear)
        quantized_and_compiled = compiled_linear(input)
        self.assertTrue(compute_error(original, quantized_and_compiled) > 20)

    def test_module_path(self):
        config = TENSOR_CORE_TILED_CONFIG
        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
        quantize_(linear.cuda(), config)
        self.assertEqual(
            str(type(linear.weight)),
            "<class 'torchao.quantization.Int4TensorCoreTilePackedTensor'>",
        )

        with tempfile.NamedTemporaryFile() as f:
            torch.save(linear.state_dict(), f)
            f.seek(0)
            state_dict = torch.load(f)
            self.assertEqual(
                str(type(state_dict["weight"])),
                "<class 'torchao.quantization.Int4TensorCoreTilePackedTensor'>",
            )

    def test_slice(self):
        config = TENSOR_CORE_TILED_CONFIG
        dtype = torch.bfloat16
        device = "cuda"

        # Create a 256x256 linear layer for testing
        dummy = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)

        # Create reference sliced linear layers
        dummy1 = torch.nn.Linear(256, 64, bias=False, dtype=dtype, device=device)
        dummy1.weight = torch.nn.Parameter(
            dummy.weight.narrow(0, 0, 64), requires_grad=False
        )
        dummy2 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
        dummy2.weight = torch.nn.Parameter(
            dummy.weight.narrow(1, 0, 128), requires_grad=False
        )

        # Quantize the main linear layer
        quantize_(dummy, config)

        # Shape analysis for tensor core tile packed format:
        # Original weight shape: (256, 256) -> after padding: (256, 1024)
        # n = 256, k = 1024, inner_k_tiles = 8, group_size = 128
        #
        # qdata shape: [n/8, k/(inner_k_tiles*16), 32, inner_k_tiles/2]
        #            = [256/8, 1024/(8*16), 32, 8/2]
        #            = [32, 8, 32, 4]
        #
        # scale_and_zero shape: [in_features/group_size, out_features, 2] (packed format)
        #                     = [1024/128, 256, 2] = [8, 256, 2]

        # Test slicing along output dimension (dim=0: 256 -> 64)
        weight1 = dummy.weight.narrow(0, 0, 64)

        # qdata slicing: narrow from [32, 8, 32, 4] to [8, 8, 32, 4]
        # Calculation: 64 out_features / 256 total * 32 qdata_dim0 = 8
        expected_qdata_slice_0 = dummy.weight.qdata.narrow(0, 0, 8)
        self.assertEqual(weight1.qdata, expected_qdata_slice_0)

        # scale_and_zero slicing: narrow from [8, 256, 2] to [8, 64, 2]
        # slicing the 0th dim of qdata means we have to slice the 1st dim of scale_and_zero
        expected_scale_zero_slice_0 = dummy.weight.scale_and_zero.narrow(1, 0, 64)
        self.assertEqual(weight1.scale_and_zero, expected_scale_zero_slice_0)

        # Test slicing along input dimension (dim=1: 256 -> 128)
        weight2 = dummy.weight.narrow(1, 0, 128)

        # qdata slicing: narrow from [32, 8, 32, 4] to [32, 4, 32, 4]
        # k = 1024
        # Calculation: 128 in_features (1/2 of in_features) corresponds to 1/2 of qdata dimension 1,
        # which is k / (inner_k_tiles * 16) / 2 = 1024 / (8 * 16) / 2 = 4
        expected_qdata_slice_1 = dummy.weight.qdata.narrow(1, 0, 4)
        self.assertEqual(weight2.qdata, expected_qdata_slice_1)

        # scale_and_zero slicing: narrow from [8, 256, 2] to [4, 256, 2]
        expected_scale_zero_slice_1 = dummy.weight.scale_and_zero.narrow(0, 0, 4)
        self.assertEqual(weight2.scale_and_zero, expected_scale_zero_slice_1)

        # Verify that sliced weights produce similar results to reference implementations
        input1 = torch.randn(2, 256, dtype=dtype, device=device)
        res_ref1 = dummy1(input1)

        # Create a new linear layer with the sliced weight
        test_linear1 = torch.nn.Linear(256, 64, bias=False, dtype=dtype, device=device)
        test_linear1.weight = torch.nn.Parameter(
            weight1.contiguous(), requires_grad=False
        )
        res1 = test_linear1(input1)
        self.assertGreater(compute_error(res_ref1, res1), 15)

        # input2 = torch.randn(2, 128, dtype=dtype, device=device)
        # res_ref2 = dummy2(input2)

        # Create a new linear layer with the sliced weight
        # WIP
        # test_linear2 = torch.nn.Linear(128, 256, bias=False, dtype=dtype, device=device)
        # test_linear2.weight = torch.nn.Parameter(
        #     weight2.contiguous(), requires_grad=False
        # )
        # res2 = test_linear2(input2)
        # self.assertGreater(compute_error(res_ref2, res2), 15)

    def test_slice_preserves_aliasing(self):
        config = TENSOR_CORE_TILED_CONFIG
        l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
        l.weight = torch.nn.Parameter(
            torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
        )
        quantize_(l, config)
        param = l.weight
        param_data = param.data
        param_data = param_data.narrow(0, 0, 512)
        # Making sure the aliasing is preserved in sliced quantized Tensor
        assert param.data.qdata.data_ptr() == param_data.qdata.data_ptr()
        assert (
            param.data.scale_and_zero.data_ptr() == param_data.scale_and_zero.data_ptr()
        )

    def test_slice_and_copy_similar_to_vllm(self):
        self._test_slice_and_copy_similar_to_vllm(TENSOR_CORE_TILED_CONFIG)

    @parametrize("group_size", [32, 64, 128])
    def test_different_group_sizes(self, group_size):
        """Test with different group sizes"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(256, 512, dtype=dtype, device=device)
        block_size = (1, group_size)

        tensor = Int4TensorCoreTilePackedTensor.from_hp(hp_tensor, block_size)

        self.assertEqual(tensor.shape, hp_tensor.shape)
        self.assertEqual(tensor.block_size, block_size)

    def test_error_conditions(self):
        """Test various error conditions"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(128, 256, dtype=dtype, device=device)

        # Test invalid block_size length
        with self.assertRaises(AssertionError):
            Int4TensorCoreTilePackedTensor.from_hp(
                hp_tensor, (64,)
            )  # block_size length mismatch

        # Test non-groupwise quantization
        with self.assertRaises(AssertionError):
            Int4TensorCoreTilePackedTensor.from_hp(
                hp_tensor, (2, 64)
            )  # first element should be 1


instantiate_parametrized_tests(TestInt4TensorCoreTilePackedTensor)


if __name__ == "__main__":
    run_tests()
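
Note: the shape bookkeeping in the test_slice comments can be checked with a few lines of arithmetic. The sketch below only restates the layout documented above (qdata: [n/8, k/(inner_k_tiles*16), 32, inner_k_tiles/2]; scale_and_zero: [k/group_size, n, 2]); the helper name is hypothetical and not part of torchao.

def expected_packed_shapes(n, k, group_size=128, inner_k_tiles=8):
    # Shape arithmetic for the tensor-core-tiled layout as documented in the
    # test_slice comments above (hypothetical helper, not a torchao API).
    qdata_shape = (n // 8, k // (inner_k_tiles * 16), 32, inner_k_tiles // 2)
    scale_and_zero_shape = (k // group_size, n, 2)
    return qdata_shape, scale_and_zero_shape

# The test_slice example: a (256, 256) weight padded along k to (256, 1024).
qdata_shape, scale_and_zero_shape = expected_packed_shapes(n=256, k=1024)
assert qdata_shape == (32, 8, 32, 4)
assert scale_and_zero_shape == (8, 256, 2)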

torchao/quantization/__init__.py

Lines changed: 7 additions & 23 deletions
@@ -1,7 +1,4 @@
-from torchao.kernel import (
-    int_scaled_matmul,
-    safe_int_mm,
-)
+from torchao.kernel import int_scaled_matmul, safe_int_mm
 
 from .autoquant import (
     ALL_AUTOQUANT_CLASS_LIST,
@@ -13,18 +10,8 @@
     OTHER_AUTOQUANT_CLASS_LIST,
     autoquant,
 )
-from .GPTQ import (
-    Int4WeightOnlyGPTQQuantizer,
-    MultiTensor,
-    MultiTensorInputRecorder,
-)
-from .granularity import (
-    PerAxis,
-    PerGroup,
-    PerRow,
-    PerTensor,
-    PerToken,
-)
+from .GPTQ import Int4WeightOnlyGPTQQuantizer, MultiTensor, MultiTensorInputRecorder
+from .granularity import PerAxis, PerGroup, PerRow, PerTensor, PerToken
 from .linear_activation_quantized_tensor import (
     LinearActivationQuantizedTensor,
     to_linear_activation_quantized,
@@ -37,10 +24,7 @@
     Int8DynActInt4WeightLinear,
     Int8DynActInt4WeightQuantizer,
 )
-from .observer import (
-    AffineQuantizedMinMaxObserver,
-    AffineQuantizedObserverBase,
-)
+from .observer import AffineQuantizedMinMaxObserver, AffineQuantizedObserverBase
 from .quant_api import (
     CutlassInt4PackedLayout,
     FbgemmConfig,
@@ -94,6 +78,7 @@
     Int4PreshuffledTensor,
     Int4Tensor,
     IntxUnpackedTensor,
+    Int4TensorCoreTilePackedTensor,
 )
 from .smoothquant import (
     SmoothFakeDynamicallyQuantizedLinear,
@@ -106,9 +91,7 @@
 from .subclass import *  # noqa: F403
 from .transform_module import register_quantize_module_handler
 from .unified import Quantizer, TwoStepQuantizer
-from .utils import (
-    compute_error,
-)
+from .utils import compute_error
 from .weight_only import WeightOnlyInt8QuantLinear
 
 # TODO: remove after migration of APIs are done
@@ -163,6 +146,7 @@
     "Int4PreshuffledTensor",
     "Int4MarlinSparseTensor",
     "IntxUnpackedTensor",
+    "Int4TensorCoreTilePackedTensor",
     "Float8Tensor",
     # smooth quant - subject to change
     "get_scale",

torchao/quantization/quant_api.py

Lines changed: 22 additions & 32 deletions
@@ -66,16 +66,14 @@
     LinearActivationWeightObservedTensor,
 )
 from torchao.quantization.observer import AffineQuantizedObserverBase, get_block_size
-from torchao.quantization.quantize_.common import (
-    KernelPreference,
-    PackingFormat,
-)
+from torchao.quantization.quantize_.common import KernelPreference, PackingFormat
 from torchao.quantization.quantize_.workflows import (
     Float8Tensor,
     Int4MarlinSparseTensor,
     Int4PreshuffledTensor,
     Int4Tensor,
     IntxUnpackedTensor,
+    Int4TensorCoreTilePackedTensor,
     QuantizeTensorToFloat8Kwargs,
 )
 from torchao.quantization.transform_module import (
@@ -93,35 +91,16 @@
 )
 
 from .autoquant import AutoQuantizableLinearWeight, autoquant
-from .GPTQ import (
-    Int4WeightOnlyGPTQQuantizer,
-)
-from .granularity import (
-    Granularity,
-    PerAxis,
-    PerGroup,
-    PerRow,
-    PerTensor,
-)
+from .GPTQ import Int4WeightOnlyGPTQQuantizer
+from .granularity import Granularity, PerAxis, PerGroup, PerRow, PerTensor
 from .linear_activation_quantized_tensor import (
     LinearActivationQuantizedTensor,
     to_linear_activation_quantized,
 )
-from .linear_quant_modules import (
-    Int4WeightOnlyQuantizer,
-    Int8DynActInt4WeightQuantizer,
-)
-from .qat import (
-    intx_quantization_aware_training,
-)
-from .quant_primitives import (
-    _DTYPE_TO_QVALUE_BOUNDS,
-    MappingType,
-    ZeroPointDomain,
-)
-from .subclass import (
-    QuantizedLinearWeightBase,
-)
+from .linear_quant_modules import Int4WeightOnlyQuantizer, Int8DynActInt4WeightQuantizer
+from .qat import intx_quantization_aware_training
+from .quant_primitives import _DTYPE_TO_QVALUE_BOUNDS, MappingType, ZeroPointDomain
+from .subclass import QuantizedLinearWeightBase
 from .unified import Quantizer, TwoStepQuantizer
 from .utils import _get_per_token_block_size
 
@@ -1080,6 +1059,12 @@ def _int4_weight_only_quantize_tensor(weight, config):
             block_size,
         )
         return new_weight
+    elif packing_format == PackingFormat.TENSOR_CORE_TILE_PACKED:
+        new_weight = Int4TensorCoreTilePackedTensor.from_hp(
+            weight,
+            block_size,
+        )
+        return new_weight
     else:
         raise ValueError(f"Unsupported packing format: {packing_format}")
 
@@ -1454,10 +1439,12 @@ def int8_dynamic_activation_int8_semi_sparse_weight():
     Applies int8 dnynamic symmetric per-token activation and int8 per-channel weight
     quantization + 2:4 sparsity to linear layers.
     """
-    warnings.warn("""int8_dyanmic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout kwarg in int8_dynamic_activation_int8_weight instead.
+    warnings.warn(
+        """int8_dyanmic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout kwarg in int8_dynamic_activation_int8_weight instead.
 
     from torchao.dtypes import SemiSparseLayout
-    int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()""")
+    int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()"""
+    )
 
     return int8_dynamic_activation_int8_weight(layout=SemiSparseLayout())
 
@@ -2007,7 +1994,10 @@ def __post_init__(self):
         assert self.granularity.axis == 0, (
             f"axis must be 0 with PerAxis, but got {self.granularity.axis}"
         )
-        assert self.mapping_type in [MappingType.ASYMMETRIC, MappingType.SYMMETRIC], (
+        assert self.mapping_type in [
+            MappingType.ASYMMETRIC,
+            MappingType.SYMMETRIC,
+        ], (
             f"mapping_type must be MappingType.ASYMMETRIC or MappingType.SYMMETRIC, but got {self.mapping_type}"
         )
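
Note: the block_size passed into from_hp in the dispatch above comes from the config's group_size; for a 2D weight it is effectively (1, group_size), which is also how the tests construct the tensor directly. A rough sketch of that direct construction (sizes illustrative, mirroring test_different_group_sizes):

import torch
from torchao.quantization.quantize_.workflows.int4.int4_tensor_core_tile_packed_tensor import (
    Int4TensorCoreTilePackedTensor,
)

group_size = 128
hp_weight = torch.randn(256, 512, dtype=torch.bfloat16, device="cuda")

# Group-wise quantization along the input dimension: one block per row,
# group_size columns per block. from_hp preserves the logical weight shape.
block_size = (1, group_size)
w_int4 = Int4TensorCoreTilePackedTensor.from_hp(hp_weight, block_size)
assert w_int4.shape == hp_weight.shape
assert w_int4.block_size == block_size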
