Commit 0b18c16

Add Int4TensorCoreTilePackedTensor for tensor core tiled int4 quantization
This commit introduces Int4TensorCoreTilePackedTensor, a new tensor subclass for int4 weight-only quantization using the tensor core tiled packing format.

Key features:
- Implements tensor core tiled packing for efficient computation on tensor cores
- Uses the tinygemm quantization path instead of HQQ for consistency
- Supports PackingFormat.TENSOR_CORE_TILE_PACKED in Int4WeightOnlyConfig version 2
- Optimized for the tinygemm int4mm kernel (_weight_int4pack_mm)
- Includes a comprehensive test suite

The implementation follows the same pattern as other int4 tensor subclasses but uses a specialized packing format optimized for tensor core matrix multiplication performance.

Changes:
- Add the Int4TensorCoreTilePackedTensor implementation
- Update Int4WeightOnlyConfig version 2 to support the TENSOR_CORE_TILE_PACKED packing format
- Add TENSOR_CORE_TILE_PACKED to the PackingFormat enum
- Replace HQQ quantization with _quantize_affine_tinygemm for consistency
- Add comprehensive tests covering serialization, different group sizes, and error conditions
- Update __init__.py files to export the new tensor class
1 parent 69e71d9 commit 0b18c16
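For reference, a minimal usage sketch distilled from the new test suite (the layer shapes and group size are illustrative, not requirements):

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.quantization.quantize_.common.packing_format import PackingFormat

# version 2 config with the new tensor-core-tiled packing format
config = Int4WeightOnlyConfig(
    group_size=128,
    packing_format=PackingFormat.TENSOR_CORE_TILE_PACKED,
    version=2,
)

linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
quantize_(linear, config)
# the weight is now an Int4TensorCoreTilePackedTensor (see test_module_path below)
output = linear(torch.randn(32, 128, dtype=torch.bfloat16, device="cuda"))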

9 files changed (+923, -65 lines)
Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import tempfile
import unittest

import torch
from torch.testing._internal.common_utils import (
    TestCase,
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
)

from torchao.quantization import (
    Int4WeightOnlyConfig,
    quantize_,
)
from torchao.quantization.utils import compute_error
from torchao.sparsity.sparse_api import apply_fake_sparsity
from torchao.utils import (
    TORCH_VERSION_AT_LEAST_2_8,
)

BF16_ACT_CONFIG = Int4WeightOnlyConfig(
    group_size=128,
    packing_format="marlin_sparse",
    version=2,
)


@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
class TestInt4MarlinSparseTensor(TestCase):
    def setUp(self):
        self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []

    @parametrize("config", [BF16_ACT_CONFIG])
    @parametrize(
        "sizes",
        [
            ((128,), 256, 128),
            ((32, 128), 512, 128),
            ((2, 32, 128), 256, 128),
        ],
    )
    def test_linear(self, config, sizes):
        dtype = torch.float16
        device = "cuda"

        M, N, K = sizes
        input = torch.randn(*M, K, dtype=dtype, device=device)
        linear = torch.nn.Linear(K, N, dtype=dtype, device=device)

        apply_fake_sparsity(linear)
        original = linear(input)
        quantize_(linear, config)
        quantized = linear(input)
        self.assertTrue(compute_error(original, quantized) > 20)

        compiled_linear = torch.compile(linear)
        quantized_and_compiled = compiled_linear(input)
        self.assertTrue(compute_error(original, quantized_and_compiled) > 20)

    @unittest.skip("Fix later")
    @parametrize("config", [BF16_ACT_CONFIG])
    def test_to_device(self, config):
        for device in self.GPU_DEVICES:
            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
            quantize_(linear, config)
            linear.to(device)

            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
            quantize_(linear, config)
            linear.to(device=device)

            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
            quantize_(linear, config)
            linear.to(device)

    @parametrize("config", [BF16_ACT_CONFIG])
    def test_module_path(self, config):
        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
        quantize_(linear.cuda(), config)
        self.assertEqual(
            str(type(linear.weight)),
            "<class 'torchao.quantization.Int4MarlinSparseTensor'>",
        )

        with tempfile.NamedTemporaryFile() as f:
            torch.save(linear.state_dict(), f)
            f.seek(0)
            state_dict = torch.load(f)
            self.assertEqual(
                str(type(state_dict["weight"])),
                "<class 'torchao.quantization.Int4MarlinSparseTensor'>",
            )


instantiate_parametrized_tests(TestInt4MarlinSparseTensor)


if __name__ == "__main__":
    run_tests()
Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import tempfile
import unittest

import torch
from torch.testing._internal.common_utils import (
    TestCase,
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
)

from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.quantization.quantize_.common.packing_format import PackingFormat
from torchao.quantization.quantize_.workflows.int4.int4_tensor_core_tile_packed_tensor import (
    Int4TensorCoreTilePackedTensor,
)
from torchao.quantization.utils import compute_error
from torchao.utils import TORCH_VERSION_AT_LEAST_2_4

TENSOR_CORE_TILED_CONFIG = Int4WeightOnlyConfig(
    group_size=128,
    packing_format=PackingFormat.TENSOR_CORE_TILE_PACKED,
    version=2,
)


@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Need pytorch 2.4+")
@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
class TestInt4TensorCoreTilePackedTensor(TestCase):
    def setUp(self):
        self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []

    @parametrize("config", [TENSOR_CORE_TILED_CONFIG])
    @parametrize(
        "sizes",
        [
            ((128,), 256, 128),
            ((32, 128), 512, 128),
            ((2, 32, 128), 256, 128),
        ],
    )
    def test_linear(self, config, sizes):
        dtype = torch.bfloat16
        device = "cuda"

        M, N, K = sizes
        input = torch.randn(*M, K, dtype=dtype, device=device)
        linear = torch.nn.Linear(K, N, dtype=dtype, device=device)

        original = linear(input)
        quantize_(linear, config)
        quantized = linear(input)
        self.assertTrue(compute_error(original, quantized) > 1)

        compiled_linear = torch.compile(linear)
        quantized_and_compiled = compiled_linear(input)
        self.assertTrue(compute_error(original, quantized_and_compiled) > 1)

    def test_from_hp(self):
        """Test creating Int4TensorCoreTilePackedTensor from high precision tensor"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(256, 128, dtype=dtype, device=device)
        block_size = (1, 64)

        tensor = Int4TensorCoreTilePackedTensor.from_hp(hp_tensor, block_size)

        self.assertEqual(tensor.shape, hp_tensor.shape)
        self.assertEqual(tensor.block_size, block_size)
        self.assertEqual(tensor.device.type, device)
        self.assertEqual(tensor.dtype, dtype)

    @parametrize("config", [TENSOR_CORE_TILED_CONFIG])
    def test_to_device(self, config):
        for device in self.GPU_DEVICES:
            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
            quantize_(linear.cuda(), config)
            linear.to(device)

            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
            quantize_(linear.cuda(), config)
            linear.to(device=device)

    @parametrize("config", [TENSOR_CORE_TILED_CONFIG])
    def test_module_path(self, config):
        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
        quantize_(linear.cuda(), config)
        self.assertEqual(
            str(type(linear.weight)),
            "<class 'torchao.quantization.Int4TensorCoreTilePackedTensor'>",
        )

    def test_serialization(self):
        """Test saving and loading the tensor directly and via state_dict"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(128, 256, dtype=dtype, device=device)
        block_size = (1, 64)

        tensor = Int4TensorCoreTilePackedTensor.from_hp(hp_tensor, block_size)

        # Test direct tensor serialization
        with tempfile.NamedTemporaryFile() as f:
            torch.save(tensor, f)
            f.seek(0)
            loaded_tensor = torch.load(f)

        self.assertEqual(loaded_tensor.shape, tensor.shape)
        self.assertEqual(loaded_tensor.block_size, tensor.block_size)
        self.assertEqual(
            str(type(loaded_tensor)),
            "<class 'torchao.quantization.Int4TensorCoreTilePackedTensor'>",
        )

        # Test state_dict serialization
        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
        quantize_(linear.cuda(), TENSOR_CORE_TILED_CONFIG)

        with tempfile.NamedTemporaryFile() as f:
            torch.save(linear.state_dict(), f)
            f.seek(0)
            state_dict = torch.load(f)
            self.assertEqual(
                str(type(state_dict["weight"])),
                "<class 'torchao.quantization.Int4TensorCoreTilePackedTensor'>",
            )

    @parametrize("group_size", [32, 64, 128])
    def test_different_group_sizes(self, group_size):
        """Test with different group sizes"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(256, 512, dtype=dtype, device=device)
        block_size = (1, group_size)

        tensor = Int4TensorCoreTilePackedTensor.from_hp(hp_tensor, block_size)

        self.assertEqual(tensor.shape, hp_tensor.shape)
        self.assertEqual(tensor.block_size, block_size)

    def test_error_conditions(self):
        """Test various error conditions"""
        dtype = torch.bfloat16
        device = "cuda"
        hp_tensor = torch.randn(128, 256, dtype=dtype, device=device)

        # Test invalid block_size length
        with self.assertRaises(AssertionError):
            Int4TensorCoreTilePackedTensor.from_hp(
                hp_tensor, (64,)
            )  # block_size length mismatch

        # Test non-groupwise quantization
        with self.assertRaises(AssertionError):
            Int4TensorCoreTilePackedTensor.from_hp(
                hp_tensor, (2, 64)
            )  # first element should be 1


instantiate_parametrized_tests(TestInt4TensorCoreTilePackedTensor)


if __name__ == "__main__":
    run_tests()
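The packing format exists to feed the tinygemm kernel named in the commit message. Below is a hedged sketch of that low-level path, assuming the PyTorch 2.4+ CUDA semantics where torch._convert_weight_to_int4pack takes a uint8 tensor holding two int4 values per byte; the random weights, nibble order, and shapes here are illustrative only, not the subclass's actual packing internals:

import torch

N, K, group_size, inner_k_tiles = 256, 128, 64, 8  # K must be divisible by inner_k_tiles * 16

# fake int4 weight values, packed two per byte into a [N, K // 2] uint8 tensor
w_q = torch.randint(0, 16, (N, K), dtype=torch.uint8, device="cuda")
w_bytes = (w_q[:, ::2] << 4 | w_q[:, 1::2]).contiguous()

# repack into the tensor-core-tiled layout the kernel expects
w_tc = torch._convert_weight_to_int4pack(w_bytes, inner_k_tiles)

# per-group scales and zero points, shape [K // group_size, N, 2]
scales_and_zeros = torch.ones(K // group_size, N, 2, dtype=torch.bfloat16, device="cuda")

x = torch.randn(8, K, dtype=torch.bfloat16, device="cuda")
y = torch._weight_int4pack_mm(x, w_tc, group_size, scales_and_zeros)  # shape [8, N]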

torchao/quantization/__init__.py

Lines changed: 9 additions & 23 deletions
@@ -1,7 +1,4 @@
-from torchao.kernel import (
-    int_scaled_matmul,
-    safe_int_mm,
-)
+from torchao.kernel import int_scaled_matmul, safe_int_mm
 
 from .autoquant import (
     ALL_AUTOQUANT_CLASS_LIST,
@@ -13,18 +10,8 @@
     OTHER_AUTOQUANT_CLASS_LIST,
     autoquant,
 )
-from .GPTQ import (
-    Int4WeightOnlyGPTQQuantizer,
-    MultiTensor,
-    MultiTensorInputRecorder,
-)
-from .granularity import (
-    PerAxis,
-    PerGroup,
-    PerRow,
-    PerTensor,
-    PerToken,
-)
+from .GPTQ import Int4WeightOnlyGPTQQuantizer, MultiTensor, MultiTensorInputRecorder
+from .granularity import PerAxis, PerGroup, PerRow, PerTensor, PerToken
 from .linear_activation_quantized_tensor import (
     LinearActivationQuantizedTensor,
     to_linear_activation_quantized,
@@ -37,10 +24,7 @@
     Int8DynActInt4WeightLinear,
     Int8DynActInt4WeightQuantizer,
 )
-from .observer import (
-    AffineQuantizedMinMaxObserver,
-    AffineQuantizedObserverBase,
-)
+from .observer import AffineQuantizedMinMaxObserver, AffineQuantizedObserverBase
 from .quant_api import (
     CutlassInt4PackedLayout,
     FbgemmConfig,
@@ -90,8 +74,10 @@
 )
 from .quantize_.workflows import (
     Float8Tensor,
+    Int4MarlinSparseTensor,
     Int4PreshuffledTensor,
     Int4Tensor,
+    Int4TensorCoreTilePackedTensor,
 )
 from .smoothquant import (
     SmoothFakeDynamicallyQuantizedLinear,
@@ -104,9 +90,7 @@
 from .subclass import *  # noqa: F403
 from .transform_module import register_quantize_module_handler
 from .unified import Quantizer, TwoStepQuantizer
-from .utils import (
-    compute_error,
-)
+from .utils import compute_error
 from .weight_only import WeightOnlyInt8QuantLinear
 
 # TODO: remove after migration of APIs are done
@@ -159,6 +143,8 @@
     # tensor subclasses
     "Int4Tensor",
     "Int4PreshuffledTensor",
+    "Int4MarlinSparseTensor",
+    "Int4TensorCoreTilePackedTensor",
     "Float8Tensor",
     # smooth quant - subject to change
     "get_scale",
