pytorch
diff --git a/‎.github/workflows/torchao_experimental_test.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/torchao_experimental_test.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎test/quantization/quantize_/workflows/intx/test_intx_opaque_tensor.py‎
Lines changed: 339 additions & 0 deletions b/‎test/quantization/quantize_/workflows/intx/test_intx_opaque_tensor.py‎
Lines changed: 339 additions & 0 deletions
diff --git a/‎torchao/quantization/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎torchao/quantization/__init__.py‎
Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,7 @@ jobs:
           python torchao/experimental/tests/test_embedding_xbit_quantizer.py
           python torchao/experimental/tests/test_quant_passes.py
           pytest -s test/prototype/test_dynamic_activation_lut.py
+          pytest -s test/quantization/quantize_/workflows/intx/test_intx_opaque_tensor.py
       - name: Run kernels/cpu/aarch64/tests
         if: runner.os == 'macOS'
         run: |
 
@@ -0,0 +1,339 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import tempfile
+import unittest
+
+import torch
+from parameterized import param, parameterized
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+)
+
+from torchao.experimental.op_lib_utils import _check_torchao_ops_loaded
+from torchao.quantization.granularity import PerAxis, PerGroup
+from torchao.quantization.quant_api import (
+    Int8DynamicActivationIntxWeightConfig,
+    MappingType,
+    quantize_,
+)
+from torchao.quantization.quantize_.common import PackingFormat
+from torchao.quantization.utils import compute_error
+
+
+def _get_accuracy_test_cases():
+    """Build the parameter grid for ``TestIntxOpaqueTensor.test_accuracy``.
+
+    Returns:
+        A list of ``param`` objects, one for every combination of model dtype,
+        (packing format, compute target) pair, weight dtype, weight mapping
+        type and weight granularity that ``_is_valid_test_combination`` accepts.
+    """
+    MODEL_DTYPES = [
+        torch.float32,
+        torch.bfloat16,
+    ]
+
+    # Each entry pairs a packing format with the compute target used with it.
+    PACKING_FORMATS = [
+        (PackingFormat.UNPACKED_TO_INT8, None),
+        (PackingFormat.OPAQUE, "aten"),
+        (PackingFormat.OPAQUE, "torchao_auto"),
+        (PackingFormat.OPAQUE, "torchao_lowbit"),
+        (PackingFormat.OPAQUE, "torchao_kleidiai"),
+    ]
+
+    WEIGHT_DTYPES = [
+        torch.int1,
+        torch.int2,
+        torch.int3,
+        torch.int4,
+        torch.int5,
+        torch.int6,
+        torch.int7,
+        torch.int8,
+    ]
+
+    MAPPING_TYPES = [
+        MappingType.SYMMETRIC,
+        MappingType.ASYMMETRIC,
+        MappingType.SYMMETRIC_NO_CLIPPING_ERR,
+    ]
+
+    GRANULARITIES = [PerGroup(128), PerAxis(0)]
+
+    def _is_valid_test_combination(
+        model_dtype,
+        packing_format,
+        compute_target,
+        weight_dtype,
+        weight_mapping_type,
+        weight_granularity,
+    ):
+        """Return True when the combination should be turned into a test case."""
+        # ATEN restrictions
+        if (packing_format == PackingFormat.OPAQUE) and (compute_target == "aten"):
+            if weight_dtype != torch.int4:
+                return False
+            if weight_mapping_type == MappingType.ASYMMETRIC:
+                return False
+            if model_dtype != torch.float32:
+                return False
+
+        # TORCHAO_KLEIDIAI restrictions
+        if (packing_format == PackingFormat.OPAQUE) and (
+            compute_target == "torchao_kleidiai"
+        ):
+            if weight_dtype != torch.int4:
+                return False
+            if weight_mapping_type == MappingType.ASYMMETRIC:
+                return False
+
+        # SYMMETRIC_NO_CLIPPING_ERR does not work well with int1
+        if (
+            weight_dtype == torch.int1
+            and weight_mapping_type == MappingType.SYMMETRIC_NO_CLIPPING_ERR
+        ):
+            return False
+
+        return True
+
+    test_cases = [
+        param(
+            model_dtype=mdt,
+            packing_format=pf,
+            compute_target=ct,
+            weight_dtype=dt,
+            weight_mapping_type=mt,
+            weight_granularity=gr,
+        )
+        for mdt in MODEL_DTYPES
+        for pf, ct in PACKING_FORMATS
+        for dt in WEIGHT_DTYPES
+        for mt in MAPPING_TYPES
+        for gr in GRANULARITIES
+        # The first argument is the model dtype (mdt), not the weight dtype:
+        # the ATEN rule compares it against torch.float32, so passing the
+        # weight dtype here would reject every "aten" combination.
+        if _is_valid_test_combination(mdt, pf, ct, dt, mt, gr)
+    ]
+
+    return test_cases
+
+
+# Probe once, at import time, whether the torchao ops can be loaded; the
+# resulting flag is what the skipIf decorator on the test class reads.
+try:
+    _check_torchao_ops_loaded()
+except Exception:
+    # Any failure of the check is treated as "ops not available".
+    _TORCHAO_OPS_LOADED = False
+else:
+    _TORCHAO_OPS_LOADED = True
+
+
+@unittest.skipIf(not _TORCHAO_OPS_LOADED, "Need torchao ops")
+class TestIntxOpaqueTensor(TestCase):
+    """Tests for ``Int8DynamicActivationIntxWeightConfig`` with opaque packing.
+
+    The whole class is skipped when ``_TORCHAO_OPS_LOADED`` is false.
+    """
+
+    @parameterized.expand(
+        _get_accuracy_test_cases(),
+        name_func=lambda f, _, params: f.__name__ + f"_{params.kwargs}",
+    )
+    def test_accuracy(
+        self,
+        model_dtype,
+        packing_format,
+        compute_target,
+        weight_dtype,
+        weight_mapping_type,
+        weight_granularity,
+    ):
+        """
+        Checks the accuracy of packed layouts.
+
+        A two-layer linear model is quantized twice from the same starting
+        weights: once with the packing format / compute target under test, and
+        once with ``PackingFormat.UNPACKED_TO_INT8`` and no compute target as
+        the reference. The value returned by ``compute_error`` for the two
+        outputs must exceed 30.
+        """
+        m = 3
+        n = 1071
+        k = 2048
+        activations = torch.randn(m, k).to(model_dtype)
+        model = torch.nn.Sequential(
+            *[torch.nn.Linear(k, k, bias=False), torch.nn.Linear(k, n, bias=True)]
+        ).to(model_dtype)
+
+        # Model quantized with the configuration under test.
+        quantized_model = copy.deepcopy(model)
+        quantize_(
+            quantized_model,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=weight_dtype,
+                weight_granularity=weight_granularity,
+                weight_mapping_type=weight_mapping_type,
+                packing_format=packing_format,
+                compute_target=compute_target,
+                version=2,
+            ),
+        )
+
+        # Reference: same weights and quantization settings, unpacked layout.
+        quantized_model_reference = copy.deepcopy(model)
+        quantize_(
+            quantized_model_reference,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=weight_dtype,
+                weight_granularity=weight_granularity,
+                weight_mapping_type=weight_mapping_type,
+                packing_format=PackingFormat.UNPACKED_TO_INT8,
+                compute_target=None,
+                version=2,
+            ),
+        )
+
+        with torch.no_grad():
+            result = quantized_model(activations)
+            expected_result = quantized_model_reference(activations)
+
+        sqnr = compute_error(result, expected_result)
+        self.assertTrue(sqnr > 30, f"Got SQNR of {sqnr}")
+
+    def test_export_compile_aoti(
+        self,
+    ):
+        """Check that export, ``torch.compile`` and AOTI all match eager output.
+
+        A three-layer linear model is quantized with the opaque packing format
+        and the ``"torchao_auto"`` compute target; each execution path is then
+        compared against the eager result with ``torch.allclose``.
+        """
+        m = 3
+        k0 = 512
+        k1 = 256
+        k2 = 128
+        k3 = 1024
+        weight_dtype = torch.int4
+        weight_granularity = PerAxis(0)
+        weight_mapping_type = MappingType.ASYMMETRIC
+
+        layers = [
+            torch.nn.Linear(k0, k1, bias=False),
+            torch.nn.Linear(k1, k2, bias=True),
+            torch.nn.Linear(k2, k3, bias=False),
+        ]
+        model = torch.nn.Sequential(*layers)
+        activations = torch.randn(2, 1, m, k0, dtype=torch.float32)
+        # Dims 0 and 2 of the 4-D input are left to AUTO; dims 1 and 3 are
+        # pinned as STATIC.
+        dynamic_shapes = {
+            "input": {
+                0: torch.export.Dim.AUTO,
+                1: torch.export.Dim.STATIC,
+                2: torch.export.Dim.AUTO,
+                3: torch.export.Dim.STATIC,
+            }
+        }
+
+        quantize_(
+            model,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=weight_dtype,
+                weight_granularity=weight_granularity,
+                weight_mapping_type=weight_mapping_type,
+                packing_format=PackingFormat.OPAQUE,
+                compute_target="torchao_auto",
+                version=2,
+            ),
+        )
+        eager_results = model(activations)
+
+        # Export
+        exported = torch.export.export(
+            model, (activations,), strict=True, dynamic_shapes=dynamic_shapes
+        )
+        exported_results = exported.module()(activations)
+        self.assertTrue(torch.allclose(eager_results, exported_results))
+
+        # Compile
+        compiled = torch.compile(model)
+        with torch.no_grad():
+            compiled_results = compiled(activations)
+        self.assertTrue(torch.allclose(eager_results, compiled_results))
+
+        # AOTI
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            package_path = f"{tmpdirname}/model.pt2"
+            torch._inductor.aoti_compile_and_package(
+                exported, package_path=package_path
+            )
+            fn = torch._inductor.aoti_load_package(package_path)
+            aoti_results = fn(activations)
+            self.assertTrue(torch.allclose(eager_results, aoti_results))
+
+    @parameterized.expand(
+        [
+            param(packing_format=pf, compute_target=ct)
+            for (pf, ct) in [
+                (PackingFormat.OPAQUE, "torchao_auto"),
+                (PackingFormat.OPAQUE, "aten"),
+            ]
+        ],
+        name_func=lambda f, _, params: f.__name__ + f"_{params.kwargs}",
+    )
+    def test_serialization(self, packing_format, compute_target):
+        """Save a quantized state dict, reload it, and compare model outputs.
+
+        The state dict of the quantized model is written with ``torch.save``,
+        read back with ``torch.load(..., weights_only=True)`` and assigned into
+        a second model; both models must produce ``allclose`` outputs.
+        """
+        layers = [
+            torch.nn.Linear(512, 256),
+        ]
+        # NOTE(review): model and model2 wrap the same Linear instance, so
+        # quantizing model presumably also changes what model2 holds before
+        # the state dict is loaded -- confirm the round trip is really what
+        # makes the final comparison pass.
+        model = torch.nn.Sequential(*layers)
+        model2 = torch.nn.Sequential(*layers)
+        activations = torch.randn(1, 512, dtype=torch.float32)
+
+        quantize_(
+            model,
+            Int8DynamicActivationIntxWeightConfig(
+                weight_dtype=torch.int4,
+                weight_granularity=PerGroup(64),
+                packing_format=packing_format,
+                compute_target=compute_target,
+                version=2,
+            ),
+        )
+        expected = model(activations)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            torch.save(model.state_dict(), f"{tmpdirname}/model.pt")
+            state_dict = torch.load(
+                f"{tmpdirname}/model.pt", map_location="cpu", weights_only=True
+            )
+
+            # Load deserialized weights into model2 and check result
+            model2.load_state_dict(state_dict, assign=True)
+            actual = model2(activations)
+            self.assertTrue(torch.allclose(expected, actual))
+
+    def test_moe_quant_intx(self):
+        """Quantize an MoE feed-forward module and check eager/compiled accuracy.
+
+        The unquantized output is the baseline; the quantized eager output and
+        the quantized compiled output must each give a ``compute_error`` value
+        above 30 against it.
+        """
+        # The prototype MoE helpers are imported here rather than at module
+        # level, so they are only needed when this test actually runs.
+        from torchao.prototype.moe_quant.quantizable_moe_modules import (
+            MOEFeedForwardAOQuantizable,
+        )
+        from torchao.prototype.moe_quant.utils import (
+            FakeExtraDimTensor,
+            MoEQuantConfig,
+            UseFakeExtraDimTensor,
+            cond_ffn_filter,
+        )
+
+        with torch.device("cpu"):
+            model = MOEFeedForwardAOQuantizable(512, 256, 8, 2, empty_init=False).to(
+                torch.float32
+            )
+            x = torch.randn(8, 512, dtype=torch.float32)
+
+        # Baseline output, taken before the model is quantized.
+        out = model(x).clone()
+
+        base_config = Int8DynamicActivationIntxWeightConfig(
+            packing_format=PackingFormat.OPAQUE,
+            compute_target="torchao_auto",
+            version=2,
+        )
+        moe_config = MoEQuantConfig(
+            base_config, use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE
+        )
+
+        quantize_(model, moe_config, cond_ffn_filter)
+
+        out_q = model(x).clone()
+        # unittest assertion instead of a bare ``assert`` so the check is not
+        # stripped when Python runs with -O.
+        self.assertIsInstance(model.experts.w1, FakeExtraDimTensor)
+
+        mod_c = torch.compile(model, mode="reduce-overhead")
+
+        # Two calls before the measured one, as in the original test
+        # (presumably warm-up for "reduce-overhead" mode).
+        mod_c(x)
+        mod_c(x)
+
+        out_qc = mod_c(x).clone()
+
+        self.assertTrue(compute_error(out_q, out) > 30)
+        self.assertTrue(compute_error(out_qc, out) > 30)
+
+
+# Run the tests in this file when it is executed directly as a script.
+if __name__ == "__main__":
+    run_tests()
@@ -94,6 +94,7 @@
     Int4OpaqueTensor,
     Int4PreshuffledTensor,
     Int4Tensor,
+    IntxOpaqueTensor,
     IntxUnpackedToInt8Tensor,
 )
 from .smoothquant import (
@@ -163,6 +164,7 @@
     "Int4Tensor",
     "Int4PreshuffledTensor",
     "Int4MarlinSparseTensor",
+    "IntxOpaqueTensor",
     "IntxUnpackedToInt8Tensor",
     "Float8Tensor",
     "Int4OpaqueTensor",