Commit 1dc5fe0
Add gguf q4_k_s quantization
Summary:
Didn't implement the algorithm to choose_qparams from gguf, since it's complicated, e.g.
https://github.com/ggml-org/llama.cpp/blob/f423981ac806bf031d83784bcb47d2721bc70f97/ggml/src/ggml-quants.c#L744 and
https://github.com/ggml-org/llama.cpp/blob/f423981ac806bf031d83784bcb47d2721bc70f97/ggml/src/ggml-quants.c#L827C14-L827C28,
but implemented a simple choose_qparams that can fit the gguf format:

Q4_K: w = q * block_scale(6-bit) + block_min(6-bit)

Test Plan:
python test/prototype/test_gguf_quant.py

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent f38c272 commit 1dc5fe0
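The Q4_K formula above composes two levels of scales: each 256-element super block carries one floating-point scale-of-scales and one min-of-mins (fp16 in GGUF Q4_K), while each 32-element block carries a 6-bit quantized scale and min. Below is a minimal sketch of that arithmetic for a single super block; the tensor names mirror the ones introduced in this commit, but the values are made up for illustration and this is not the committed kernel.

import torch

QK_K, N_BLOCKS, BLOCK = 256, 8, 32  # one super block of 256 = 8 blocks of 32

q = torch.randint(0, 16, (N_BLOCKS, BLOCK))                 # 4-bit quantized weights
quantized_block_scale = torch.randint(0, 64, (N_BLOCKS,))   # 6-bit per-block scale
quantized_block_min = torch.randint(0, 64, (N_BLOCKS,))     # 6-bit per-block min
super_block_scale_scale = torch.tensor(0.01, dtype=torch.float16)  # per super block
super_block_min_scale = torch.tensor(0.02, dtype=torch.float16)    # per super block

# Reconstruct the per-block scale/min, then dequantize each block of 32 values:
# w = q * block_scale(6-bit) + block_min(6-bit), as stated in the summary.
block_scale = super_block_scale_scale * quantized_block_scale.to(torch.float16)
block_min = super_block_min_scale * quantized_block_min.to(torch.float16)
w = q.to(torch.float16) * block_scale[:, None] + block_min[:, None]
print(w.shape)  # torch.Size([8, 32])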

4 files changed, +487 -0 lines changed

test/prototype/test_gguf_quant.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
import unittest

import torch

from torchao.prototype.quantization.gguf import (
    GGUFQuantizedTensor,
    GGUFWeightOnlyConfig,
    choose_qparams_gguf,
)
from torchao.quantization import quantize_
from torchao.quantization.utils import compute_error


class TestGGUFQuantization(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(123)
        self.input = torch.randn(2, 256, dtype=torch.float32)
        self.n_super_blocks = 8
        self.block_size = (1, 32)
        self.dtype = torch.uint4

    def test_choose_qparams_gguf(self):
        (
            super_block_scale_scale,
            super_block_min_scale,
            quantized_block_scale,
            quantized_block_min,
        ) = choose_qparams_gguf(self.input, self.block_size, self.dtype)

        assert super_block_scale_scale.shape == (2, 8)
        assert super_block_min_scale.shape == (2, 8)
        assert quantized_block_scale.shape == (2, 32)

    def test_gguf_quantized_tensor_from_float(self):
        gqt = GGUFQuantizedTensor.from_float(
            self.input,
            self.n_super_blocks,
            self.dtype,
        )

        dequant = gqt.dequantize()

        sqnr = compute_error(dequant, self.input)
        self.assertGreater(sqnr, 30)

    def test_quantize_api(self):
        m = torch.nn.Sequential(torch.nn.Linear(256, 64))
        quantize_(m, GGUFWeightOnlyConfig())
        assert type(m[0].weight) == GGUFQuantizedTensor


if __name__ == "__main__":
    unittest.main()
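test_gguf_quantized_tensor_from_float gates on an SQNR above 30 dB between the dequantized tensor and the original float input. As a rough sketch of what that threshold measures (assuming compute_error is the usual signal-to-quantization-noise ratio in dB; check torchao.quantization.utils.compute_error for the exact definition):

import torch

def sqnr(reference: torch.Tensor, reconstructed: torch.Tensor) -> torch.Tensor:
    # Ratio of signal power to quantization-noise power, expressed in dB.
    signal = torch.linalg.norm(reference)
    noise = torch.linalg.norm(reference - reconstructed)
    return 20 * torch.log10(signal / noise)

A value above 30 dB means the quantization error is small relative to the weights themselves, so the dequantized tensor tracks the original closely.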
torchao/prototype/quantization/gguf/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
from .gguf_quantized_tensor import (
    GGUFQuantizedTensor,
    GGUFWeightOnlyConfig,
    choose_qparams_gguf,
)

__all__ = [
    "GGUFQuantizedTensor",
    "choose_qparams_gguf",
    "GGUFWeightOnlyConfig",
]
torchao/prototype/quantization/gguf/gguf_quantized_tensor.py

Lines changed: 252 additions & 0 deletions
@@ -0,0 +1,252 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Define a Tensor subclass to wrap around ggml q4_0 tensor layout.
# The layout is the following:
# ┌─────────────────────┬───────────────────────────┐
# │                     │                           │
# │                     │                           │
# │  2 bytes (1xfp16)   │    16 bytes (32xint4)     │
# │  group-wise scale   │    group-wise weights     │
# │                     │                           │
# │                     │                           │
# └─────────────────────┴───────────────────────────┘
#
# Notice that the 16 bytes (32 int4) are interleaved:
# [0th value, 16th value, 1st value, 17th value, ..., 15th, 31st]
#
# This layout is handled internally in the tensor subclass.
from dataclasses import dataclass
from typing import Optional

import torch

from torchao.core.config import AOBaseConfig
from torchao.quantization.quant_primitives import (
    choose_qparams_gguf,
    dequantize_gguf,
    quantize_gguf,
)
from torchao.quantization.transform_module import register_quantize_module_handler
from torchao.utils import TorchAOBaseTensor

_QK_K = 256

__all__ = [
    "GGUFQuantizedTensor",
    "choose_qparams_gguf",
    "quantize_gguf",
    "dequantize_gguf",
    "GGUFWeightOnlyConfig",
]


class GGUFQuantizedTensor(TorchAOBaseTensor):
    """
    A Tensor subclass that when applied to a weight used in a linear op/module,
    changes that linear op to a weight-only int4 quantized linear op with groupwise
    affine quantization on the weight.
    """

    @staticmethod
    def __new__(
        cls,
        n_super_blocks,
        super_block_scale_scale,
        super_block_min_scale,
        quantized_block_scale,
        quantized_block_min,
        int_data,
        shape,
        **kwargs,
    ):
        kwargs["device"] = kwargs.get("device", super_block_scale_scale.device)
        kwargs["dtype"] = kwargs.get("dtype", super_block_scale_scale.dtype)
        kwargs["requires_grad"] = False
        return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]

    def __init__(
        self,
        n_super_blocks,
        super_block_scale_scale,
        super_block_min_scale,
        quantized_block_scale,
        quantized_block_min,
        int_data,
        shape,
        **kwargs,
    ):
        self.n_super_blocks = n_super_blocks
        self.super_block_scale_scale = super_block_scale_scale
        self.super_block_min_scale = super_block_min_scale
        self.quantized_block_scale = quantized_block_scale
        self.quantized_block_min = quantized_block_min
        self.int_data = int_data

    def _apply_fn_to_data(self, fn):
        return self.__class__(
            self.n_super_blocks,
            fn(self.super_block_scale_scale),
            fn(self.super_block_min_scale),
            fn(self.quantized_block_scale),
            fn(self.quantized_block_min),
            fn(self.int_data),
            self.shape,
            dtype=self.dtype,
        )

    def __tensor_flatten__(self):
        return [
            "super_block_scale_scale",
            "super_block_min_scale",
            "quantized_block_scale",
            "quantized_block_min",
            "int_data",
        ], (
            self.n_super_blocks,
            self.dtype,
            self.shape,
        )

    @classmethod
    def __tensor_unflatten__(
        cls, tensor_data_dict, attributes, outer_size=None, outer_stride=None
    ):
        (
            super_block_scale_scale,
            super_block_min_scale,
            quantized_block_scale,
            quantized_block_min,
            int_data,
        ) = (
            tensor_data_dict["super_block_scale_scale"],
            tensor_data_dict["super_block_min_scale"],
            tensor_data_dict["quantized_block_scale"],
            tensor_data_dict["quantized_block_min"],
            tensor_data_dict["int_data"],
        )
        n_super_blocks, dtype, shape = attributes
        return cls(
            n_super_blocks,
            super_block_scale_scale,
            super_block_min_scale,
            quantized_block_scale,
            quantized_block_min,
            int_data,
            shape if outer_size is None else outer_size,
            dtype=dtype,
        )

    def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
        block_size = tuple(
            [1] * (self.int_data.ndim - 1) + [_QK_K // self.n_super_blocks]
        )
        return dequantize_gguf(
            self.int_data,
            block_size,
            self.dtype,
            self.super_block_scale_scale,
            self.super_block_min_scale,
            self.quantized_block_scale,
            self.quantized_block_min,
        )

    def detach(self):
        """
        Returns a new `GGUFQuantizedTensor`.
        """
        return self.__class__(
            self.n_super_blocks,
            self.super_block_scale_scale.detach(),
            self.super_block_min_scale.detach(),
            self.quantized_block_scale.detach(),
            self.quantized_block_min.detach(),
            self.int_data.detach(),
            self.shape,
            dtype=self.dtype,
        )

    def requires_grad_(self, requires_grad=False):
        """
        Modifies the tensor's `requires_grad` status in-place.
        """
        assert not requires_grad, "Only requires_grad == False is supported"
        return self

    @classmethod
    def from_float(cls, input_float, n_super_blocks, target_dtype):
        """
        Method used to convert a linear weight tensor to an instance of the
        GGUFQuantizedTensor subclass.

        Example usage::

            model.lin_mod.weight = (
                GGUFQuantizedTensor.from_float(model.lin_mod.weight, 8, torch.uint4)
            )
        """
        assert (
            target_dtype == torch.uint4
        ), "only uint4 quantization is supported right now"
        block_size = (1, _QK_K // n_super_blocks)
        (
            super_block_scale_scale,
            super_block_min_scale,
            quantized_block_scale,
            quantized_block_min,
        ) = choose_qparams_gguf(input_float, block_size, target_dtype)

        int_data = quantize_gguf(
            input_float,
            block_size,
            target_dtype,
            super_block_scale_scale,
            super_block_min_scale,
            quantized_block_scale,
            quantized_block_min,
        )
        return cls(
            n_super_blocks,
            super_block_scale_scale,
            super_block_min_scale,
            quantized_block_scale,
            quantized_block_min,
            int_data,
            input_float.shape,
            dtype=torch.float16,
        )


@dataclass
class GGUFWeightOnlyConfig(AOBaseConfig):
    dtype: torch.dtype = torch.uint4
    n_super_blocks: int = 8


@register_quantize_module_handler(GGUFWeightOnlyConfig)
def _gguf_weight_only_transform(
    module: torch.nn.Module,
    config: GGUFWeightOnlyConfig,
):
    """
    Applies gguf weight-only quantization to linear layers.

    Args:
        dtype: torch.uint1 to torch.uint8, torch.int32 supported.
        n_super_blocks: the number of blocks in a 256-element super block for gguf,
          e.g. when it is 8 we have 8 blocks of 32 elements in each super block of 256.
    Returns:
        The module with its weight replaced by a GGUFQuantizedTensor (unchanged if the
        weight shape is unsupported).
    """
    weight = module.weight
    if (weight.ndim != 2) or (weight.shape[-1] % 256 != 0):
        return module

    quantized_weight = GGUFQuantizedTensor.from_float(
        weight, n_super_blocks=config.n_super_blocks, target_dtype=config.dtype
    )
    module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
    return module
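For reference, a small end-to-end usage sketch of the API added in this commit, mirroring test_quantize_api above. The transform skips any weight whose last dimension is not a multiple of 256, so the linear layer below uses in_features=256; the dequantize() call at the end is only for inspection.

import torch

from torchao.prototype.quantization.gguf import GGUFQuantizedTensor, GGUFWeightOnlyConfig
from torchao.quantization import quantize_

model = torch.nn.Sequential(torch.nn.Linear(256, 64))

# Replace the linear weight with a GGUF-style uint4 weight-only quantized tensor:
# super blocks of 256 elements, split into 8 blocks of 32 (the config defaults).
quantize_(model, GGUFWeightOnlyConfig(dtype=torch.uint4, n_super_blocks=8))

assert type(model[0].weight) == GGUFQuantizedTensor
deq = model[0].weight.dequantize()  # reconstruct a floating-point view of the weight
print(deq.shape)  # torch.Size([64, 256])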
