
Commit d30c983

Improve Marlin accuracy by default, and add a MARLIN_FP16 backend for faster but less accurate inference (#1317)
* sync marlin kernel from upstream for fp32 reduce ops precision fix
* add `MARLIN_FP16` backend

Signed-off-by: Qubitium <[email protected]>
1 parent 629e7ca commit d30c983

File tree: 13 files changed (+305, -78 lines)

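In user terms, BACKEND.MARLIN now runs the Marlin GEMM with fp32 reduce ops for higher accuracy, while the new BACKEND.MARLIN_FP16 keeps fp16 reduce for slightly more speed at lower accuracy. A minimal usage sketch, assuming the usual top-level GPTQModel.load entry point (which routes into from_quantized) and a placeholder model id:

from gptqmodel import BACKEND, GPTQModel  # assumes both are exported at the package top level

model_id = "your-org/your-gptq-4bit-model"  # hypothetical checkpoint path, substitute your own

# Default choice: fp32 reduce ops (more accurate, slightly slower).
model = GPTQModel.load(model_id, backend=BACKEND.MARLIN)

# Opt-in: fp16 reduce ops (less accurate, slightly faster).
model_fp16 = GPTQModel.load(model_id, backend=BACKEND.MARLIN_FP16)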

gptqmodel/models/loader.py

Lines changed: 5 additions & 6 deletions
@@ -47,8 +47,7 @@
 from ..utils.backend import BACKEND
 from ..utils.importer import auto_select_device, normalize_device_device_map, select_quant_linear
 from ..utils.logger import setup_logger
-from ..utils.marlin import (_validate_marlin_compatibility,
-                            _validate_marlin_device_support)
+from ..utils.marlin import _validate_marlin_compatibility, _validate_marlin_device_support
 from ..utils.model import (auto_dtype, convert_gptq_v1_to_v2_format, find_modules, get_checkpoints,
                            get_moe_layer_modules, gptqmodel_post_init, load_checkpoint_in_model_then_tie_weights,
                            make_quant, simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes)
@@ -339,14 +338,14 @@ def from_quantized(
 
         if qcfg.format == FORMAT.MARLIN:
             # format marlin requires marlin kernel
-            if backend != BACKEND.MARLIN and backend != BACKEND.AUTO:
+            if backend not in [BACKEND.MARLIN, BACKEND.MARLIN_FP16] and backend != BACKEND.AUTO:
                 raise TypeError(f"FORMAT.MARLIN requires BACKEND.AUTO or BACKEND.MARLIN: actual = `{backend}`.")
             backend = BACKEND.MARLIN
 
         marlin_compatible = False if backend == BACKEND.IPEX else _validate_marlin_device_support()
 
         # check for marlin compat for cuda device onnly
-        if backend != BACKEND.MARLIN and device == DEVICE.CUDA:
+        if backend not in [BACKEND.MARLIN, BACKEND.MARLIN_FP16] and device == DEVICE.CUDA:
             unsupported = _validate_marlin_compatibility(qcfg)
             if unsupported is None and marlin_compatible:
                 logger.info(
@@ -504,7 +503,7 @@ def skip(*args, **kwargs):
             load_checkpoint_in_model = False
             qcfg.runtime_format = FORMAT.GPTQ_V2
 
-        if backend == BACKEND.MARLIN and (
+        if backend in [BACKEND.MARLIN, BACKEND.MARLIN_FP16] and (
                 preload_qlinear_kernel == ExllamaV2QuantLinear or qcfg.format == FORMAT.MARLIN):
             if is_sharded:
                 raise ValueError(
@@ -541,7 +540,7 @@ def skip(*args, **kwargs):
 
         # If we use marlin or bitblas to load the quantized model, the model is already a converted model,
         # and we no longer need to call load_checkpoint_in_model()
-        if load_checkpoint_in_model and backend not in [BACKEND.MARLIN, BACKEND.BITBLAS]:
+        if load_checkpoint_in_model and backend not in [BACKEND.MARLIN, BACKEND.MARLIN_FP16, BACKEND.BITBLAS]:
            load_checkpoint_in_model_then_tie_weights(
                model,
                dtype=torch_dtype,

gptqmodel/nn_modules/qlinear/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -23,6 +23,7 @@
 import torch.nn as nn
 import transformers
 from gptqmodel.adapter.adapter import LORA_MERGED_WEIGHT_PATHS, Adapter
+from gptqmodel.utils.backend import BACKEND
 
 from ...models._const import DEVICE, PLATFORM
 
@@ -52,6 +53,7 @@ def __init__(self,
                  out_features: int,
                  bias: bool,
                  pack_dtype: t.dtype,
+                 backend: BACKEND,
                  adapter: Adapter,
                  name: str = None,
                  register_buffers: bool = False,
@@ -68,6 +70,7 @@ def __init__(self,
         self.bits = bits
         self.desc_act = desc_act
         self.pack_dtype = pack_dtype
+        self.backend = backend
         self.maxq = 2 ** self.bits - 1
         self.pack_dtype = pack_dtype
         # we need to clone the adapter since passed in adapter may be shared

gptqmodel/nn_modules/qlinear/exllama.py

Lines changed: 17 additions & 16 deletions
@@ -43,15 +43,6 @@ def ext_make_q4(qweight, qzeros, scales, g_idx, device):
     return make_q4(qweight, qzeros, scales, g_idx if g_idx is not None else NON_TENSOR, device)
 
 
-def ext_q4_matmul(x, q4, q4_width):
-    """Matrix multiplication, returns x @ q4"""
-    outshape = x.shape[:-1] + (q4_width,)
-    x = x.view(-1, x.shape[-1])
-    output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)
-
-    q4_matmul(x, q4, output)
-
-    return output.view(outshape)
 
 
 class ExllamaQuantLinear(BaseQuantLinear):
@@ -151,6 +142,22 @@ def post_init(self):
 
         super().post_init()
 
+    def ext_q4_matmul(self, x, q4, q4_width):
+        """Matrix multiplication, returns x @ q4"""
+        outshape = x.shape[:-1] + (q4_width,)
+        x = x.view(-1, x.shape[-1])
+
+        output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)
+        q4_matmul(x, q4, output)
+
+        if self.bias is not None:
+            output.add_(self.bias)
+
+        if self.adapter:
+            output = self.adapter.apply(x=x, out=output)
+
+        return output.view(outshape)
+
 
     def forward(self, x):
         x_dtype = x.dtype
@@ -166,12 +173,6 @@ def forward(self, x):
         # if x.size(-1) != self.in_features:
         #     x = F.pad(x, self.in_features_padding_shape)
 
-        out = ext_q4_matmul(x, self.q4, self.width)
-
-        if self.bias is not None:
-            out.add_(self.bias)
-
-        if self.adapter:
-            out = self.adapter.apply(x=x, out=out)
+        out = self.ext_q4_matmul(x, self.q4, self.width)
 
         return out.to(x_dtype)

gptqmodel/nn_modules/qlinear/marlin.py

Lines changed: 27 additions & 16 deletions
@@ -23,6 +23,7 @@
 import torch
 from gptqmodel.adapter.adapter import Adapter, Lora
 from gptqmodel.nn_modules.qlinear import BaseQuantLinear
+from gptqmodel.utils.backend import BACKEND
 from torch.nn.parameter import Parameter
 
 from ...models._const import DEVICE, PLATFORM
@@ -133,23 +134,29 @@ def apply_gptq_marlin_linear(
         output_size_per_partition: int,
         input_size_per_partition: int,
         is_k_full: bool,
-        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        bias: torch.Tensor,
+        fp32: bool,
+) -> torch.Tensor:
+
     reshaped_x = input.reshape(-1, input.shape[-1])
     out_shape = input.shape[:-1] + (output_size_per_partition, )
 
-    output = gptqmodel_marlin_kernels.gptq_marlin_gemm(reshaped_x,
-                                                       weight,
-                                                       weight_scale,
-                                                       weight_zp,
-                                                       g_idx,
-                                                       g_idx_sort_indices,
-                                                       workspace,
-                                                       num_bits,
-                                                       reshaped_x.shape[0],
-                                                       output_size_per_partition,
-                                                       input_size_per_partition,
-                                                       is_k_full,
-                                                       False)
+    output = gptqmodel_marlin_kernels.gptq_marlin_gemm(
+        reshaped_x,
+        weight,
+        weight_scale,
+        weight_zp,
+        g_idx,
+        g_idx_sort_indices,
+        workspace,
+        num_bits,
+        reshaped_x.shape[0],
+        output_size_per_partition,
+        input_size_per_partition,
+        is_k_full,
+        False,
+        fp32, # <- True: enable fp32 reduce for higher accuracy, False: fp16
+    )
 
     if bias is not None:
         output.add_(bias) # In-place add
@@ -191,8 +198,8 @@ def __init__(
                 f"Trying to use the marlin backend, but could not import the C++/CUDA dependencies with the following error: {marlin_import_exception}"
             )
 
-        self.original_in_features = in_features
-        self.original_out_features = out_features
+        # self.original_in_features = in_features
+        # self.original_out_features = out_features
 
         if desc_act and group_size == -1:
             # In this case, act_order == True is the same as act_order == False
@@ -212,6 +219,9 @@ def __init__(
                          register_buffers=False,
                          **kwargs)
 
+        # toggle fp32 mode depending on MARLIN or MARLIN_FP16 backend
+        self.fp32 = True if self.backend is BACKEND.MARLIN else False
+
         # Determine sharding
         if marlin_repeat_scales_on_all_ranks(desc_act,
                                              self.group_size,
@@ -390,6 +400,7 @@ def forward(self, A: torch.Tensor):
             input_size_per_partition=self.in_features,
             is_k_full=self.is_k_full,
             bias=self.bias,
+            fp32=self.fp32,
         )
 
         if self.adapter:
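The accuracy/speed switch reduces to a single boolean that MarlinQuantLinear derives from the selected backend and forwards to gptqmodel_marlin_kernels.gptq_marlin_gemm as its new last argument. A standalone sketch of that mapping (the helper name is illustrative, not part of the library):

from gptqmodel.utils.backend import BACKEND

def marlin_uses_fp32_reduce(backend: BACKEND) -> bool:
    # Mirrors the toggle in MarlinQuantLinear.__init__: MARLIN -> fp32 reduce
    # (higher accuracy), MARLIN_FP16 -> fp16 reduce (slightly faster).
    return backend is BACKEND.MARLIN

assert marlin_uses_fp32_reduce(BACKEND.MARLIN) is True
assert marlin_uses_fp32_reduce(BACKEND.MARLIN_FP16) is False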

gptqmodel/quantization/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON,
-                     QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig)
+from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_JSON, QUANT_CONFIG_FILENAME,
+                     QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig)
 from .gptq import GPTQ
 from .quantizer import Quantizer, quantize

gptqmodel/utils/backend.py

Lines changed: 12 additions & 11 deletions
@@ -20,15 +20,16 @@
 class BACKEND(str, Enum):
     AUTO = "auto" # choose the optimal local kernel based on quant_config compatibility
     AUTO_TRAINABLE = "auto_trainable" # choose the optimal trainable local kernel for post-quant training
-    CUDA = "cuda"
-    TORCH = "torch"
-    TRITON = "triton"
-    EXLLAMA_V1 = "exllama_v1"
-    EXLLAMA_V2 = "exllama_v2"
+    CUDA = "cuda" # OK: Performance same as Torch for most cases
+    TORCH = "torch" # GOOD: about 80% of triton
+    TRITON = "triton" # VERY GOOD: all-around kernel
+    EXLLAMA_V1 = "exllama_v1" # FAST: optimized for batching == 1
+    EXLLAMA_V2 = "exllama_v2" # FASTER: optimized for batching > 1
     # EXLLAMA_EORA = "exllama_eora"
-    MARLIN = "marlin"
-    BITBLAS = "bitblas"
-    IPEX = "ipex"
-    VLLM = "vllm" # external inference engine (CUDA + ROCM + IPEX)
-    SGLANG = "sglang" # external inference engine (CUDA + ROCm)
-    MLX = "mlx" # external inference engine (Apple MLX on M1+)
+    MARLIN = "marlin" # FASTEST: marlin reduce ops in fp32 (higher precision -> more accurate, slightly slower)
+    MARLIN_FP16 = "marlin_fp16" # FASTEST and then some: marlin reduce ops in fp16 (lower precision -> less accurate, slightly faster)
+    BITBLAS = "bitblas" # EXTREMELY FAST: speed at the cost of 10+ minutes of AOT (ahead of time compilation with disk cache)
+    IPEX = "ipex" # Best kernel for Intel XPU and Intel/AMD CPU with AVX512, AMX, XMX
+    VLLM = "vllm" # External inference engine: CUDA + ROCm + IPEX
+    SGLANG = "sglang" # External inference engine: CUDA + ROCm
+    MLX = "mlx" # External inference engine: Apple MLX on M1+ (Apple Silicon)
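Because BACKEND subclasses both str and Enum, the new member can be selected by its string value as well as by attribute, which is how config files and CLI flags typically pass it in. A quick sanity sketch:

from gptqmodel.utils.backend import BACKEND

# Value lookup and attribute access resolve to the same member.
assert BACKEND("marlin_fp16") is BACKEND.MARLIN_FP16
assert BACKEND.MARLIN_FP16.value == "marlin_fp16"

# Distinct members, even though both route to the same Marlin kernel class.
assert BACKEND.MARLIN is not BACKEND.MARLIN_FP16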

gptqmodel/utils/importer.py

Lines changed: 4 additions & 4 deletions
@@ -52,9 +52,9 @@
 })
 
 FORMAT_DICT = {
-    FORMAT.GPTQ: [BACKEND.MARLIN, BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.IPEX, BACKEND.TORCH], # BACKEND.EXLLAMA_EORA
-    FORMAT.GPTQ_V2: [BACKEND.MARLIN, BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.TORCH], # , BACKEND.EXLLAMA_EORA
-    FORMAT.MARLIN: [BACKEND.MARLIN],
+    FORMAT.GPTQ: [BACKEND.MARLIN, BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.IPEX, BACKEND.TORCH, BACKEND.MARLIN_FP16], # BACKEND.EXLLAMA_EORA
+    FORMAT.GPTQ_V2: [BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.TORCH], # , BACKEND.EXLLAMA_EORA
+    FORMAT.MARLIN: [BACKEND.MARLIN, BACKEND.MARLIN_FP16],
     FORMAT.BITBLAS: [BACKEND.BITBLAS],
     FORMAT.IPEX: [BACKEND.IPEX],
 }
@@ -228,7 +228,7 @@ def select_quant_linear(
             qlinear = TritonV2QuantLinear
         elif backend == BACKEND.BITBLAS:
             qlinear = BitBLASQuantLinear
-        elif backend == BACKEND.MARLIN:
+        elif backend in [BACKEND.MARLIN, BACKEND.MARLIN_FP16]:
             qlinear = MarlinQuantLinear
         # elif backend == BACKEND.EXLLAMA_EORA:
         #     qlinear = ExllamaEoraQuantLinear
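FORMAT_DICT is the lookup that decides which backends may serve which checkpoint formats, and select_quant_linear then maps either Marlin backend onto the same MarlinQuantLinear class. A hedged sketch of querying that table (the helper is illustrative, not a library function):

from gptqmodel.quantization import FORMAT
from gptqmodel.utils.backend import BACKEND
from gptqmodel.utils.importer import FORMAT_DICT

def backend_supports_format(backend: BACKEND, fmt: FORMAT) -> bool:
    # A backend is considered usable only if FORMAT_DICT lists it for the format.
    return backend in FORMAT_DICT.get(fmt, [])

# Per this commit, both Marlin variants can load FORMAT.MARLIN checkpoints.
assert backend_supports_format(BACKEND.MARLIN, FORMAT.MARLIN)
assert backend_supports_format(BACKEND.MARLIN_FP16, FORMAT.MARLIN)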

gptqmodel/utils/marlin.py

Lines changed: 1 addition & 1 deletion
@@ -40,4 +40,4 @@ def _validate_marlin_compatibility(cfg: QuantizeConfig, throw_error: bool = Fals
     validate, err = MarlinQuantLinear.validate(bits=cfg.bits, group_size=cfg.group_size, desc_act=cfg.desc_act, sym=cfg.sym, pack_dtype=cfg.pack_dtype, dynamic=cfg.dynamic)
     if throw_error and err is not None:
         raise ValueError(err)
-    return err
+    return err

(whitespace-only change: the removed and added lines are identical, consistent with adding a trailing newline at end of file)

gptqmodel/utils/model.py

Lines changed: 3 additions & 0 deletions
@@ -226,6 +226,7 @@ def make_quant(
             device=device,
             lm_head_name=lm_head_name,
             pack_dtype=pack_dtype,
+            backend=backend,
             adapter=qcfg.adapter,
         )
         logger.info(f"Kernel: selected -> `{linear_cls.__name__}`.")
@@ -252,6 +253,7 @@ def create_quant_layer(
         device: DEVICE,
         lm_head_name: str,
         pack_dtype: torch.dtype,
+        backend: BACKEND,
         adapter: Optional[Adapter] = None,
) -> Type[BaseQuantLinear]:
     if isinstance(module, linear_cls):
@@ -334,6 +336,7 @@ def create_quant_layer(
             #weight_dtype=submodule.qweight.dtype if isinstance(submodule, BaseQuantLinear) else submodule.weight.dtype,
             name=name,
             lm_head_name=lm_head_name,
+            backend=backend,
             adapter=adapter,
         )
         new_layer.device = ori_layer_device

0 commit comments