Commit 2acc761

fix bitblas loading regression (#1324)
Signed-off-by: Qubitium <[email protected]>
1 parent 26d6911 commit 2acc761
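
For context, a minimal usage sketch of the load path this commit repairs. The checkpoint path below is a placeholder, and the top-level `GPTQModel` / `BACKEND` imports are assumed from the public API that the hint string in loader.py references (`model = GPTQModel.load(..., backend=BACKEND.MARLIN)`), here pointed at the BitBLAS kernel instead:

```python
# Sketch only: load a BitBLAS-format GPTQ checkpoint and pin the BitBLAS kernel,
# i.e. the code path whose loading regression this commit fixes.
from gptqmodel import BACKEND, GPTQModel  # assumed top-level exports

model = GPTQModel.load(
    "path/to/bitblas-quantized-model",  # placeholder checkpoint path
    backend=BACKEND.BITBLAS,            # same enum value loader.py/bitblas.py now pass through
)
```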

3 files changed: +23 -22 lines changed


gptqmodel/models/loader.py

Lines changed: 21 additions & 17 deletions
```diff
@@ -24,6 +24,9 @@
 import torch
 import transformers
 
+from ..nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear
+from ..nn_modules.qlinear.marlin import MarlinQuantLinear
+
 if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
     try:
         from modelscope import snapshot_download
@@ -342,15 +345,14 @@ def from_quantized(
                 raise TypeError(f"FORMAT.MARLIN requires BACKEND.AUTO or BACKEND.MARLIN: actual = `{backend}`.")
             backend = BACKEND.MARLIN
 
-        marlin_compatible = False if backend == BACKEND.IPEX else _validate_marlin_device_support()
-
-        # check for marlin compat for cuda device onnly
-        if backend not in [BACKEND.MARLIN, BACKEND.MARLIN_FP16] and device == DEVICE.CUDA:
-            unsupported = _validate_marlin_compatibility(qcfg)
-            if unsupported is None and marlin_compatible:
-                logger.info(
-                    "Hint: Model is compatible with the Marlin kernel. Marlin is optimized for batched inference on Nvidia GPU: `model = GPTQModel.load(..., backend=BACKEND.MARLIN)`."
-                )
+        # marlin_compatible = False if backend == BACKEND.IPEX else _validate_marlin_device_support()
+        # check for marlin compat for cuda device only
+        # if backend not in [BACKEND.MARLIN, BACKEND.MARLIN_FP16] and device == DEVICE.CUDA:
+        #     unsupported = _validate_marlin_compatibility(qcfg)
+        #     if unsupported is None and marlin_compatible:
+        #         logger.info(
+        #             "Hint: Model is compatible with the Marlin kernel. Marlin is optimized for batched inference on Nvidia GPU: `model = GPTQModel.load(..., backend=BACKEND.MARLIN)`."
+        #         )
 
         if qcfg.format == FORMAT.BITBLAS:
             # format bitblas requires bitblas kernel
@@ -491,14 +493,16 @@ def skip(*args, **kwargs):
                     f"Format: Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}"
                 )
 
-            t = time.time()
-            logger.info(f"Format: Converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to internal `{FORMAT.GPTQ_V2}`.")
-            model = convert_gptq_v1_to_v2_format(
-                model,
-                cfg=qcfg,
-                qlinear_kernel=preload_qlinear_kernel,
-            )
-            logger.info(f"Format: Conversion complete: {time.time() - t}s")
+            # skip v1 to v2 conversion for kernels that can only operate on sym=True (gptq_v1)
+            if preload_qlinear_kernel not in [IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear]:
+                t = time.time()
+                logger.info(f"Format: Converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to internal `{FORMAT.GPTQ_V2}`.")
+                model = convert_gptq_v1_to_v2_format(
+                    model,
+                    cfg=qcfg,
+                    qlinear_kernel=preload_qlinear_kernel,
+                )
+                logger.info(f"Format: Conversion complete: {time.time() - t}s")
 
             load_checkpoint_in_model = False
             qcfg.runtime_format = FORMAT.GPTQ_V2
```

gptqmodel/nn_modules/qlinear/bitblas.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -25,6 +25,7 @@
 import torch.nn as nn
 from gptqmodel.adapter.adapter import Adapter, Lora
 from gptqmodel.nn_modules.qlinear import PackableQuantLinear
+from gptqmodel.utils import BACKEND
 
 from ...models._const import DEVICE, PLATFORM
 from ...utils.logger import setup_logger
@@ -140,6 +141,7 @@ def __init__(
             out_features=out_features,
             bias=bias,
             pack_dtype=pack_dtype,
+            backend=BACKEND.BITBLAS,
             adapter=adapter,
             register_buffers=False,
             **kwargs)
```

gptqmodel/utils/model.py

Lines changed: 0 additions & 5 deletions
```diff
@@ -357,16 +357,11 @@ def hf_convert_gptq_v1_to_v2_format(
     else:
         return model, False
 
-# TODO: FIXME: the v1 -> v2 zeropoint offsets are assuming INT32 pack_dtype
 def convert_gptq_v1_to_v2_format(
     model,
     cfg: QuantizeConfig,
     qlinear_kernel: Type[BaseQuantLinear],
 ):
-    # skip v1 to v2 conversion for kernels that can only operate on sym=True (gptq_v1)
-    if qlinear_kernel in [IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear]:
-        return model
-
     # Limit thread usage to avoid auto-parallizataion regression
     with tctl.threadpool_limits(limits=1):
         for _, submodule in model.named_modules():
```
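
Taken together, the loader.py and model.py hunks move the "sym=True-only kernel" check out of `convert_gptq_v1_to_v2_format()` and up to its call site in the loader. A condensed, illustrative restatement of that call-site guard (names as imported at the top of loader.py; not additional code from this commit):

```python
# Kernels that consume the on-disk GPTQ (v1) layout directly keep it;
# everything else is rewritten to the internal GPTQ_V2 layout at load time.
if preload_qlinear_kernel not in [IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear]:
    model = convert_gptq_v1_to_v2_format(
        model,
        cfg=qcfg,
        qlinear_kernel=preload_qlinear_kernel,
    )
```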
