
Commit 363b28c

fix v2 to v1 missed logic bypass (#1347)

Signed-off-by: Qubitium <[email protected]>
1 parent f91cb97

File tree: 2 files changed, +21 −13 lines

  gptqmodel/models/loader.py
  gptqmodel/utils/model.py

gptqmodel/models/loader.py

Lines changed: 5 additions & 10 deletions
@@ -493,16 +493,11 @@ def skip(*args, **kwargs):
                 f"Format: Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}"
             )
 
-        # skip v1 to v2 conversion for kernels that can only operate on sym=True (gptq_v1)
-        if preload_qlinear_kernel not in [IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear]:
-            t = time.time()
-            logger.info(f"Format: Converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to internal `{FORMAT.GPTQ_V2}`.")
-            model = convert_gptq_v1_to_v2_format(
-                model,
-                cfg=qcfg,
-                qlinear_kernel=preload_qlinear_kernel,
-            )
-            logger.info(f"Format: Conversion complete: {time.time() - t}s")
+        model = convert_gptq_v1_to_v2_format(
+            model,
+            cfg=qcfg,
+            qlinear_kernel=preload_qlinear_kernel,
+        )
 
         load_checkpoint_in_model = False
         qcfg.runtime_format = FORMAT.GPTQ_V2
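
Net effect of this hunk: the loader now calls convert_gptq_v1_to_v2_format unconditionally and trusts the helper to bypass kernels that must stay on the gptq_v1 (sym=True) layout. A minimal, self-contained sketch of that call-site simplification follows; the kernel classes and the load_call_site wrapper are illustrative stand-ins, not the real gptqmodel implementations.

class IPEXQuantLinear: ...
class MarlinQuantLinear: ...
class ExllamaEoraQuantLinear: ...

# Kernels that consume the gptq_v1 layout directly, so no v1 -> v2 repacking
# should happen for them (stand-in list mirroring the diff).
GPTQ_V1_ONLY_KERNELS = (IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear)

def convert_gptq_v1_to_v2_format(model, cfg, qlinear_kernel):
    # The compatibility guard lives inside the helper, so every caller gets it.
    if qlinear_kernel in GPTQ_V1_ONLY_KERNELS:
        return model  # bypass: leave checkpoint weights in the v1 layout
    # ... real qzeros/qweight repacking would happen here ...
    return model

def load_call_site(model, qcfg, preload_qlinear_kernel):
    # The loader no longer re-implements the kernel check before calling.
    return convert_gptq_v1_to_v2_format(model, cfg=qcfg, qlinear_kernel=preload_qlinear_kernel)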

gptqmodel/utils/model.py

Lines changed: 16 additions & 3 deletions
@@ -33,6 +33,8 @@
 import torch
 import torch.nn as nn
 import transformers
+from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear
+from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear
 from huggingface_hub import HfApi, hf_hub_download
 from packaging import version
 from transformers import AutoConfig, PretrainedConfig
@@ -353,13 +355,22 @@ def hf_convert_gptq_v1_to_v2_format(
     else:
         return model, False
 
+# Optionally convert weight from gptq_v1 to v2 format if Kernel is compatible with v2
 def convert_gptq_v1_to_v2_format(
     model,
     cfg: QuantizeConfig,
     qlinear_kernel: Type[BaseQuantLinear],
 ):
+    # skip v2 to v1 conversion for gptq_v1 kernels
+    if qlinear_kernel in [IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear]:
+        return model
+
     # Limit thread usage to avoid auto-parallizataion regression
     with tctl.threadpool_limits(limits=1):
+        t = time.time()
+        logger.info(
+            f"Format: Converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to internal `{FORMAT.GPTQ_V2}`.")
+
         for _, submodule in model.named_modules():
             # v1 checkpoint format used to do `qzeros = qzeros -= 1` before serialization, thus the
             # additions here do not overflow.
@@ -438,6 +449,8 @@ def convert_gptq_v1_to_v2_format(
             else:
                 raise NotImplementedError("Only 2,3,4,8 bits are supported.")
 
+    logger.info(f"Format: Conversion complete: {time.time() - t}s")
+
     return model
 
 
@@ -457,14 +470,14 @@ def hf_convert_gptq_v2_to_v1_format(
     else:
         return model, False
 
-
+# Optionally convert weight from gptq_v2 to v1 export format if Kernel is compatible with v2
 def convert_gptq_v2_to_v1_format(
     model,
     quantize_config: QuantizeConfig,
     qlinear_kernel: Type[BaseQuantLinear],
 ):
-    # skip v2 to v1 conversion for ipex
-    if qlinear_kernel == IPEXQuantLinear:
+    # skip v2 to v1 conversion for gptq_v1 kernels
+    if qlinear_kernel in [IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear]:
         return model
 
     # Limit thread usage to avoid auto-parallizataion regression
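
This is the bypass the commit title refers to: before this change only IPEXQuantLinear was skipped on the v2 -> v1 export path, while the v1 -> v2 load path already skipped Marlin and Exllama-EoRA as well. A short sketch of the now-symmetric guard, using stand-in class names rather than the real gptqmodel code:

class IPEXQuantLinear: ...
class MarlinQuantLinear: ...
class ExllamaEoraQuantLinear: ...

GPTQ_V1_ONLY_KERNELS = (IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear)

def convert_gptq_v2_to_v1_format(model, quantize_config, qlinear_kernel):
    # Same kernel list as the v1 -> v2 load path: these kernels were never
    # repacked to v2 at load time, so the export path must not apply the
    # v2 -> v1 shift to them either.
    if qlinear_kernel in GPTQ_V1_ONLY_KERNELS:
        return model
    # ... real v2 -> v1 repacking for export would happen here ...
    return model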
