@@ -33,6 +33,8 @@
 import torch
 import torch.nn as nn
 import transformers
+from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear
+from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear
 from huggingface_hub import HfApi, hf_hub_download
 from packaging import version
 from transformers import AutoConfig, PretrainedConfig
@@ -353,13 +355,22 @@ def hf_convert_gptq_v1_to_v2_format(
     else:
         return model, False
 
+# Optionally convert weights from gptq_v1 to the v2 format if the kernel is compatible with v2
 def convert_gptq_v1_to_v2_format(
     model,
     cfg: QuantizeConfig,
     qlinear_kernel: Type[BaseQuantLinear],
 ):
+    # skip v1 to v2 conversion for gptq_v1 kernels
+    if qlinear_kernel in [IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear]:
+        return model
+
     # Limit thread usage to avoid auto-parallelization regression
     with tctl.threadpool_limits(limits=1):
+        t = time.time()
+        logger.info(
+            f"Format: Converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to internal `{FORMAT.GPTQ_V2}`.")
+
         for _, submodule in model.named_modules():
             # v1 checkpoint format used to do `qzeros -= 1` before serialization, thus the
             # additions here do not overflow.
@@ -438,6 +449,8 @@ def convert_gptq_v1_to_v2_format(
             else:
                 raise NotImplementedError("Only 2,3,4,8 bits are supported.")
 
+    logger.info(f"Format: Conversion complete: {time.time() - t}s")
+
     return model
 
 
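The bit-width dispatch elided between these hunks packs multiple zero points into each int32 of `qzeros`. A minimal sketch of the 4-bit case, assuming int32-packed tensors (illustrative only, not the repo's exact code; the helper name is hypothetical):

import torch

def qzeros_v1_to_v2_4bit(qzeros: torch.Tensor) -> torch.Tensor:
    # Eight 4-bit zero points per int32; adding 0x11111111 increments
    # every nibble by one. Because v1 serialized `qzeros -= 1`, no nibble
    # is at its maximum of 15, so the addition never carries across nibbles.
    assert qzeros.dtype == torch.int32
    return qzeros + 0x11111111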
@@ -457,14 +470,14 @@ def hf_convert_gptq_v2_to_v1_format(
     else:
         return model, False
 
-
+# Optionally convert weights from gptq_v2 back to the v1 export format if the kernel is compatible with v2
 def convert_gptq_v2_to_v1_format(
     model,
     quantize_config: QuantizeConfig,
     qlinear_kernel: Type[BaseQuantLinear],
 ):
-    # skip v2 to v1 conversion for ipex
-    if qlinear_kernel == IPEXQuantLinear:
+    # skip v2 to v1 conversion for gptq_v1 kernels
+    if qlinear_kernel in [IPEXQuantLinear, MarlinQuantLinear, ExllamaEoraQuantLinear]:
         return model
 
     # Limit thread usage to avoid auto-parallelization regression
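On the export side the fixup is simply inverted for kernels that keep weights in the internal v2 layout, while the v1-native kernels above are returned untouched. A hedged sketch of the 4-bit inverse, mirroring the addition shown earlier (the helper name is again hypothetical):

import torch

def qzeros_v2_to_v1_4bit(qzeros: torch.Tensor) -> torch.Tensor:
    # Undo the load-time fixup: subtract one from each packed 4-bit
    # zero point so the checkpoint matches the gptq_v1 convention.
    assert qzeros.dtype == torch.int32
    return qzeros - 0x11111111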