@@ -447,7 +447,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.device_config: DeviceConfig = vllm_config.device_config
         self.model_config: ModelConfig = vllm_config.model_config
         self.parallel_config: ParallelConfig = vllm_config.parallel_config
-        self.quant_config: QuantizationConfig = vllm_config.quant_config
+        self.quant_config: Optional[
+            QuantizationConfig] = vllm_config.quant_config
 
         self.pp_group = get_pp_group()
         self.pp_size = self.pp_group.world_size
@@ -456,7 +457,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
 
         # Weights to skip in `self.load_weights`
         self.skip_prefixes: list[str] = []
+        """Skip loading weights whose qualname starts with these prefixes."""
         self.skip_substrs: list[str] = []
+        """Skip loading weights whose qualname contains these substrings."""
+        self.ignore_unexpected_prefixes: list[str] = []
+        """Ignore unexpected weights whose qualname starts with these prefixes.
+        """
+        self.ignore_unexpected_suffixes: list[str] = []
+        """Ignore unexpected weights whose qualname ends with these suffixes."""
+
+        # Skip loading extra bias for GPTQ models.
+        if self.quant_config and "gptq" in self.quant_config.get_name():
+            self.ignore_unexpected_suffixes.append(".bias")
 
         # Set correct attn and init on "meta" to delay allocating GPU tensors
         # TODO: @raushan, use the public `model.set_attn_implementation()`
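
The loader internals are not part of this diff; purely to illustrate the matching rules the new docstrings describe (prefix, substring, and suffix matching against weight qualnames), here is a small self-contained sketch. The helper names are hypothetical, not vLLM APIs.

# Illustrative sketch only: mirrors the matching semantics described by the
# docstrings above, not the actual AutoWeightsLoader implementation.
def should_skip(name: str, skip_prefixes: list[str],
                skip_substrs: list[str]) -> bool:
    """Weights matching these rules are not loaded into the model."""
    return (any(name.startswith(p) for p in skip_prefixes)
            or any(s in name for s in skip_substrs))

def can_ignore_unexpected(name: str, prefixes: list[str],
                          suffixes: list[str]) -> bool:
    """Unexpected checkpoint weights matching these rules do not raise."""
    return (any(name.startswith(p) for p in prefixes)
            or any(name.endswith(s) for s in suffixes))

# With the GPTQ rule above, an extra ".bias" tensor in the checkpoint would be
# ignored instead of failing the load:
assert can_ignore_unexpected("model.layers.0.mlp.down_proj.bias", [], [".bias"])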
@@ -563,9 +575,7 @@ def tensor_parallel(self):
             raise ValueError(
                 f"{type(self.model)} does not support tensor parallel. {tip}")
 
-        def _tensor_parallel(module: nn.Module,
-                             prefix: str = "",
-                             tp_plan=None):
+        def _tensor_parallel(module: nn.Module, prefix: str, tp_plan=None):
             tp_plan = tp_plan or {}
 
             # If the current module is a PreTrainedModel, set the tp_plan for
@@ -597,7 +607,7 @@ def _tensor_parallel(module: nn.Module,
                                  prefix=qual_name,
                                  tp_plan=tp_plan)
 
-        _tensor_parallel(self.model)
+        _tensor_parallel(self.model, prefix="model")
 
     def create_attention_instances(
         self,
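
The hunks above make `_tensor_parallel` take an explicit prefix and seed the recursion with `prefix="model"` at the root, so children are matched against fully qualified names. As a minimal standalone sketch of that qualified-name recursion (the `walk` helper is hypothetical, not vLLM code):

import torch.nn as nn

def walk(module: nn.Module, prefix: str) -> list[str]:
    # Recursively collect qualified child names, joining prefix and child name
    # the same way the root call with prefix="model" would.
    names = []
    for child_name, child in module.named_children():
        qual_name = f"{prefix}.{child_name}" if prefix else child_name
        names.append(qual_name)
        names.extend(walk(child, prefix=qual_name))
    return names

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
print(walk(model, prefix="model"))  # ['model.0', 'model.1']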
@@ -696,6 +706,8 @@ def load_weights(self, weights: Iterable[tuple[str,
             self,
             skip_prefixes=self.skip_prefixes,
             skip_substrs=self.skip_substrs,
+            ignore_unexpected_prefixes=self.ignore_unexpected_prefixes,
+            ignore_unexpected_suffixes=self.ignore_unexpected_suffixes,
         )
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 