@@ -51,7 +51,9 @@
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization import (GPTQConfig,
+                                                     GPTQMarlinConfig,
+                                                     QuantizationConfig)
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -982,7 +984,7 @@ def __init__(self,
         self.visual = Qwen2VisionTransformer(
             config.vision_config,
             norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-            quant_config=quant_config,
+            quant_config=self._maybe_ignore_quant_config(quant_config),
             prefix="visual",
         )
 
@@ -1008,6 +1010,14 @@ def __init__(self,
             make_empty_intermediate_tensors_factory(
                 ["hidden_states", "residual"], config.hidden_size))
 
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        # GPTQ configs do not have a list of ignored modules, however AutoGPTQ
+        # seems to avoid vision encoder sections for some models.
+        # See: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
+
     def _validate_and_reshape_mm_tensor(self,
                                         mm_input: Union[torch.Tensor,
                                                         List[torch.Tensor]],
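
As a rough standalone sketch of the behavior this diff introduces: the stub `*Config` classes below are stand-ins for vLLM's real quantization configs (whose checkpoint-specific constructor arguments are not reproduced here), and the hypothetical module-level `maybe_ignore_quant_config` mirrors only the dispatch logic of the new method.

from typing import Optional


class QuantizationConfig:
    """Stand-in for vLLM's QuantizationConfig base class."""


class GPTQConfig(QuantizationConfig):
    """Stand-in for vLLM's GPTQConfig (real constructor args omitted)."""


class GPTQMarlinConfig(QuantizationConfig):
    """Stand-in for vLLM's GPTQMarlinConfig (real constructor args omitted)."""


def maybe_ignore_quant_config(
        quant_config: Optional[QuantizationConfig],
) -> Optional[QuantizationConfig]:
    # Mirrors the diff: GPTQ configs carry no "ignored modules" list, so the
    # model code drops the config entirely for the vision tower, which
    # AutoGPTQ checkpoints appear to leave unquantized.
    if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
        return None
    return quant_config


# Effect: for GPTQ checkpoints the vision tower is built with
# quant_config=None (full precision), while any other config, or no
# config at all, passes through untouched.
assert maybe_ignore_quant_config(GPTQConfig()) is None
assert maybe_ignore_quant_config(GPTQMarlinConfig()) is None
other = QuantizationConfig()
assert maybe_ignore_quant_config(other) is other
assert maybe_ignore_quant_config(None) is None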