diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 7810089a05c7..3cbd38dbd46a 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -11,6 +11,7 @@
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import CacheConfig, get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -97,7 +98,8 @@ def __init__(
 
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
-        if quant_method is not None:
+        if quant_method is not None and not isinstance(
+                quant_method, UnquantizedLinearMethod):
             assert isinstance(quant_method, BaseKVCacheMethod)
             # TODO (mgoin): kv cache dtype should be specified in the FP8
             # checkpoint config and become the "auto" behavior
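For context on the change: previously, any non-None result from quant_config.get_quant_method() on an Attention layer had to be a BaseKVCacheMethod, which asserted when a quantization config quantizes linear layers but returns UnquantizedLinearMethod for the attention module itself. The guard above skips the KV-cache path in that case. Below is a minimal, standalone sketch of that control flow; the classes and the init_attention_quant helper are simplified stand-ins for illustration, not the real vLLM types or API.

```python
# Sketch of the guard added in this diff. The classes below are illustrative
# stand-ins for vLLM's UnquantizedLinearMethod / BaseKVCacheMethod, not the
# real implementations.

class UnquantizedLinearMethod:
    """Stand-in: the layer is left unquantized by the quant config."""


class BaseKVCacheMethod:
    """Stand-in: a KV-cache quantization method (e.g. FP8 KV cache)."""

    def apply(self, layer_name: str) -> None:
        print(f"KV-cache quantization wired into {layer_name}")


def init_attention_quant(quant_method) -> None:
    # New behavior: an UnquantizedLinearMethod returned for the attention
    # layer is simply skipped instead of tripping the assertion below.
    if quant_method is not None and not isinstance(quant_method,
                                                   UnquantizedLinearMethod):
        assert isinstance(quant_method, BaseKVCacheMethod)
        quant_method.apply("attn")
    else:
        print("attention layer stays unquantized; no KV-cache method applied")


init_attention_quant(BaseKVCacheMethod())        # KV-cache method is applied
init_attention_quant(UnquantizedLinearMethod())  # now skipped, no AssertionError
init_attention_quant(None)                       # no quant config at all
```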