From ccc4227ebb3592dd74afe7e3ec7a41ef16a06300 Mon Sep 17 00:00:00 2001 From: "jiang1.li" Date: Mon, 30 Dec 2024 07:40:10 +0000 Subject: [PATCH 1/2] fix cpu ci Signed-off-by: jiang1.li --- vllm/model_executor/layers/quantization/fp8.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 7f779ac8d3b3..f0ab2cf1d262 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,8 +15,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_w8a8_block_fp8_linear) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -339,6 +337,11 @@ def apply(self, if self.block_quant: assert self.quant_config.weight_block_size is not None + + # Note: lazy import to avoid triton import error. + from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_w8a8_block_fp8_linear) + return apply_w8a8_block_fp8_linear( input=x, weight=layer.weight, From b368b245f3ab1a8137102b45e74d1c11c45c6957 Mon Sep 17 00:00:00 2001 From: "jiang1.li" Date: Mon, 30 Dec 2024 07:57:49 +0000 Subject: [PATCH 2/2] fix format Signed-off-by: jiang1.li --- vllm/model_executor/layers/quantization/fp8.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index f0ab2cf1d262..2fe22903a385 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -335,13 +335,11 @@ def apply(self, size_k=layer.input_size_per_partition, bias=bias) + # Note: lazy import to avoid triton import error. + from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_w8a8_block_fp8_linear) if self.block_quant: assert self.quant_config.weight_block_size is not None - - # Note: lazy import to avoid triton import error. - from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_w8a8_block_fp8_linear) - return apply_w8a8_block_fp8_linear( input=x, weight=layer.weight,