From d60537f3b8347184f6e3f64034d84be397c23cdd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 18 Dec 2024 10:33:33 +0000 Subject: [PATCH 01/20] Move `print_*_once` from utils to logger Signed-off-by: DarkLight1337 --- vllm/attention/backends/torch_sdpa.py | 9 +++-- vllm/attention/backends/xformers.py | 8 ++-- vllm/config.py | 9 ++--- vllm/entrypoints/chat_utils.py | 7 ++-- vllm/inputs/preprocess.py | 20 +++++----- vllm/inputs/registry.py | 4 +- vllm/logger.py | 39 +++++++++++++++++-- vllm/lora/punica_wrapper/punica_selector.py | 8 ++-- vllm/model_executor/custom_op.py | 3 +- .../compressed_tensors_moe.py | 8 ++-- .../model_executor/layers/quantization/fp8.py | 5 +-- .../layers/quantization/kv_cache.py | 6 ++- .../quantization/utils/marlin_utils_fp8.py | 6 ++- .../model_loader/weight_utils.py | 7 ++-- vllm/model_executor/models/chameleon.py | 6 ++- vllm/model_executor/models/olmoe.py | 6 ++- vllm/model_executor/models/qwen2_moe.py | 6 ++- vllm/model_executor/models/utils.py | 4 +- vllm/utils.py | 12 ------ 19 files changed, 105 insertions(+), 68 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 0cff6f5952ab..d782de24fd3e 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -13,9 +13,12 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.ipex_attn import PagedAttention from vllm.attention.ops.paged_attn import PagedAttentionMetadata -from vllm.utils import make_tensor_with_pad, print_warning_once +from vllm.logger import init_logger +from vllm.utils import make_tensor_with_pad from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder +logger = init_logger(__name__) + class TorchSDPABackend(AttentionBackend): @@ -395,8 +398,8 @@ def __init__( raise ValueError( "Torch SPDA does not support block-sparse attention.") if logits_soft_cap is not None: - print_warning_once("Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 3e59b3603d2c..2c4997ea0a93 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -17,7 +17,9 @@ is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) -from vllm.utils import print_warning_once +from vllm.logger import init_logger + +logger = init_logger(__name__) class XFormersBackend(AttentionBackend): @@ -384,8 +386,8 @@ def __init__( raise ValueError( "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: - print_warning_once("XFormers does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("XFormers does not support logits soft cap. 
" + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/config.py b/vllm/config.py index 307cf9c8d5b2..07e3d34d47f6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -29,8 +29,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, - get_cpu_memory, print_warning_once, random_uuid, - resolve_obj_by_qualname) + get_cpu_memory, random_uuid, resolve_obj_by_qualname) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -302,7 +301,7 @@ def __init__(self, sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) - print_warning_once( + logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " "XFORMERS backend. Disabling sliding window and capping " @@ -2639,7 +2638,7 @@ def uuid(self): def model_post_init(self, __context: Any) -> None: if not self.enable_reshape and self.enable_fusion: - print_warning_once( + logger.warning_once( "Fusion enabled but reshape elimination disabled." "RMSNorm + quant (fp8) fusion might not work") @@ -3018,7 +3017,7 @@ def __post_init__(self): self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): - print_warning_once( + logger.warning_once( "Turing devices tensor cores do not support float32 matmul. " "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3df08c740d65..cc9936741a45 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -37,7 +37,6 @@ get_and_parse_audio, get_and_parse_image, get_and_parse_video) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -1000,14 +999,14 @@ def apply_mistral_chat_template( **kwargs: Any, ) -> List[int]: if chat_template is not None: - print_warning_once( + logger.warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") if "add_generation_prompt" in kwargs: - print_warning_once( + logger.warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " "so it will be ignored.") if "continue_final_message" in kwargs: - print_warning_once( + logger.warning_once( "'continue_final_message' is not supported for mistral tokenizer, " "so it will be ignored.") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3d606817e90a..b4ec89db5d73 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -10,7 +10,6 @@ from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.utils import print_info_once, print_warning_once from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, PromptType, SingletonInputs, SingletonPrompt, token_inputs) @@ -68,21 +67,24 @@ def get_decoder_start_token_id(self) -> Optional[int]: ''' if not self.model_config.is_encoder_decoder: - print_warning_once("Using None for decoder start token id because " - "this is not an encoder/decoder model.") + logger.warning_once( + 
"Using None for decoder start token id because " + "this is not an encoder/decoder model.") return None if (self.model_config is None or self.model_config.hf_config is None): - print_warning_once("Using None for decoder start token id because " - "model config is not available.") + logger.warning_once( + "Using None for decoder start token id because " + "model config is not available.") return None dec_start_token_id = getattr(self.model_config.hf_config, 'decoder_start_token_id', None) if dec_start_token_id is None: - print_warning_once("Falling back on for decoder start token " - "id because decoder start token id is not " - "available.") + logger.warning_once( + "Falling back on for decoder start token " + "id because decoder start token id is not " + "available.") dec_start_token_id = self.get_bos_token_id() return dec_start_token_id @@ -212,7 +214,7 @@ def _can_process_multimodal(self) -> bool: # updated to use the new multi-modal processor can_process_multimodal = self.mm_registry.has_processor(model_config) if not can_process_multimodal: - print_info_once( + logger.info_once( "Your model uses the legacy input pipeline instead of the new " "multi-modal processor. Please note that the legacy pipeline " "will be removed in a future release. For more details, see: " diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 0b85484c4871..d767b56ff60d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -12,7 +12,7 @@ from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, - print_warning_once, resolve_mm_processor_kwargs) + resolve_mm_processor_kwargs) from .data import ProcessorInputs, SingletonInputs from .parse import is_encoder_decoder_inputs @@ -316,7 +316,7 @@ def dummy_data_for_profiling( num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: - print_warning_once( + logger.warning_once( f"Expected at least {seq_len} dummy encoder tokens for " f"profiling, but found {len(num_tokens)} tokens instead.") else: diff --git a/vllm/logger.py b/vllm/logger.py index 538db0dcf19a..3b6258ca173e 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -4,7 +4,7 @@ import logging import os import sys -from functools import partial +from functools import lru_cache, partial from logging import Logger from logging.config import dictConfig from os import path @@ -49,6 +49,35 @@ } +@lru_cache +def _print_info_once(logger: Logger, msg: str) -> None: + # Set the stacklevel to 4 to print the original caller's line info + logger.info(msg, stacklevel=4) + + +@lru_cache +def _print_warning_once(logger: Logger, msg: str) -> None: + # Set the stacklevel to 4 to print the original caller's line info + logger.warning(msg, stacklevel=4) + + +class VllmLogger(Logger): + + def info_once(self, msg: str) -> None: + """ + As :meth:`info`, but subsequent calls with the same message + are silently dropped. + """ + _print_info_once(self, msg) + + def warning_once(self, msg: str) -> None: + """ + As :meth:`warning`, but subsequent calls with the same message + are silently dropped. 
+ """ + _print_warning_once(self, msg) + + def _configure_vllm_root_logger() -> None: logging_config: Dict = {} @@ -83,13 +112,17 @@ def _configure_vllm_root_logger() -> None: if logging_config: dictConfig(logging_config) + logging.setLoggerClass(VllmLogger) + -def init_logger(name: str) -> Logger: +def init_logger(name: str) -> VllmLogger: """The main purpose of this function is to ensure that loggers are retrieved in such a way that we can be sure the root vllm logger has already been configured.""" - return logging.getLogger(name) + logger = logging.getLogger(name) + assert isinstance(logger, VllmLogger) + return logger # The root logger is initialized when the module is imported. diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index cd64878d95ae..9791d492d8e4 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -1,19 +1,21 @@ +from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_info_once from .punica_base import PunicaWrapperBase +logger = init_logger(__name__) + def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: if current_platform.is_cuda_alike(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - print_info_once("Using PunicaWrapperGPU.") + logger.info_once("Using PunicaWrapperGPU.") return PunicaWrapperGPU(*args, **kwargs) elif current_platform.is_hpu(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU - print_info_once("Using PunicaWrapperHPU.") + logger.info_once("Using PunicaWrapperHPU.") return PunicaWrapperHPU(*args, **kwargs) else: raise NotImplementedError diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index fddc8bad09ef..401606e8c76f 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -5,7 +5,6 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -91,7 +90,7 @@ def enabled(cls) -> bool: compilation_config = get_current_vllm_config().compilation_config custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): - print_warning_once( + logger.warning_once( f"Custom op {cls.__name__} was not registered, " f"which means it won't appear in the op registry. 
" f"It will be enabled/disabled based on the global settings.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index dad04017d321..712dc486f236 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -8,6 +8,7 @@ import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( @@ -16,7 +17,8 @@ all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import print_warning_once + +logger = init_logger(__name__) class GPTQMarlinState(Enum): @@ -142,10 +144,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " - "for each layer. ") + "for each layer.") layer.w13_input_scale = torch.nn.Parameter( layer.w13_input_scale.max(), requires_grad=False) layer.w2_input_scale = torch.nn.Parameter( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 978e727bc7cb..259ff1997f37 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -26,7 +26,6 @@ PerTensorScaleParameter) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import print_warning_once ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -408,10 +407,10 @@ def process_weights_after_loading(self, layer: Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " - "for each layer. ") + "for each layer.") layer.w13_input_scale = torch.nn.Parameter( layer.w13_input_scale.max(), requires_grad=False) layer.w2_input_scale = torch.nn.Parameter( diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index d79536d196b9..a74f5415c8a5 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -1,8 +1,10 @@ import torch +from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) -from vllm.utils import print_warning_once + +logger = init_logger(__name__) class BaseKVCacheMethod(QuantizeMethodBase): @@ -67,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer._v_scale = v_scale if (layer._k_scale == 1.0 and layer._v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype): - print_warning_once( + logger.warning_once( "Using KV cache scaling factor 1.0 for fp8_e4m3. This " "may cause accuracy issues. 
Please make sure k/v_scale " "scaling factors are available in the fp8 checkpoint.") diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 8b3dfaae971c..245fe9238e42 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -3,11 +3,13 @@ import torch import vllm._custom_ops as ops +from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_warning_once from .marlin_utils import marlin_make_workspace, marlin_permute_scales +logger = init_logger(__name__) + def is_fp8_marlin_supported(): return current_platform.has_device_capability(80) @@ -47,7 +49,7 @@ def apply_fp8_marlin_linear( def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, strategy: str = "tensor") -> None: - print_warning_once( + logger.warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. Weight-only FP8 compression will " "be used leveraging the Marlin kernel. This may degrade " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9488d54edf36..b3ac10370293 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -25,7 +25,6 @@ get_quantization_config) from vllm.model_executor.layers.quantization.schema import QuantParamSchema from vllm.platforms import current_platform -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -647,7 +646,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: None: If the remapped name is not found in params_dict. """ if name.endswith(".kv_scale"): - print_warning_once( + logger.warning_once( "DEPRECATED. Found kv_scale in the checkpoint. " "This format is deprecated in favor of separate k_scale and " "v_scale tensors and will be removed in a future release. " @@ -656,7 +655,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: # NOTE: we remap the deprecated kv_scale to k_scale remapped_name = name.replace(".kv_scale", ".attn.k_scale") if remapped_name not in params_dict: - print_warning_once( + logger.warning_once( f"Found kv_scale in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). kv_scale is " @@ -669,7 +668,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: if name.endswith(scale_name): remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: - print_warning_once( + logger.warning_once( f"Found {scale_name} in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). 
{scale_name} is " diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a40c321ce0a5..22dcba94672d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -13,6 +13,7 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -34,13 +35,14 @@ consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors, SequenceData -from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) +logger = init_logger(__name__) + # These configs are not part of the model config but the preprocessor # and processor files, so we hardcode them in the model file for now. CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 @@ -1123,7 +1125,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint (e.g. " f"{name}), but not found the expected name in " f"the model (e.g. {remapped_kv_scale_name}). " diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 5d9091cfb931..fbe5d1aee04b 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -20,6 +20,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -34,13 +35,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class OlmoeMoE(nn.Module): """A tensor-parallel MoE implementation for Olmoe that shards each expert @@ -446,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. 
{name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index ba70243c6533..95de6c21871b 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,6 +34,7 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -50,13 +51,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsPP from .utils import (extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class Qwen2MoeMLP(nn.Module): @@ -524,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 269b66806adf..30381846e4b1 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -17,7 +17,7 @@ from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_pin_memory_available, print_warning_once +from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -621,7 +621,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN else: - print_warning_once( + logger.warning_once( "Current `vllm-flash-attn` has a bug inside vision module, " "so we use xformers backend instead. 
You can run " "`pip install flash-attn` to use flash-attention backend.") diff --git a/vllm/utils.py b/vllm/utils.py index 38c7dea6d2d3..96553cb23275 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -661,18 +661,6 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache -def print_info_once(msg: str) -> None: - # Set the stacklevel to 2 to print the caller's line info - logger.info(msg, stacklevel=2) - - -@lru_cache -def print_warning_once(msg: str) -> None: - # Set the stacklevel to 2 to print the caller's line info - logger.warning(msg, stacklevel=2) - - @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: return current_platform.is_pin_memory_available() From 1677ecabcf1ca62b31d2ea9b22d000e5e6ccb607 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 19 Dec 2024 03:43:59 +0000 Subject: [PATCH 02/20] Avoid name clash Signed-off-by: DarkLight1337 --- vllm/attention/backends/torch_sdpa.py | 5 +++-- vllm/attention/backends/xformers.py | 5 +++-- vllm/config.py | 6 +++--- vllm/entrypoints/chat_utils.py | 6 +++--- vllm/inputs/preprocess.py | 6 +++--- vllm/inputs/registry.py | 2 +- vllm/logger.py | 7 +++++-- vllm/lora/punica_wrapper/punica_selector.py | 4 ++-- vllm/model_executor/custom_op.py | 2 +- .../compressed_tensors/compressed_tensors_moe.py | 2 +- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/layers/quantization/kv_cache.py | 2 +- .../layers/quantization/utils/marlin_utils_fp8.py | 2 +- vllm/model_executor/model_loader/weight_utils.py | 6 +++--- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/olmoe.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 2 +- vllm/model_executor/models/utils.py | 2 +- 18 files changed, 35 insertions(+), 30 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index d782de24fd3e..87283633cf32 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -398,8 +398,9 @@ def __init__( raise ValueError( "Torch SPDA does not support block-sparse attention.") if logits_soft_cap is not None: - logger.warning_once("Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") + logger.print_warning_once( + "Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 2c4997ea0a93..958ce41294b0 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -386,8 +386,9 @@ def __init__( raise ValueError( "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: - logger.warning_once("XFormers does not support logits soft cap. " - "Outputs may be slightly off.") + logger.print_warning_once( + "XFormers does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/config.py b/vllm/config.py index 07e3d34d47f6..61059351401f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -301,7 +301,7 @@ def __init__(self, sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) - logger.warning_once( + logger.print_warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " "XFORMERS backend. 
Disabling sliding window and capping " @@ -2638,7 +2638,7 @@ def uuid(self): def model_post_init(self, __context: Any) -> None: if not self.enable_reshape and self.enable_fusion: - logger.warning_once( + logger.print_warning_once( "Fusion enabled but reshape elimination disabled." "RMSNorm + quant (fp8) fusion might not work") @@ -3017,7 +3017,7 @@ def __post_init__(self): self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): - logger.warning_once( + logger.print_warning_once( "Turing devices tensor cores do not support float32 matmul. " "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index cc9936741a45..ff6bdb76acb2 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -999,14 +999,14 @@ def apply_mistral_chat_template( **kwargs: Any, ) -> List[int]: if chat_template is not None: - logger.warning_once( + logger.print_warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") if "add_generation_prompt" in kwargs: - logger.warning_once( + logger.print_warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " "so it will be ignored.") if "continue_final_message" in kwargs: - logger.warning_once( + logger.print_warning_once( "'continue_final_message' is not supported for mistral tokenizer, " "so it will be ignored.") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b4ec89db5d73..d3076caae7af 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,13 +67,13 @@ def get_decoder_start_token_id(self) -> Optional[int]: ''' if not self.model_config.is_encoder_decoder: - logger.warning_once( + logger.print_warning_once( "Using None for decoder start token id because " "this is not an encoder/decoder model.") return None if (self.model_config is None or self.model_config.hf_config is None): - logger.warning_once( + logger.print_warning_once( "Using None for decoder start token id because " "model config is not available.") return None @@ -81,7 +81,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: dec_start_token_id = getattr(self.model_config.hf_config, 'decoder_start_token_id', None) if dec_start_token_id is None: - logger.warning_once( + logger.print_warning_once( "Falling back on for decoder start token " "id because decoder start token id is not " "available.") diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index d767b56ff60d..d2aab36e006e 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -316,7 +316,7 @@ def dummy_data_for_profiling( num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: - logger.warning_once( + logger.print_warning_once( f"Expected at least {seq_len} dummy encoder tokens for " f"profiling, but found {len(num_tokens)} tokens instead.") else: diff --git a/vllm/logger.py b/vllm/logger.py index 3b6258ca173e..f88cab90ebbf 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -63,14 +63,17 @@ def _print_warning_once(logger: Logger, msg: str) -> None: class VllmLogger(Logger): - def info_once(self, msg: str) -> None: + # NOTE: We can't use info_once and warning_once because they + # are overwritten by transformers: + # https://github.com/huggingface/transformers/blob/2c47618c1a282f925446506d53108dc6e82d9ef0/src/transformers/utils/logging.py#L331 + 
def print_info_once(self, msg: str) -> None: """ As :meth:`info`, but subsequent calls with the same message are silently dropped. """ _print_info_once(self, msg) - def warning_once(self, msg: str) -> None: + def print_warning_once(self, msg: str) -> None: """ As :meth:`warning`, but subsequent calls with the same message are silently dropped. diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index 9791d492d8e4..de8c1bc9f903 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -10,12 +10,12 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: if current_platform.is_cuda_alike(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - logger.info_once("Using PunicaWrapperGPU.") + logger.print_info_once("Using PunicaWrapperGPU.") return PunicaWrapperGPU(*args, **kwargs) elif current_platform.is_hpu(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU - logger.info_once("Using PunicaWrapperHPU.") + logger.print_info_once("Using PunicaWrapperHPU.") return PunicaWrapperHPU(*args, **kwargs) else: raise NotImplementedError diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 401606e8c76f..70d187c2db6d 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -90,7 +90,7 @@ def enabled(cls) -> bool: compilation_config = get_current_vllm_config().compilation_config custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): - logger.warning_once( + logger.print_warning_once( f"Custom op {cls.__name__} was not registered, " f"which means it won't appear in the op registry. " f"It will be enabled/disabled based on the global settings.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 712dc486f236..6f93927fb49a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -144,7 +144,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - logger.warning_once( + logger.print_warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " "for each layer.") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 259ff1997f37..1bc77f1e912d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -407,7 +407,7 @@ def process_weights_after_loading(self, layer: Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - logger.warning_once( + logger.print_warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. 
Using the maximum across experts " "for each layer.") diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index a74f5415c8a5..b1e63cd5cc40 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -69,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer._v_scale = v_scale if (layer._k_scale == 1.0 and layer._v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype): - logger.warning_once( + logger.print_warning_once( "Using KV cache scaling factor 1.0 for fp8_e4m3. This " "may cause accuracy issues. Please make sure k/v_scale " "scaling factors are available in the fp8 checkpoint.") diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 245fe9238e42..09955307a5d4 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -49,7 +49,7 @@ def apply_fp8_marlin_linear( def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, strategy: str = "tensor") -> None: - logger.warning_once( + logger.print_warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. Weight-only FP8 compression will " "be used leveraging the Marlin kernel. This may degrade " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b3ac10370293..75c8cddc3a12 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -646,7 +646,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: None: If the remapped name is not found in params_dict. """ if name.endswith(".kv_scale"): - logger.warning_once( + logger.print_warning_once( "DEPRECATED. Found kv_scale in the checkpoint. " "This format is deprecated in favor of separate k_scale and " "v_scale tensors and will be removed in a future release. " @@ -655,7 +655,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: # NOTE: we remap the deprecated kv_scale to k_scale remapped_name = name.replace(".kv_scale", ".attn.k_scale") if remapped_name not in params_dict: - logger.warning_once( + logger.print_warning_once( f"Found kv_scale in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). kv_scale is " @@ -668,7 +668,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: if name.endswith(scale_name): remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: - logger.warning_once( + logger.print_warning_once( f"Found {scale_name} in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). {scale_name} is " diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 22dcba94672d..54017700ccc3 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1125,7 +1125,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.warning_once( + logger.print_warning_once( "Found kv scale in the checkpoint (e.g. 
" f"{name}), but not found the expected name in " f"the model (e.g. {remapped_kv_scale_name}). " diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index fbe5d1aee04b..2cc47aeb2d99 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -448,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.warning_once( + logger.print_warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 95de6c21871b..a0ed344637ee 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -526,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.warning_once( + logger.print_warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 30381846e4b1..324730f59f18 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -621,7 +621,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN else: - logger.warning_once( + logger.print_warning_once( "Current `vllm-flash-attn` has a bug inside vision module, " "so we use xformers backend instead. You can run " "`pip install flash-attn` to use flash-attention backend.") From da18374c4a5bc1997a337085e5eff733d67e1ad3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 19 Dec 2024 03:45:34 +0000 Subject: [PATCH 03/20] fix Signed-off-by: DarkLight1337 --- vllm/inputs/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index d3076caae7af..d4c175e4de17 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -214,7 +214,7 @@ def _can_process_multimodal(self) -> bool: # updated to use the new multi-modal processor can_process_multimodal = self.mm_registry.has_processor(model_config) if not can_process_multimodal: - logger.info_once( + logger.print_info_once( "Your model uses the legacy input pipeline instead of the new " "multi-modal processor. Please note that the legacy pipeline " "will be removed in a future release. 
For more details, see: " From d6a80810851cf5822ffd1405ecfd1a4b8add2eb2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Dec 2024 07:24:43 +0000 Subject: [PATCH 04/20] Update Signed-off-by: DarkLight1337 --- vllm/lora/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 3a84a6ae1c02..09ab2ec776de 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -32,7 +32,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.utils import WeightsMapper -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -122,7 +121,7 @@ def parse_fine_tuned_lora_name( ("orig_to_new_suffix", w_mapper.orig_to_new_suffix), ]: if mapping: - print_warning_once( + logger.print_warning_once( f"vLLM currently does not support mapping of LoRA weights " f"for {mapping}.") setattr(w_mapper, attr, {}) From 02143692ba86a7ba6198a67e12982c5317c8bd2f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 6 Jan 2025 15:58:56 +0000 Subject: [PATCH 05/20] Oops Signed-off-by: DarkLight1337 --- vllm/lora/peft_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index acd7f7de8dd4..4bc8685233cd 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -6,7 +6,7 @@ from vllm.logger import init_logger -logger = init_logger(__name++) +logger = init_logger(__name__) @dataclass From cae7efbeb9327ec28d0581bef5358a2cc96aa4c8 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Tue, 7 Jan 2025 17:44:39 +0100 Subject: [PATCH 06/20] add debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index ab6f6e5d2060..c0467b62a37b 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,6 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin + kubectl -n ns-vllm get pods & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From f882571befa131ac54a90480a9abe3f8a0d17c46 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:12:13 +0100 Subject: [PATCH 07/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux 
<55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index c0467b62a37b..0055465b9b17 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - kubectl -n ns-vllm get pods & + kubectl -n ns-vllm get all & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From 856168cff062112ff3719fc38fd573d0f51c29f5 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:23:52 +0100 Subject: [PATCH 08/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 0055465b9b17..0d85e9b2d01c 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - kubectl -n ns-vllm get all & + watch -n 5 kubectl -n ns-vllm get pods & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From cf5186fa756d876e21f15ef0de8a438e9a347647 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:38:47 +0100 Subject: [PATCH 09/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 0d85e9b2d01c..c71e0241e42a 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,8 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - watch -n 5 kubectl -n ns-vllm get pods & - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + watch -n 5 kubectl -n ns-vllm get pods; helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | From 7caacd7fb7e59526d366e4d2b00d15a2838efb59 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:55:50 +0100 Subject: [PATCH 10/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index c71e0241e42a..031939bd3975 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,8 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - watch -n 5 kubectl -n ns-vllm get pods; helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string 
image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + for i in {1..30}; do kubectl -n ns-vllm get pods; sleep 5; done & + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | From b293a89ce8002d49ee57c2a4e6a3b1d5d67daed7 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:05:05 +0100 Subject: [PATCH 11/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 031939bd3975..3ba80c587fb5 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - for i in {1..30}; do kubectl -n ns-vllm get pods; sleep 5; done & + while true; do kubectl -n ns-vllm get pods; sleep 5; done & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From 847bb2e4e4bc882456da769232edda51d0540471 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:25:34 +0100 Subject: [PATCH 12/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 3ba80c587fb5..300ea5c04d51 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ 
b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - while true; do kubectl -n ns-vllm get pods; sleep 5; done & + sleep 30; kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}') & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From f531a152f0a3575d2fd02422162dc3572de1c9a4 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:33:50 +0100 Subject: [PATCH 13/20] update debug command format during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 300ea5c04d51..c78f7098012b 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30; kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}') & + sleep 30; kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk "/deployment/ {print $1;exit}") & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From c24454a49666e3b31ed5636dac0a6880a650f3c8 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:39:24 +0100 Subject: [PATCH 14/20] update debug command format during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml 
b/.github/workflows/lint-and-deploy.yaml index c78f7098012b..272f672c02e0 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30; kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk "/deployment/ {print $1;exit}") & + sleep 30 && kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}') & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From 5c3f87ba96bcd59c2167bf2908bde9e7bdafbf8e Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:42:07 +0100 Subject: [PATCH 15/20] update debug command format during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 272f672c02e0..9db3906cf593 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30 && kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}') & + sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From 2c1181905ec20abc0475e9b10030193de58e3a8e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 10:08:15 +0000 Subject: [PATCH 16/20] Ensure init_logger is available only after configuration Signed-off-by: DarkLight1337 --- vllm/logger.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git 
a/vllm/logger.py b/vllm/logger.py index f88cab90ebbf..9d7c08716690 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -8,7 +8,7 @@ from logging import Logger from logging.config import dictConfig from os import path -from typing import Dict, Optional +from typing import Any, Optional import vllm.envs as envs @@ -81,8 +81,8 @@ def print_warning_once(self, msg: str) -> None: _print_warning_once(self, msg) -def _configure_vllm_root_logger() -> None: - logging_config: Dict = {} +def _configure_vllm_root_logger() -> bool: + logging_config = dict[str, Any]() if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: raise RuntimeError( @@ -117,6 +117,14 @@ def _configure_vllm_root_logger() -> None: logging.setLoggerClass(VllmLogger) + return True + + +# The root logger is initialized when the module is imported. +# This is thread-safe as the module is only imported once, +# guaranteed by the Python GIL. +is_configured = _configure_vllm_root_logger() + def init_logger(name: str) -> VllmLogger: """The main purpose of this function is to ensure that loggers are @@ -124,15 +132,10 @@ def init_logger(name: str) -> VllmLogger: already been configured.""" logger = logging.getLogger(name) - assert isinstance(logger, VllmLogger) + assert isinstance(logger, VllmLogger), (is_configured, type(logger)) return logger -# The root logger is initialized when the module is imported. -# This is thread-safe as the module is only imported once, -# guaranteed by the Python GIL. -_configure_vllm_root_logger() - logger = init_logger(__name__) From f0dfbea421e6aa5b38346681f1e21db171e9b3cb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 10:10:23 +0000 Subject: [PATCH 17/20] Update Signed-off-by: DarkLight1337 --- vllm/logger.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/logger.py b/vllm/logger.py index 9d7c08716690..fa9e96618e0a 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -81,7 +81,7 @@ def print_warning_once(self, msg: str) -> None: _print_warning_once(self, msg) -def _configure_vllm_root_logger() -> bool: +def _configure_vllm_root_logger() -> None: logging_config = dict[str, Any]() if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: @@ -117,13 +117,9 @@ def _configure_vllm_root_logger() -> bool: logging.setLoggerClass(VllmLogger) - return True - # The root logger is initialized when the module is imported. -# This is thread-safe as the module is only imported once, -# guaranteed by the Python GIL. 
-is_configured = _configure_vllm_root_logger() +_configure_vllm_root_logger() def init_logger(name: str) -> VllmLogger: @@ -132,7 +128,7 @@ def init_logger(name: str) -> VllmLogger: already been configured.""" logger = logging.getLogger(name) - assert isinstance(logger, VllmLogger), (is_configured, type(logger)) + assert isinstance(logger, VllmLogger), type(logger) return logger From 2381eeba014afa9571175533a42279b9c04ea0aa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 13:47:10 +0000 Subject: [PATCH 18/20] Patch methods instead of creating a subclass Signed-off-by: DarkLight1337 --- vllm/logger.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/vllm/logger.py b/vllm/logger.py index fa9e96618e0a..af1f01c4d933 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -8,7 +8,8 @@ from logging import Logger from logging.config import dictConfig from os import path -from typing import Any, Optional +from types import MethodType +from typing import Any, Optional, cast import vllm.envs as envs @@ -61,7 +62,10 @@ def _print_warning_once(logger: Logger, msg: str) -> None: logger.warning(msg, stacklevel=4) -class VllmLogger(Logger): +# NOTE: This class is just to provide type information. +# We don't set the logger class to avoid conflicting with other +# libraries (e.g. `intel_extension_for_pytorch.utils._logger`). +class _VllmLogger(Logger): # NOTE: We can't use info_once and warning_once because they # are overwritten by transformers: @@ -115,22 +119,31 @@ def _configure_vllm_root_logger() -> None: if logging_config: dictConfig(logging_config) - logging.setLoggerClass(VllmLogger) - -# The root logger is initialized when the module is imported. -_configure_vllm_root_logger() - - -def init_logger(name: str) -> VllmLogger: +def init_logger(name: str) -> _VllmLogger: """The main purpose of this function is to ensure that loggers are retrieved in such a way that we can be sure the root vllm logger has already been configured.""" logger = logging.getLogger(name) - assert isinstance(logger, VllmLogger), type(logger) - return logger + for method_name in ("print_info_once", "print_warning_once"): + method = getattr(_VllmLogger, method_name) + + if hasattr(logger, method_name): + raise RuntimeError( + f"Unable to patch `{method_name}` for {type(logger)} " + "because a method with the same name already exists.") + + setattr(logger, method_name, MethodType(method, logger)) + + return cast(_VllmLogger, logger) + + +# The root logger is initialized when the module is imported. +# This is thread-safe as the module is only imported once, +# guaranteed by the Python GIL. 
+_configure_vllm_root_logger() logger = init_logger(__name__) From b0f6ab871b2405f1a9a812e77083ada3161ef14f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 14:01:15 +0000 Subject: [PATCH 19/20] Fix Signed-off-by: DarkLight1337 --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 494a061c91d0..556b60d2fca1 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -65,7 +65,7 @@ jobs: export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/chart-helm/online_serving/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | From a679077641904c180d507a81eca625607643226e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 14:21:39 +0000 Subject: [PATCH 20/20] Rename Signed-off-by: DarkLight1337 --- vllm/attention/backends/torch_sdpa.py | 5 +-- vllm/attention/backends/xformers.py | 5 +-- vllm/config.py | 6 +-- vllm/entrypoints/chat_utils.py | 6 +-- vllm/inputs/preprocess.py | 8 ++-- vllm/inputs/registry.py | 2 +- vllm/logger.py | 37 +++++++++---------- vllm/lora/peft_helper.py | 2 +- vllm/lora/punica_wrapper/punica_selector.py | 4 +- vllm/model_executor/custom_op.py | 2 +- .../compressed_tensors_moe.py | 2 +- .../model_executor/layers/quantization/fp8.py | 2 +- .../layers/quantization/kv_cache.py | 2 +- .../quantization/utils/marlin_utils_fp8.py | 2 +- .../model_loader/weight_utils.py | 6 +-- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/olmoe.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 2 +- vllm/model_executor/models/utils.py | 2 +- 19 files changed, 48 insertions(+), 51 deletions(-) 
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index d75823735211..ca1c4618615d 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -399,9 +399,8 @@ def __init__( raise ValueError( "Torch SPDA does not support block-sparse attention.") if logits_soft_cap is not None: - logger.print_warning_once( - "Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 175d1e20bb9c..8c8ca8520a9d 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -387,9 +387,8 @@ def __init__( raise ValueError( "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: - logger.print_warning_once( - "XFormers does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("XFormers does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/config.py b/vllm/config.py index e88103bf80fd..19609085cc96 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -313,7 +313,7 @@ def __init__(self, sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) - logger.print_warning_once( + logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " "XFORMERS backend. Disabling sliding window and capping " @@ -2757,7 +2757,7 @@ def uuid(self): def model_post_init(self, __context: Any) -> None: if not self.enable_reshape and self.enable_fusion: - logger.print_warning_once( + logger.warning_once( "Fusion enabled but reshape elimination disabled." "RMSNorm + quant (fp8) fusion might not work") @@ -3150,7 +3150,7 @@ def __post_init__(self): self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): - logger.print_warning_once( + logger.warning_once( "Turing devices tensor cores do not support float32 matmul. 
" "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3a315b731560..923c7459f694 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -984,14 +984,14 @@ def apply_mistral_chat_template( **kwargs: Any, ) -> List[int]: if chat_template is not None: - logger.print_warning_once( + logger.warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") if "add_generation_prompt" in kwargs: - logger.print_warning_once( + logger.warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " "so it will be ignored.") if "continue_final_message" in kwargs: - logger.print_warning_once( + logger.warning_once( "'continue_final_message' is not supported for mistral tokenizer, " "so it will be ignored.") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 9b8ef7a8e6d4..0a789d39346e 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,13 +67,13 @@ def get_decoder_start_token_id(self) -> Optional[int]: ''' if not self.model_config.is_encoder_decoder: - logger.print_warning_once( + logger.warning_once( "Using None for decoder start token id because " "this is not an encoder/decoder model.") return None if (self.model_config is None or self.model_config.hf_config is None): - logger.print_warning_once( + logger.warning_once( "Using None for decoder start token id because " "model config is not available.") return None @@ -81,7 +81,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: dec_start_token_id = getattr(self.model_config.hf_config, 'decoder_start_token_id', None) if dec_start_token_id is None: - logger.print_warning_once( + logger.warning_once( "Falling back on for decoder start token " "id because decoder start token id is not " "available.") @@ -227,7 +227,7 @@ def _can_process_multimodal(self) -> bool: # updated to use the new multi-modal processor can_process_multimodal = self.mm_registry.has_processor(model_config) if not can_process_multimodal: - logger.print_info_once( + logger.info_once( "Your model uses the legacy input pipeline instead of the new " "multi-modal processor. Please note that the legacy pipeline " "will be removed in a future release. 
For more details, see: " diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index dc2a4496a72b..aad0dfab94a0 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -352,7 +352,7 @@ def dummy_data_for_profiling( num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: - logger.print_warning_once( + logger.warning_once( f"Expected at least {seq_len} dummy encoder tokens for " f"profiling, but found {len(num_tokens)} tokens instead.") else: diff --git a/vllm/logger.py b/vllm/logger.py index af1f01c4d933..cac174f7ba02 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -52,32 +52,33 @@ @lru_cache def _print_info_once(logger: Logger, msg: str) -> None: - # Set the stacklevel to 4 to print the original caller's line info - logger.info(msg, stacklevel=4) + # Set the stacklevel to 2 to print the original caller's line info + logger.info(msg, stacklevel=2) @lru_cache def _print_warning_once(logger: Logger, msg: str) -> None: - # Set the stacklevel to 4 to print the original caller's line info - logger.warning(msg, stacklevel=4) + # Set the stacklevel to 2 to print the original caller's line info + logger.warning(msg, stacklevel=2) -# NOTE: This class is just to provide type information. -# We don't set the logger class to avoid conflicting with other -# libraries (e.g. `intel_extension_for_pytorch.utils._logger`). class _VllmLogger(Logger): + """ + Note: + This class is just to provide type information. + We actually patch the methods directly on the :class:`logging.Logger` + instance to avoid conflicting with other libraries such as + `intel_extension_for_pytorch.utils._logger`. + """ - # NOTE: We can't use info_once and warning_once because they - # are overwritten by transformers: - # https://github.com/huggingface/transformers/blob/2c47618c1a282f925446506d53108dc6e82d9ef0/src/transformers/utils/logging.py#L331 - def print_info_once(self, msg: str) -> None: + def info_once(self, msg: str) -> None: """ As :meth:`info`, but subsequent calls with the same message are silently dropped. """ _print_info_once(self, msg) - def print_warning_once(self, msg: str) -> None: + def warning_once(self, msg: str) -> None: """ As :meth:`warning`, but subsequent calls with the same message are silently dropped. 
@@ -127,14 +128,12 @@ def init_logger(name: str) -> _VllmLogger: logger = logging.getLogger(name) - for method_name in ("print_info_once", "print_warning_once"): - method = getattr(_VllmLogger, method_name) - - if hasattr(logger, method_name): - raise RuntimeError( - f"Unable to patch `{method_name}` for {type(logger)} " - "because a method with the same name already exists.") + methods_to_patch = { + "info_once": _print_info_once, + "warning_once": _print_warning_once, + } + for method_name, method in methods_to_patch.items(): setattr(logger, method_name, MethodType(method, logger)) return cast(_VllmLogger, logger) diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index 4bc8685233cd..dacfb9ebd148 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -44,7 +44,7 @@ def _validate_features(self): def __post_init__(self): self._validate_features() if self.use_rslora: - logger.print_info_once("Loading LoRA weights trained with rsLoRA.") + logger.info_once("Loading LoRA weights trained with rsLoRA.") self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) else: self.vllm_lora_scaling_factor = self.lora_alpha / self.r diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index de8c1bc9f903..9791d492d8e4 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -10,12 +10,12 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: if current_platform.is_cuda_alike(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - logger.print_info_once("Using PunicaWrapperGPU.") + logger.info_once("Using PunicaWrapperGPU.") return PunicaWrapperGPU(*args, **kwargs) elif current_platform.is_hpu(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU - logger.print_info_once("Using PunicaWrapperHPU.") + logger.info_once("Using PunicaWrapperHPU.") return PunicaWrapperHPU(*args, **kwargs) else: raise NotImplementedError diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 70d187c2db6d..401606e8c76f 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -90,7 +90,7 @@ def enabled(cls) -> bool: compilation_config = get_current_vllm_config().compilation_config custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): - logger.print_warning_once( + logger.warning_once( f"Custom op {cls.__name__} was not registered, " f"which means it won't appear in the op registry. " f"It will be enabled/disabled based on the global settings.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 2a68b407ab3f..4fb8fd84e92d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -144,7 +144,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - logger.print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. 
Using the maximum across experts " "for each layer.") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 6c70f4297cb4..a1be45a49e94 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -538,7 +538,7 @@ def process_weights_after_loading(self, layer: Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - logger.print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " "for each layer.") diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index b1e63cd5cc40..a74f5415c8a5 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -69,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer._v_scale = v_scale if (layer._k_scale == 1.0 and layer._v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype): - logger.print_warning_once( + logger.warning_once( "Using KV cache scaling factor 1.0 for fp8_e4m3. This " "may cause accuracy issues. Please make sure k/v_scale " "scaling factors are available in the fp8 checkpoint.") diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 09955307a5d4..245fe9238e42 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -49,7 +49,7 @@ def apply_fp8_marlin_linear( def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, strategy: str = "tensor") -> None: - logger.print_warning_once( + logger.warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. Weight-only FP8 compression will " "be used leveraging the Marlin kernel. This may degrade " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index ebb5fec61403..11d5fd7135d9 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -673,7 +673,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: None: If the remapped name is not found in params_dict. """ if name.endswith(".kv_scale"): - logger.print_warning_once( + logger.warning_once( "DEPRECATED. Found kv_scale in the checkpoint. " "This format is deprecated in favor of separate k_scale and " "v_scale tensors and will be removed in a future release. " @@ -682,7 +682,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: # NOTE: we remap the deprecated kv_scale to k_scale remapped_name = name.replace(".kv_scale", ".attn.k_scale") if remapped_name not in params_dict: - logger.print_warning_once( + logger.warning_once( f"Found kv_scale in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). kv_scale is " @@ -695,7 +695,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: if name.endswith(scale_name): remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: - logger.print_warning_once( + logger.warning_once( f"Found {scale_name} in the checkpoint (e.g. 
{name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). {scale_name} is " diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 6ac046596c35..452fe727875f 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1113,7 +1113,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint (e.g. " f"{name}), but not found the expected name in " f"the model (e.g. {remapped_kv_scale_name}). " diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 2cc47aeb2d99..fbe5d1aee04b 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -448,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index a0ed344637ee..95de6c21871b 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -526,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 94d0c09a7c4d..c294f7e8fcad 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -630,7 +630,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN else: - logger.print_warning_once( + logger.warning_once( "Current `vllm-flash-attn` has a bug inside vision module, " "so we use xformers backend instead. You can run " "`pip install flash-attn` to use flash-attention backend.")