From d60537f3b8347184f6e3f64034d84be397c23cdd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 18 Dec 2024 10:33:33 +0000 Subject: [PATCH 01/20] Move `print_*_once` from utils to logger Signed-off-by: DarkLight1337 --- vllm/attention/backends/torch_sdpa.py | 9 +++-- vllm/attention/backends/xformers.py | 8 ++-- vllm/config.py | 9 ++--- vllm/entrypoints/chat_utils.py | 7 ++-- vllm/inputs/preprocess.py | 20 +++++----- vllm/inputs/registry.py | 4 +- vllm/logger.py | 39 +++++++++++++++++-- vllm/lora/punica_wrapper/punica_selector.py | 8 ++-- vllm/model_executor/custom_op.py | 3 +- .../compressed_tensors_moe.py | 8 ++-- .../model_executor/layers/quantization/fp8.py | 5 +-- .../layers/quantization/kv_cache.py | 6 ++- .../quantization/utils/marlin_utils_fp8.py | 6 ++- .../model_loader/weight_utils.py | 7 ++-- vllm/model_executor/models/chameleon.py | 6 ++- vllm/model_executor/models/olmoe.py | 6 ++- vllm/model_executor/models/qwen2_moe.py | 6 ++- vllm/model_executor/models/utils.py | 4 +- vllm/utils.py | 12 ------ 19 files changed, 105 insertions(+), 68 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 0cff6f5952ab..d782de24fd3e 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -13,9 +13,12 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.ipex_attn import PagedAttention from vllm.attention.ops.paged_attn import PagedAttentionMetadata -from vllm.utils import make_tensor_with_pad, print_warning_once +from vllm.logger import init_logger +from vllm.utils import make_tensor_with_pad from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder +logger = init_logger(__name__) + class TorchSDPABackend(AttentionBackend): @@ -395,8 +398,8 @@ def __init__( raise ValueError( "Torch SPDA does not support block-sparse attention.") if logits_soft_cap is not None: - print_warning_once("Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 3e59b3603d2c..2c4997ea0a93 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -17,7 +17,9 @@ is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) -from vllm.utils import print_warning_once +from vllm.logger import init_logger + +logger = init_logger(__name__) class XFormersBackend(AttentionBackend): @@ -384,8 +386,8 @@ def __init__( raise ValueError( "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: - print_warning_once("XFormers does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("XFormers does not support logits soft cap. 
" + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/config.py b/vllm/config.py index 307cf9c8d5b2..07e3d34d47f6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -29,8 +29,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, - get_cpu_memory, print_warning_once, random_uuid, - resolve_obj_by_qualname) + get_cpu_memory, random_uuid, resolve_obj_by_qualname) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -302,7 +301,7 @@ def __init__(self, sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) - print_warning_once( + logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " "XFORMERS backend. Disabling sliding window and capping " @@ -2639,7 +2638,7 @@ def uuid(self): def model_post_init(self, __context: Any) -> None: if not self.enable_reshape and self.enable_fusion: - print_warning_once( + logger.warning_once( "Fusion enabled but reshape elimination disabled." "RMSNorm + quant (fp8) fusion might not work") @@ -3018,7 +3017,7 @@ def __post_init__(self): self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): - print_warning_once( + logger.warning_once( "Turing devices tensor cores do not support float32 matmul. " "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3df08c740d65..cc9936741a45 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -37,7 +37,6 @@ get_and_parse_audio, get_and_parse_image, get_and_parse_video) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -1000,14 +999,14 @@ def apply_mistral_chat_template( **kwargs: Any, ) -> List[int]: if chat_template is not None: - print_warning_once( + logger.warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") if "add_generation_prompt" in kwargs: - print_warning_once( + logger.warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " "so it will be ignored.") if "continue_final_message" in kwargs: - print_warning_once( + logger.warning_once( "'continue_final_message' is not supported for mistral tokenizer, " "so it will be ignored.") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3d606817e90a..b4ec89db5d73 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -10,7 +10,6 @@ from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.utils import print_info_once, print_warning_once from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, PromptType, SingletonInputs, SingletonPrompt, token_inputs) @@ -68,21 +67,24 @@ def get_decoder_start_token_id(self) -> Optional[int]: ''' if not self.model_config.is_encoder_decoder: - print_warning_once("Using None for decoder start token id because " - "this is not an encoder/decoder model.") + logger.warning_once( + 
"Using None for decoder start token id because " + "this is not an encoder/decoder model.") return None if (self.model_config is None or self.model_config.hf_config is None): - print_warning_once("Using None for decoder start token id because " - "model config is not available.") + logger.warning_once( + "Using None for decoder start token id because " + "model config is not available.") return None dec_start_token_id = getattr(self.model_config.hf_config, 'decoder_start_token_id', None) if dec_start_token_id is None: - print_warning_once("Falling back on for decoder start token " - "id because decoder start token id is not " - "available.") + logger.warning_once( + "Falling back on for decoder start token " + "id because decoder start token id is not " + "available.") dec_start_token_id = self.get_bos_token_id() return dec_start_token_id @@ -212,7 +214,7 @@ def _can_process_multimodal(self) -> bool: # updated to use the new multi-modal processor can_process_multimodal = self.mm_registry.has_processor(model_config) if not can_process_multimodal: - print_info_once( + logger.info_once( "Your model uses the legacy input pipeline instead of the new " "multi-modal processor. Please note that the legacy pipeline " "will be removed in a future release. For more details, see: " diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 0b85484c4871..d767b56ff60d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -12,7 +12,7 @@ from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, - print_warning_once, resolve_mm_processor_kwargs) + resolve_mm_processor_kwargs) from .data import ProcessorInputs, SingletonInputs from .parse import is_encoder_decoder_inputs @@ -316,7 +316,7 @@ def dummy_data_for_profiling( num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: - print_warning_once( + logger.warning_once( f"Expected at least {seq_len} dummy encoder tokens for " f"profiling, but found {len(num_tokens)} tokens instead.") else: diff --git a/vllm/logger.py b/vllm/logger.py index 538db0dcf19a..3b6258ca173e 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -4,7 +4,7 @@ import logging import os import sys -from functools import partial +from functools import lru_cache, partial from logging import Logger from logging.config import dictConfig from os import path @@ -49,6 +49,35 @@ } +@lru_cache +def _print_info_once(logger: Logger, msg: str) -> None: + # Set the stacklevel to 4 to print the original caller's line info + logger.info(msg, stacklevel=4) + + +@lru_cache +def _print_warning_once(logger: Logger, msg: str) -> None: + # Set the stacklevel to 4 to print the original caller's line info + logger.warning(msg, stacklevel=4) + + +class VllmLogger(Logger): + + def info_once(self, msg: str) -> None: + """ + As :meth:`info`, but subsequent calls with the same message + are silently dropped. + """ + _print_info_once(self, msg) + + def warning_once(self, msg: str) -> None: + """ + As :meth:`warning`, but subsequent calls with the same message + are silently dropped. 
+ """ + _print_warning_once(self, msg) + + def _configure_vllm_root_logger() -> None: logging_config: Dict = {} @@ -83,13 +112,17 @@ def _configure_vllm_root_logger() -> None: if logging_config: dictConfig(logging_config) + logging.setLoggerClass(VllmLogger) + -def init_logger(name: str) -> Logger: +def init_logger(name: str) -> VllmLogger: """The main purpose of this function is to ensure that loggers are retrieved in such a way that we can be sure the root vllm logger has already been configured.""" - return logging.getLogger(name) + logger = logging.getLogger(name) + assert isinstance(logger, VllmLogger) + return logger # The root logger is initialized when the module is imported. diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index cd64878d95ae..9791d492d8e4 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -1,19 +1,21 @@ +from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_info_once from .punica_base import PunicaWrapperBase +logger = init_logger(__name__) + def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: if current_platform.is_cuda_alike(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - print_info_once("Using PunicaWrapperGPU.") + logger.info_once("Using PunicaWrapperGPU.") return PunicaWrapperGPU(*args, **kwargs) elif current_platform.is_hpu(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU - print_info_once("Using PunicaWrapperHPU.") + logger.info_once("Using PunicaWrapperHPU.") return PunicaWrapperHPU(*args, **kwargs) else: raise NotImplementedError diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index fddc8bad09ef..401606e8c76f 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -5,7 +5,6 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -91,7 +90,7 @@ def enabled(cls) -> bool: compilation_config = get_current_vllm_config().compilation_config custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): - print_warning_once( + logger.warning_once( f"Custom op {cls.__name__} was not registered, " f"which means it won't appear in the op registry. 
" f"It will be enabled/disabled based on the global settings.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index dad04017d321..712dc486f236 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -8,6 +8,7 @@ import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( @@ -16,7 +17,8 @@ all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import print_warning_once + +logger = init_logger(__name__) class GPTQMarlinState(Enum): @@ -142,10 +144,10 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " - "for each layer. ") + "for each layer.") layer.w13_input_scale = torch.nn.Parameter( layer.w13_input_scale.max(), requires_grad=False) layer.w2_input_scale = torch.nn.Parameter( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 978e727bc7cb..259ff1997f37 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -26,7 +26,6 @@ PerTensorScaleParameter) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import print_warning_once ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -408,10 +407,10 @@ def process_weights_after_loading(self, layer: Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " - "for each layer. ") + "for each layer.") layer.w13_input_scale = torch.nn.Parameter( layer.w13_input_scale.max(), requires_grad=False) layer.w2_input_scale = torch.nn.Parameter( diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index d79536d196b9..a74f5415c8a5 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -1,8 +1,10 @@ import torch +from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) -from vllm.utils import print_warning_once + +logger = init_logger(__name__) class BaseKVCacheMethod(QuantizeMethodBase): @@ -67,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer._v_scale = v_scale if (layer._k_scale == 1.0 and layer._v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype): - print_warning_once( + logger.warning_once( "Using KV cache scaling factor 1.0 for fp8_e4m3. This " "may cause accuracy issues. 
Please make sure k/v_scale " "scaling factors are available in the fp8 checkpoint.") diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 8b3dfaae971c..245fe9238e42 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -3,11 +3,13 @@ import torch import vllm._custom_ops as ops +from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import print_warning_once from .marlin_utils import marlin_make_workspace, marlin_permute_scales +logger = init_logger(__name__) + def is_fp8_marlin_supported(): return current_platform.has_device_capability(80) @@ -47,7 +49,7 @@ def apply_fp8_marlin_linear( def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, strategy: str = "tensor") -> None: - print_warning_once( + logger.warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. Weight-only FP8 compression will " "be used leveraging the Marlin kernel. This may degrade " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 9488d54edf36..b3ac10370293 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -25,7 +25,6 @@ get_quantization_config) from vllm.model_executor.layers.quantization.schema import QuantParamSchema from vllm.platforms import current_platform -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -647,7 +646,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: None: If the remapped name is not found in params_dict. """ if name.endswith(".kv_scale"): - print_warning_once( + logger.warning_once( "DEPRECATED. Found kv_scale in the checkpoint. " "This format is deprecated in favor of separate k_scale and " "v_scale tensors and will be removed in a future release. " @@ -656,7 +655,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: # NOTE: we remap the deprecated kv_scale to k_scale remapped_name = name.replace(".kv_scale", ".attn.k_scale") if remapped_name not in params_dict: - print_warning_once( + logger.warning_once( f"Found kv_scale in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). kv_scale is " @@ -669,7 +668,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: if name.endswith(scale_name): remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: - print_warning_once( + logger.warning_once( f"Found {scale_name} in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). 
{scale_name} is " diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a40c321ce0a5..22dcba94672d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -13,6 +13,7 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -34,13 +35,14 @@ consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors, SequenceData -from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) +logger = init_logger(__name__) + # These configs are not part of the model config but the preprocessor # and processor files, so we hardcode them in the model file for now. CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 @@ -1123,7 +1125,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint (e.g. " f"{name}), but not found the expected name in " f"the model (e.g. {remapped_kv_scale_name}). " diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 5d9091cfb931..fbe5d1aee04b 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -20,6 +20,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -34,13 +35,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class OlmoeMoE(nn.Module): """A tensor-parallel MoE implementation for Olmoe that shards each expert @@ -446,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. 
{name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index ba70243c6533..95de6c21871b 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,6 +34,7 @@ from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -50,13 +51,14 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once from .interfaces import SupportsPP from .utils import (extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class Qwen2MoeMLP(nn.Module): @@ -524,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 269b66806adf..30381846e4b1 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -17,7 +17,7 @@ from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_pin_memory_available, print_warning_once +from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -621,7 +621,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN else: - print_warning_once( + logger.warning_once( "Current `vllm-flash-attn` has a bug inside vision module, " "so we use xformers backend instead. 
You can run " "`pip install flash-attn` to use flash-attention backend.") diff --git a/vllm/utils.py b/vllm/utils.py index 38c7dea6d2d3..96553cb23275 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -661,18 +661,6 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache -def print_info_once(msg: str) -> None: - # Set the stacklevel to 2 to print the caller's line info - logger.info(msg, stacklevel=2) - - -@lru_cache -def print_warning_once(msg: str) -> None: - # Set the stacklevel to 2 to print the caller's line info - logger.warning(msg, stacklevel=2) - - @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: return current_platform.is_pin_memory_available() From 1677ecabcf1ca62b31d2ea9b22d000e5e6ccb607 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 19 Dec 2024 03:43:59 +0000 Subject: [PATCH 02/20] Avoid name clash Signed-off-by: DarkLight1337 --- vllm/attention/backends/torch_sdpa.py | 5 +++-- vllm/attention/backends/xformers.py | 5 +++-- vllm/config.py | 6 +++--- vllm/entrypoints/chat_utils.py | 6 +++--- vllm/inputs/preprocess.py | 6 +++--- vllm/inputs/registry.py | 2 +- vllm/logger.py | 7 +++++-- vllm/lora/punica_wrapper/punica_selector.py | 4 ++-- vllm/model_executor/custom_op.py | 2 +- .../compressed_tensors/compressed_tensors_moe.py | 2 +- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/layers/quantization/kv_cache.py | 2 +- .../layers/quantization/utils/marlin_utils_fp8.py | 2 +- vllm/model_executor/model_loader/weight_utils.py | 6 +++--- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/olmoe.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 2 +- vllm/model_executor/models/utils.py | 2 +- 18 files changed, 35 insertions(+), 30 deletions(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index d782de24fd3e..87283633cf32 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -398,8 +398,9 @@ def __init__( raise ValueError( "Torch SPDA does not support block-sparse attention.") if logits_soft_cap is not None: - logger.warning_once("Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") + logger.print_warning_once( + "Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 2c4997ea0a93..958ce41294b0 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -386,8 +386,9 @@ def __init__( raise ValueError( "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: - logger.warning_once("XFormers does not support logits soft cap. " - "Outputs may be slightly off.") + logger.print_warning_once( + "XFormers does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/config.py b/vllm/config.py index 07e3d34d47f6..61059351401f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -301,7 +301,7 @@ def __init__(self, sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) - logger.warning_once( + logger.print_warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " "XFORMERS backend. 
Disabling sliding window and capping " @@ -2638,7 +2638,7 @@ def uuid(self): def model_post_init(self, __context: Any) -> None: if not self.enable_reshape and self.enable_fusion: - logger.warning_once( + logger.print_warning_once( "Fusion enabled but reshape elimination disabled." "RMSNorm + quant (fp8) fusion might not work") @@ -3017,7 +3017,7 @@ def __post_init__(self): self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): - logger.warning_once( + logger.print_warning_once( "Turing devices tensor cores do not support float32 matmul. " "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index cc9936741a45..ff6bdb76acb2 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -999,14 +999,14 @@ def apply_mistral_chat_template( **kwargs: Any, ) -> List[int]: if chat_template is not None: - logger.warning_once( + logger.print_warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") if "add_generation_prompt" in kwargs: - logger.warning_once( + logger.print_warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " "so it will be ignored.") if "continue_final_message" in kwargs: - logger.warning_once( + logger.print_warning_once( "'continue_final_message' is not supported for mistral tokenizer, " "so it will be ignored.") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b4ec89db5d73..d3076caae7af 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,13 +67,13 @@ def get_decoder_start_token_id(self) -> Optional[int]: ''' if not self.model_config.is_encoder_decoder: - logger.warning_once( + logger.print_warning_once( "Using None for decoder start token id because " "this is not an encoder/decoder model.") return None if (self.model_config is None or self.model_config.hf_config is None): - logger.warning_once( + logger.print_warning_once( "Using None for decoder start token id because " "model config is not available.") return None @@ -81,7 +81,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: dec_start_token_id = getattr(self.model_config.hf_config, 'decoder_start_token_id', None) if dec_start_token_id is None: - logger.warning_once( + logger.print_warning_once( "Falling back on for decoder start token " "id because decoder start token id is not " "available.") diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index d767b56ff60d..d2aab36e006e 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -316,7 +316,7 @@ def dummy_data_for_profiling( num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: - logger.warning_once( + logger.print_warning_once( f"Expected at least {seq_len} dummy encoder tokens for " f"profiling, but found {len(num_tokens)} tokens instead.") else: diff --git a/vllm/logger.py b/vllm/logger.py index 3b6258ca173e..f88cab90ebbf 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -63,14 +63,17 @@ def _print_warning_once(logger: Logger, msg: str) -> None: class VllmLogger(Logger): - def info_once(self, msg: str) -> None: + # NOTE: We can't use info_once and warning_once because they + # are overwritten by transformers: + # https://github.com/huggingface/transformers/blob/2c47618c1a282f925446506d53108dc6e82d9ef0/src/transformers/utils/logging.py#L331 + 
def print_info_once(self, msg: str) -> None: """ As :meth:`info`, but subsequent calls with the same message are silently dropped. """ _print_info_once(self, msg) - def warning_once(self, msg: str) -> None: + def print_warning_once(self, msg: str) -> None: """ As :meth:`warning`, but subsequent calls with the same message are silently dropped. diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index 9791d492d8e4..de8c1bc9f903 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -10,12 +10,12 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: if current_platform.is_cuda_alike(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - logger.info_once("Using PunicaWrapperGPU.") + logger.print_info_once("Using PunicaWrapperGPU.") return PunicaWrapperGPU(*args, **kwargs) elif current_platform.is_hpu(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU - logger.info_once("Using PunicaWrapperHPU.") + logger.print_info_once("Using PunicaWrapperHPU.") return PunicaWrapperHPU(*args, **kwargs) else: raise NotImplementedError diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 401606e8c76f..70d187c2db6d 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -90,7 +90,7 @@ def enabled(cls) -> bool: compilation_config = get_current_vllm_config().compilation_config custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): - logger.warning_once( + logger.print_warning_once( f"Custom op {cls.__name__} was not registered, " f"which means it won't appear in the op registry. " f"It will be enabled/disabled based on the global settings.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 712dc486f236..6f93927fb49a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -144,7 +144,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - logger.warning_once( + logger.print_warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " "for each layer.") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 259ff1997f37..1bc77f1e912d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -407,7 +407,7 @@ def process_weights_after_loading(self, layer: Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - logger.warning_once( + logger.print_warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. 
Using the maximum across experts " "for each layer.") diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index a74f5415c8a5..b1e63cd5cc40 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -69,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer._v_scale = v_scale if (layer._k_scale == 1.0 and layer._v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype): - logger.warning_once( + logger.print_warning_once( "Using KV cache scaling factor 1.0 for fp8_e4m3. This " "may cause accuracy issues. Please make sure k/v_scale " "scaling factors are available in the fp8 checkpoint.") diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 245fe9238e42..09955307a5d4 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -49,7 +49,7 @@ def apply_fp8_marlin_linear( def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, strategy: str = "tensor") -> None: - logger.warning_once( + logger.print_warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. Weight-only FP8 compression will " "be used leveraging the Marlin kernel. This may degrade " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b3ac10370293..75c8cddc3a12 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -646,7 +646,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: None: If the remapped name is not found in params_dict. """ if name.endswith(".kv_scale"): - logger.warning_once( + logger.print_warning_once( "DEPRECATED. Found kv_scale in the checkpoint. " "This format is deprecated in favor of separate k_scale and " "v_scale tensors and will be removed in a future release. " @@ -655,7 +655,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: # NOTE: we remap the deprecated kv_scale to k_scale remapped_name = name.replace(".kv_scale", ".attn.k_scale") if remapped_name not in params_dict: - logger.warning_once( + logger.print_warning_once( f"Found kv_scale in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). kv_scale is " @@ -668,7 +668,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: if name.endswith(scale_name): remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: - logger.warning_once( + logger.print_warning_once( f"Found {scale_name} in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). {scale_name} is " diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 22dcba94672d..54017700ccc3 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1125,7 +1125,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.warning_once( + logger.print_warning_once( "Found kv scale in the checkpoint (e.g. 
" f"{name}), but not found the expected name in " f"the model (e.g. {remapped_kv_scale_name}). " diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index fbe5d1aee04b..2cc47aeb2d99 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -448,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.warning_once( + logger.print_warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 95de6c21871b..a0ed344637ee 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -526,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.warning_once( + logger.print_warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 30381846e4b1..324730f59f18 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -621,7 +621,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN else: - logger.warning_once( + logger.print_warning_once( "Current `vllm-flash-attn` has a bug inside vision module, " "so we use xformers backend instead. You can run " "`pip install flash-attn` to use flash-attention backend.") From da18374c4a5bc1997a337085e5eff733d67e1ad3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 19 Dec 2024 03:45:34 +0000 Subject: [PATCH 03/20] fix Signed-off-by: DarkLight1337 --- vllm/inputs/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index d3076caae7af..d4c175e4de17 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -214,7 +214,7 @@ def _can_process_multimodal(self) -> bool: # updated to use the new multi-modal processor can_process_multimodal = self.mm_registry.has_processor(model_config) if not can_process_multimodal: - logger.info_once( + logger.print_info_once( "Your model uses the legacy input pipeline instead of the new " "multi-modal processor. Please note that the legacy pipeline " "will be removed in a future release. 
For more details, see: " From d6a80810851cf5822ffd1405ecfd1a4b8add2eb2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Dec 2024 07:24:43 +0000 Subject: [PATCH 04/20] Update Signed-off-by: DarkLight1337 --- vllm/lora/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 3a84a6ae1c02..09ab2ec776de 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -32,7 +32,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.utils import WeightsMapper -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -122,7 +121,7 @@ def parse_fine_tuned_lora_name( ("orig_to_new_suffix", w_mapper.orig_to_new_suffix), ]: if mapping: - print_warning_once( + logger.print_warning_once( f"vLLM currently does not support mapping of LoRA weights " f"for {mapping}.") setattr(w_mapper, attr, {}) From 02143692ba86a7ba6198a67e12982c5317c8bd2f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 6 Jan 2025 15:58:56 +0000 Subject: [PATCH 05/20] Oops Signed-off-by: DarkLight1337 --- vllm/lora/peft_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index acd7f7de8dd4..4bc8685233cd 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -6,7 +6,7 @@ from vllm.logger import init_logger -logger = init_logger(__name++) +logger = init_logger(__name__) @dataclass From cae7efbeb9327ec28d0581bef5358a2cc96aa4c8 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Tue, 7 Jan 2025 17:44:39 +0100 Subject: [PATCH 06/20] add debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index ab6f6e5d2060..c0467b62a37b 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,6 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin + kubectl -n ns-vllm get pods & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From f882571befa131ac54a90480a9abe3f8a0d17c46 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:12:13 +0100 Subject: [PATCH 07/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux 
<55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index c0467b62a37b..0055465b9b17 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - kubectl -n ns-vllm get pods & + kubectl -n ns-vllm get all & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From 856168cff062112ff3719fc38fd573d0f51c29f5 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:23:52 +0100 Subject: [PATCH 08/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 0055465b9b17..0d85e9b2d01c 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - kubectl -n ns-vllm get all & + watch -n 5 kubectl -n ns-vllm get pods & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From cf5186fa756d876e21f15ef0de8a438e9a347647 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:38:47 +0100 Subject: [PATCH 09/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 0d85e9b2d01c..c71e0241e42a 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,8 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - watch -n 5 kubectl -n ns-vllm get pods & - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + watch -n 5 kubectl -n ns-vllm get pods; helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | From 7caacd7fb7e59526d366e4d2b00d15a2838efb59 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:55:50 +0100 Subject: [PATCH 10/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index c71e0241e42a..031939bd3975 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,8 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - watch -n 5 kubectl -n ns-vllm get pods; helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string 
image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + for i in {1..30}; do kubectl -n ns-vllm get pods; sleep 5; done & + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | From b293a89ce8002d49ee57c2a4e6a3b1d5d67daed7 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:05:05 +0100 Subject: [PATCH 11/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 031939bd3975..3ba80c587fb5 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - for i in {1..30}; do kubectl -n ns-vllm get pods; sleep 5; done & + while true; do kubectl -n ns-vllm get pods; sleep 5; done & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From 847bb2e4e4bc882456da769232edda51d0540471 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:25:34 +0100 Subject: [PATCH 12/20] update debug command during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 3ba80c587fb5..300ea5c04d51 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ 
b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - while true; do kubectl -n ns-vllm get pods; sleep 5; done & + sleep 30; kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}') & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From f531a152f0a3575d2fd02422162dc3572de1c9a4 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:33:50 +0100 Subject: [PATCH 13/20] update debug command format during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 300ea5c04d51..c78f7098012b 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30; kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}') & + sleep 30; kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk "/deployment/ {print $1;exit}") & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From c24454a49666e3b31ed5636dac0a6880a650f3c8 Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:39:24 +0100 Subject: [PATCH 14/20] update debug command format during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml 
b/.github/workflows/lint-and-deploy.yaml index c78f7098012b..272f672c02e0 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30; kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk "/deployment/ {print $1;exit}") & + sleep 30 && kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}') & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From 5c3f87ba96bcd59c2167bf2908bde9e7bdafbf8e Mon Sep 17 00:00:00 2001 From: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:42:07 +0100 Subject: [PATCH 15/20] update debug command format during helm deployment in lint and deploy github workflow Signed-off-by: Maxime Fournioux <55544262+mfournioux@users.noreply.github.com> --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 272f672c02e0..9db3906cf593 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -64,7 +64,7 @@ jobs: run: | export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30 && kubectl -n ns-vllm logs -f $(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}') & + sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test From 2c1181905ec20abc0475e9b10030193de58e3a8e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 10:08:15 +0000 Subject: [PATCH 16/20] Ensure init_logger is available only after configuration Signed-off-by: DarkLight1337 --- vllm/logger.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git 
a/vllm/logger.py b/vllm/logger.py index f88cab90ebbf..9d7c08716690 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -8,7 +8,7 @@ from logging import Logger from logging.config import dictConfig from os import path -from typing import Dict, Optional +from typing import Any, Optional import vllm.envs as envs @@ -81,8 +81,8 @@ def print_warning_once(self, msg: str) -> None: _print_warning_once(self, msg) -def _configure_vllm_root_logger() -> None: - logging_config: Dict = {} +def _configure_vllm_root_logger() -> bool: + logging_config = dict[str, Any]() if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: raise RuntimeError( @@ -117,6 +117,14 @@ def _configure_vllm_root_logger() -> None: logging.setLoggerClass(VllmLogger) + return True + + +# The root logger is initialized when the module is imported. +# This is thread-safe as the module is only imported once, +# guaranteed by the Python GIL. +is_configured = _configure_vllm_root_logger() + def init_logger(name: str) -> VllmLogger: """The main purpose of this function is to ensure that loggers are @@ -124,15 +132,10 @@ def init_logger(name: str) -> VllmLogger: already been configured.""" logger = logging.getLogger(name) - assert isinstance(logger, VllmLogger) + assert isinstance(logger, VllmLogger), (is_configured, type(logger)) return logger -# The root logger is initialized when the module is imported. -# This is thread-safe as the module is only imported once, -# guaranteed by the Python GIL. -_configure_vllm_root_logger() - logger = init_logger(__name__) From f0dfbea421e6aa5b38346681f1e21db171e9b3cb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 10:10:23 +0000 Subject: [PATCH 17/20] Update Signed-off-by: DarkLight1337 --- vllm/logger.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/vllm/logger.py b/vllm/logger.py index 9d7c08716690..fa9e96618e0a 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -81,7 +81,7 @@ def print_warning_once(self, msg: str) -> None: _print_warning_once(self, msg) -def _configure_vllm_root_logger() -> bool: +def _configure_vllm_root_logger() -> None: logging_config = dict[str, Any]() if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: @@ -117,13 +117,9 @@ def _configure_vllm_root_logger() -> bool: logging.setLoggerClass(VllmLogger) - return True - # The root logger is initialized when the module is imported. -# This is thread-safe as the module is only imported once, -# guaranteed by the Python GIL. 
-is_configured = _configure_vllm_root_logger() +_configure_vllm_root_logger() def init_logger(name: str) -> VllmLogger: @@ -132,7 +128,7 @@ def init_logger(name: str) -> VllmLogger: already been configured.""" logger = logging.getLogger(name) - assert isinstance(logger, VllmLogger), (is_configured, type(logger)) + assert isinstance(logger, VllmLogger), type(logger) return logger From 2381eeba014afa9571175533a42279b9c04ea0aa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 13:47:10 +0000 Subject: [PATCH 18/20] Patch methods instead of creating a subclass Signed-off-by: DarkLight1337 --- vllm/logger.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/vllm/logger.py b/vllm/logger.py index fa9e96618e0a..af1f01c4d933 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -8,7 +8,8 @@ from logging import Logger from logging.config import dictConfig from os import path -from typing import Any, Optional +from types import MethodType +from typing import Any, Optional, cast import vllm.envs as envs @@ -61,7 +62,10 @@ def _print_warning_once(logger: Logger, msg: str) -> None: logger.warning(msg, stacklevel=4) -class VllmLogger(Logger): +# NOTE: This class is just to provide type information. +# We don't set the logger class to avoid conflicting with other +# libraries (e.g. `intel_extension_for_pytorch.utils._logger`). +class _VllmLogger(Logger): # NOTE: We can't use info_once and warning_once because they # are overwritten by transformers: @@ -115,22 +119,31 @@ def _configure_vllm_root_logger() -> None: if logging_config: dictConfig(logging_config) - logging.setLoggerClass(VllmLogger) - -# The root logger is initialized when the module is imported. -_configure_vllm_root_logger() - - -def init_logger(name: str) -> VllmLogger: +def init_logger(name: str) -> _VllmLogger: """The main purpose of this function is to ensure that loggers are retrieved in such a way that we can be sure the root vllm logger has already been configured.""" logger = logging.getLogger(name) - assert isinstance(logger, VllmLogger), type(logger) - return logger + for method_name in ("print_info_once", "print_warning_once"): + method = getattr(_VllmLogger, method_name) + + if hasattr(logger, method_name): + raise RuntimeError( + f"Unable to patch `{method_name}` for {type(logger)} " + "because a method with the same name already exists.") + + setattr(logger, method_name, MethodType(method, logger)) + + return cast(_VllmLogger, logger) + + +# The root logger is initialized when the module is imported. +# This is thread-safe as the module is only imported once, +# guaranteed by the Python GIL. 
+_configure_vllm_root_logger() logger = init_logger(__name__) From b0f6ab871b2405f1a9a812e77083ada3161ef14f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 14:01:15 +0000 Subject: [PATCH 19/20] Fix Signed-off-by: DarkLight1337 --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 494a061c91d0..556b60d2fca1 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -65,7 +65,7 @@ jobs: export AWS_ACCESS_KEY_ID=minioadmin export AWS_SECRET_ACCESS_KEY=minioadmin sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/chart-helm/online_serving/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" + helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - name: curl test run: | From a679077641904c180d507a81eca625607643226e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 8 Jan 2025 14:21:39 +0000 Subject: [PATCH 20/20] Rename Signed-off-by: DarkLight1337 --- vllm/attention/backends/torch_sdpa.py | 5 +-- vllm/attention/backends/xformers.py | 5 +-- vllm/config.py | 6 +-- vllm/entrypoints/chat_utils.py | 6 +-- vllm/inputs/preprocess.py | 8 ++-- vllm/inputs/registry.py | 2 +- vllm/logger.py | 37 +++++++++---------- vllm/lora/peft_helper.py | 2 +- vllm/lora/punica_wrapper/punica_selector.py | 4 +- vllm/model_executor/custom_op.py | 2 +- .../compressed_tensors_moe.py | 2 +- .../model_executor/layers/quantization/fp8.py | 2 +- .../layers/quantization/kv_cache.py | 2 +- .../quantization/utils/marlin_utils_fp8.py | 2 +- .../model_loader/weight_utils.py | 6 +-- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/olmoe.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 2 +- vllm/model_executor/models/utils.py | 2 +- 19 files changed, 48 insertions(+), 51 deletions(-) 
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index d75823735211..ca1c4618615d 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -399,9 +399,8 @@ def __init__( raise ValueError( "Torch SPDA does not support block-sparse attention.") if logits_soft_cap is not None: - logger.print_warning_once( - "Torch SPDA does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("Torch SPDA does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 175d1e20bb9c..8c8ca8520a9d 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -387,9 +387,8 @@ def __init__( raise ValueError( "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: - logger.print_warning_once( - "XFormers does not support logits soft cap. " - "Outputs may be slightly off.") + logger.warning_once("XFormers does not support logits soft cap. " + "Outputs may be slightly off.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/config.py b/vllm/config.py index e88103bf80fd..19609085cc96 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -313,7 +313,7 @@ def __init__(self, sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) - logger.print_warning_once( + logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " "XFORMERS backend. Disabling sliding window and capping " @@ -2757,7 +2757,7 @@ def uuid(self): def model_post_init(self, __context: Any) -> None: if not self.enable_reshape and self.enable_fusion: - logger.print_warning_once( + logger.warning_once( "Fusion enabled but reshape elimination disabled." "RMSNorm + quant (fp8) fusion might not work") @@ -3150,7 +3150,7 @@ def __post_init__(self): self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): - logger.print_warning_once( + logger.warning_once( "Turing devices tensor cores do not support float32 matmul. 
" "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3a315b731560..923c7459f694 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -984,14 +984,14 @@ def apply_mistral_chat_template( **kwargs: Any, ) -> List[int]: if chat_template is not None: - logger.print_warning_once( + logger.warning_once( "'chat_template' cannot be overridden for mistral tokenizer.") if "add_generation_prompt" in kwargs: - logger.print_warning_once( + logger.warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " "so it will be ignored.") if "continue_final_message" in kwargs: - logger.print_warning_once( + logger.warning_once( "'continue_final_message' is not supported for mistral tokenizer, " "so it will be ignored.") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 9b8ef7a8e6d4..0a789d39346e 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,13 +67,13 @@ def get_decoder_start_token_id(self) -> Optional[int]: ''' if not self.model_config.is_encoder_decoder: - logger.print_warning_once( + logger.warning_once( "Using None for decoder start token id because " "this is not an encoder/decoder model.") return None if (self.model_config is None or self.model_config.hf_config is None): - logger.print_warning_once( + logger.warning_once( "Using None for decoder start token id because " "model config is not available.") return None @@ -81,7 +81,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: dec_start_token_id = getattr(self.model_config.hf_config, 'decoder_start_token_id', None) if dec_start_token_id is None: - logger.print_warning_once( + logger.warning_once( "Falling back on for decoder start token " "id because decoder start token id is not " "available.") @@ -227,7 +227,7 @@ def _can_process_multimodal(self) -> bool: # updated to use the new multi-modal processor can_process_multimodal = self.mm_registry.has_processor(model_config) if not can_process_multimodal: - logger.print_info_once( + logger.info_once( "Your model uses the legacy input pipeline instead of the new " "multi-modal processor. Please note that the legacy pipeline " "will be removed in a future release. 
For more details, see: " diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index dc2a4496a72b..aad0dfab94a0 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -352,7 +352,7 @@ def dummy_data_for_profiling( num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: - logger.print_warning_once( + logger.warning_once( f"Expected at least {seq_len} dummy encoder tokens for " f"profiling, but found {len(num_tokens)} tokens instead.") else: diff --git a/vllm/logger.py b/vllm/logger.py index af1f01c4d933..cac174f7ba02 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -52,32 +52,33 @@ @lru_cache def _print_info_once(logger: Logger, msg: str) -> None: - # Set the stacklevel to 4 to print the original caller's line info - logger.info(msg, stacklevel=4) + # Set the stacklevel to 2 to print the original caller's line info + logger.info(msg, stacklevel=2) @lru_cache def _print_warning_once(logger: Logger, msg: str) -> None: - # Set the stacklevel to 4 to print the original caller's line info - logger.warning(msg, stacklevel=4) + # Set the stacklevel to 2 to print the original caller's line info + logger.warning(msg, stacklevel=2) -# NOTE: This class is just to provide type information. -# We don't set the logger class to avoid conflicting with other -# libraries (e.g. `intel_extension_for_pytorch.utils._logger`). class _VllmLogger(Logger): + """ + Note: + This class is just to provide type information. + We actually patch the methods directly on the :class:`logging.Logger` + instance to avoid conflicting with other libraries such as + `intel_extension_for_pytorch.utils._logger`. + """ - # NOTE: We can't use info_once and warning_once because they - # are overwritten by transformers: - # https://github.com/huggingface/transformers/blob/2c47618c1a282f925446506d53108dc6e82d9ef0/src/transformers/utils/logging.py#L331 - def print_info_once(self, msg: str) -> None: + def info_once(self, msg: str) -> None: """ As :meth:`info`, but subsequent calls with the same message are silently dropped. """ _print_info_once(self, msg) - def print_warning_once(self, msg: str) -> None: + def warning_once(self, msg: str) -> None: """ As :meth:`warning`, but subsequent calls with the same message are silently dropped. 
@@ -127,14 +128,12 @@ def init_logger(name: str) -> _VllmLogger: logger = logging.getLogger(name) - for method_name in ("print_info_once", "print_warning_once"): - method = getattr(_VllmLogger, method_name) - - if hasattr(logger, method_name): - raise RuntimeError( - f"Unable to patch `{method_name}` for {type(logger)} " - "because a method with the same name already exists.") + methods_to_patch = { + "info_once": _print_info_once, + "warning_once": _print_warning_once, + } + for method_name, method in methods_to_patch.items(): setattr(logger, method_name, MethodType(method, logger)) return cast(_VllmLogger, logger) diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index 4bc8685233cd..dacfb9ebd148 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -44,7 +44,7 @@ def _validate_features(self): def __post_init__(self): self._validate_features() if self.use_rslora: - logger.print_info_once("Loading LoRA weights trained with rsLoRA.") + logger.info_once("Loading LoRA weights trained with rsLoRA.") self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) else: self.vllm_lora_scaling_factor = self.lora_alpha / self.r diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index de8c1bc9f903..9791d492d8e4 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -10,12 +10,12 @@ def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: if current_platform.is_cuda_alike(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU - logger.print_info_once("Using PunicaWrapperGPU.") + logger.info_once("Using PunicaWrapperGPU.") return PunicaWrapperGPU(*args, **kwargs) elif current_platform.is_hpu(): # Lazy import to avoid ImportError from vllm.lora.punica_wrapper.punica_hpu import PunicaWrapperHPU - logger.print_info_once("Using PunicaWrapperHPU.") + logger.info_once("Using PunicaWrapperHPU.") return PunicaWrapperHPU(*args, **kwargs) else: raise NotImplementedError diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 70d187c2db6d..401606e8c76f 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -90,7 +90,7 @@ def enabled(cls) -> bool: compilation_config = get_current_vllm_config().compilation_config custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): - logger.print_warning_once( + logger.warning_once( f"Custom op {cls.__name__} was not registered, " f"which means it won't appear in the op registry. " f"It will be enabled/disabled based on the global settings.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 2a68b407ab3f..4fb8fd84e92d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -144,7 +144,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - logger.print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. 
Using the maximum across experts " "for each layer.") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 6c70f4297cb4..a1be45a49e94 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -538,7 +538,7 @@ def process_weights_after_loading(self, layer: Module) -> None: "activation scales are None.") if (not all_close_1d(layer.w13_input_scale) or not all_close_1d(layer.w2_input_scale)): - logger.print_warning_once( + logger.warning_once( "Found input_scales that are not equal for " "fp8 MoE layer. Using the maximum across experts " "for each layer.") diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index b1e63cd5cc40..a74f5415c8a5 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -69,7 +69,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer._v_scale = v_scale if (layer._k_scale == 1.0 and layer._v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype): - logger.print_warning_once( + logger.warning_once( "Using KV cache scaling factor 1.0 for fp8_e4m3. This " "may cause accuracy issues. Please make sure k/v_scale " "scaling factors are available in the fp8 checkpoint.") diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 09955307a5d4..245fe9238e42 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -49,7 +49,7 @@ def apply_fp8_marlin_linear( def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, strategy: str = "tensor") -> None: - logger.print_warning_once( + logger.warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. Weight-only FP8 compression will " "be used leveraging the Marlin kernel. This may degrade " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index ebb5fec61403..11d5fd7135d9 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -673,7 +673,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: None: If the remapped name is not found in params_dict. """ if name.endswith(".kv_scale"): - logger.print_warning_once( + logger.warning_once( "DEPRECATED. Found kv_scale in the checkpoint. " "This format is deprecated in favor of separate k_scale and " "v_scale tensors and will be removed in a future release. " @@ -682,7 +682,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: # NOTE: we remap the deprecated kv_scale to k_scale remapped_name = name.replace(".kv_scale", ".attn.k_scale") if remapped_name not in params_dict: - logger.print_warning_once( + logger.warning_once( f"Found kv_scale in the checkpoint (e.g. {name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). kv_scale is " @@ -695,7 +695,7 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: if name.endswith(scale_name): remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: - logger.print_warning_once( + logger.warning_once( f"Found {scale_name} in the checkpoint (e.g. 
{name}), " "but not found the expected name in the model " f"(e.g. {remapped_name}). {scale_name} is " diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 6ac046596c35..452fe727875f 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1113,7 +1113,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint (e.g. " f"{name}), but not found the expected name in " f"the model (e.g. {remapped_kv_scale_name}). " diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 2cc47aeb2d99..fbe5d1aee04b 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -448,7 +448,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index a0ed344637ee..95de6c21871b 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -526,7 +526,7 @@ def load_weights(self, weights: Iterable[Tuple[str, remapped_kv_scale_name = name.replace( ".kv_scale", ".attn.kv_scale") if remapped_kv_scale_name not in params_dict: - logger.print_warning_once( + logger.warning_once( "Found kv scale in the checkpoint " f"(e.g. {name}), but not found the expected " f"name in the model " diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 94d0c09a7c4d..c294f7e8fcad 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -630,7 +630,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN else: - logger.print_warning_once( + logger.warning_once( "Current `vllm-flash-attn` has a bug inside vision module, " "so we use xformers backend instead. You can run " "`pip install flash-attn` to use flash-attention backend.")