From c036146598399f2a9d4e5d4bde3aaf26af647535 Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun Date: Mon, 17 Nov 2025 14:48:34 +0800 Subject: [PATCH 1/5] [Polish] Polish error message --- fastdeploy/__init__.py | 4 +- fastdeploy/config.py | 129 +++++++++++++++++++++++------------------ 2 files changed, 77 insertions(+), 56 deletions(-) diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py index 9c26737a199..a13a37cde62 100644 --- a/fastdeploy/__init__.py +++ b/fastdeploy/__init__.py @@ -33,8 +33,10 @@ if envs.FD_DEBUG != 1: import logging - pf_logger.logger.setLevel(logging.INFO) + pf_logger.logger.setLevel(logging.ERROR) + import warnings + warnings.filterwarnings("ignore", module="paddleformers") try: import use_triton_in_paddle diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 5ec3df934ac..8c05b90b84b 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -20,7 +20,7 @@ import os from dataclasses import field from enum import Enum -from typing import Any, Dict, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union import paddle import paddle.distributed as dist @@ -33,9 +33,13 @@ from fastdeploy.platforms import current_platform from fastdeploy.scheduler import SchedulerConfig from fastdeploy.transformer_utils.config import get_pooling_config -from fastdeploy.utils import ceil_div, check_unified_ckpt, get_host_ip, get_logger - -logger = get_logger("config", "config.log") +from fastdeploy.utils import ( + ceil_div, + check_unified_ckpt, + console_logger, + get_host_ip, + llm_logger, +) TaskOption = Literal["auto", "generate", "embedding", "embed"] @@ -314,9 +318,9 @@ def reset_config_value(key, value): if not hasattr(self, key.lower()): if os.getenv(key, None): value = eval(os.getenv(key)) - logger.info(f"Get parameter `{key}` = {value} from environment.") + llm_logger.info(f"Get parameter `{key}` = {value} from environment.") else: - logger.info(f"Parameter `{key}` will use default value {value}.") + llm_logger.info(f"Parameter `{key}` will use default value {value}.") setattr(self, key.lower(), value) reset_config_value("COMPRESSION_RATIO", 1.0) @@ -334,10 +338,10 @@ def read_model_config(self): ) elif "torch_dtype" in self.model_config: self.model_format = "torch" - logger.info("The model format is Hugging Face") + llm_logger.info("The model format is Hugging Face") elif "dtype" in self.model_config: self.model_format = "paddle" - logger.info("The model format is Paddle") + llm_logger.info("The model format is Paddle") else: raise ValueError( "Unknown model format. Please ensure your config.json contains " @@ -400,7 +404,7 @@ def _get_runner_type( runner_type = self._get_default_runner_type(architectures) if runner_type != "generate": - logger.info( + llm_logger.info( "Resolved `--runner auto` to `--runner %s`. " "Pass the value explicitly to silence this message.", runner_type, ) @@ -419,7 +423,7 @@ def _get_convert_type( convert_type = self._get_default_convert_type(architectures, runner_type) if convert_type != "none": - logger.info( + llm_logger.info( "Resolved `--convert auto` to `--convert %s`. " "Pass the value explicitly to silence this message.", convert_type, ) @@ -512,10 +516,10 @@ def print(self): """ Print all configuration information. 
""" - logger.info("Model Configuration Information :") + llm_logger.info("Model Configuration Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class ParallelConfig: @@ -559,7 +563,7 @@ def __init__( setattr(self, key, value) if isinstance(self.engine_worker_queue_port, str): self.engine_worker_queue_port = [int(port) for port in self.engine_worker_queue_port.split(",")] - logger.info(f"engine_worker_queue_port: {self.engine_worker_queue_port}") + llm_logger.info(f"engine_worker_queue_port: {self.engine_worker_queue_port}") elif isinstance(self.engine_worker_queue_port, int): self.engine_worker_queue_port = [self.engine_worker_queue_port] # currently, the expert parallel size is equal data parallel size @@ -586,7 +590,7 @@ def __init__( and self.expert_parallel_size > 1 and self.tensor_parallel_size > 1 ) - logger.info(f"use_sequence_parallel_moe: {self.use_sequence_parallel_moe}") + llm_logger.info(f"use_sequence_parallel_moe: {self.use_sequence_parallel_moe}") def set_communicate_group(self): # different tp group id @@ -606,7 +610,7 @@ def set_communicate_group(self): dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset) self.ep_group = dist.new_group(range(self.expert_parallel_size)) dist.collective._set_custom_gid(None) - logger.info( + llm_logger.info( f"data_parallel_size: {self.data_parallel_size}, tensor_parallel_size: {self.tensor_parallel_size}, expert_parallel_size: {self.expert_parallel_size}, data_parallel_rank: {self.data_parallel_rank}, tensor_parallel_rank: {self.tensor_parallel_rank}, expert_parallel_rank: {self.expert_parallel_rank}, tp_group: {self.tp_group}." ) @@ -615,10 +619,10 @@ def print(self): print all config """ - logger.info("Parallel Configuration Information :") + llm_logger.info("Parallel Configuration Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class SpeculativeConfig: @@ -729,10 +733,10 @@ def print(self): print all config """ - logger.info("Speculative Decoding Configuration Information :") + llm_logger.info("Speculative Decoding Configuration Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") def check_legality_parameters( self, @@ -752,7 +756,7 @@ def check_legality_parameters( if self.method in ["mtp", "hybrid_mtp_ngram"]: if self.num_speculative_tokens < self.num_model_steps: - logger.warning( + llm_logger.warning( f"Get num_model_steps > num_speculative_tokens. 
Reset num_speculative_tokens to {self.num_model_steps}" ) self.num_speculative_tokens = self.num_model_steps @@ -864,7 +868,7 @@ def init_with_cudagrpah_size(self, max_capture_size: int = 0) -> None: self.cudagraph_capture_sizes = [size for size in self.cudagraph_capture_sizes if size <= max_capture_size] dedup_sizes = list(set(self.cudagraph_capture_sizes)) if len(dedup_sizes) < len(self.cudagraph_capture_sizes): - logger.info( + llm_logger.info( ("cudagraph sizes specified by model runner" " %s is overridden by config %s"), self.cudagraph_capture_sizes, dedup_sizes, @@ -1298,7 +1302,7 @@ def postprocess(self, num_total_tokens, number_of_tasks): block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size self.total_block_num = block_num * number_of_tasks self.prefill_kvcache_block_num = self.total_block_num - logger.info(f"Doing profile, the total_block_num:{self.total_block_num}") + llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}") def reset(self, num_gpu_blocks): """ @@ -1309,23 +1313,26 @@ def reset(self, num_gpu_blocks): self.prefill_kvcache_block_num = self.total_block_num else: self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio) - logger.info( + llm_logger.info( f"Reset block num, the total_block_num:{self.total_block_num}," f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}" ) - assert ( - self.prefill_kvcache_block_num >= self.max_block_num_per_seq - ), f"current block number :{self.prefill_kvcache_block_num} should be greater than or equal to current model len needed minimum block number :{self.max_block_num_per_seq}" + if self.prefill_kvcache_block_num < self.max_block_num_per_seq: + available_cache_tokens = self.prefill_kvcache_block_num * self.block_size + console_logger.error( + f"The current KV Cache can only support caching {available_cache_tokens} tokens, which is less than the set max_model_len={self.model_cfg.max_model_len}. Please deploy this model under on GPUs with larger memory or reduce your `max_model_len` to {available_cache_tokens} or less." 
+ ) + raise RuntimeError("Resource is not sufficient.") def print(self): """ print all config """ - logger.info("Cache Configuration Information :") + llm_logger.info("Cache Configuration Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class RouterConfig: @@ -1388,19 +1395,19 @@ def _load_from_version_file(self, file_path: str = None): elif line.startswith("CXX compiler version:"): self.compiler_version = line.split(":")[1].strip() except FileNotFoundError: - logger.info(f"Warning: Version file not found at {file_path}") + llm_logger.info(f"Warning: Version file not found at {file_path}") except Exception as e: - logger.info(f"Warning: Could not read version file - {e!s}") + llm_logger.info(f"Warning: Could not read version file - {e!s}") def print(self): """ print all config """ - logger.info("Fasedeploy Commit Information :") + llm_logger.info("Fasedeploy Commit Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class StructuredOutputsConfig: @@ -1453,6 +1460,7 @@ def __init__( use_warmup: bool = False, limit_mm_per_prompt: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, + innode_prefill_ports: Optional[List[int]] = None, max_num_partial_prefills: int = 1, max_long_partial_prefills: int = 1, long_prefill_token_threshold: int = 0, @@ -1516,10 +1524,13 @@ def __init__( self.limit_mm_per_prompt = limit_mm_per_prompt self.mm_processor_kwargs = mm_processor_kwargs self.use_warmup = use_warmup + self.innode_prefill_ports = innode_prefill_ports self.max_num_partial_prefills = max_num_partial_prefills self.max_long_partial_prefills = max_long_partial_prefills self.long_prefill_token_threshold = long_prefill_token_threshold + self._str_to_list("innode_prefill_ports", int) + if envs.FD_FOR_TORCH_MODEL_FORMAT: self.model_config.model_format = "torch" @@ -1597,7 +1608,7 @@ def postprocess(self): and self.structured_outputs_config.guided_decoding_backend == "auto" ): if current_platform.is_xpu() or self.speculative_config.method is not None: - logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") + llm_logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") self.structured_outputs_config.guided_decoding_backend = "off" else: self.structured_outputs_config.guided_decoding_backend = "xgrammar" @@ -1607,7 +1618,7 @@ def postprocess(self): self.cache_config.max_encoder_cache = self.scheduler_config.max_num_batched_tokens elif self.cache_config.max_encoder_cache != 0: if self.cache_config.max_encoder_cache < self.scheduler_config.max_num_batched_tokens: - logger.warning( + llm_logger.warning( f"max_encoder_cache{self.cache_config.max_encoder_cache} is less than " f"max_num_batched_tokens{self.scheduler_config.max_num_batched_tokens}, " f"set to max_num_batched_tokens." 
@@ -1621,16 +1632,16 @@ def postprocess(self): self.graph_opt_config.use_cudagraph = self.graph_opt_config.cudagraph_only_prefill if self.load_config is not None and self.load_config.dynamic_load_weight is True: self.graph_opt_config.graph_opt_level = 0 - logger.info( + llm_logger.info( "Static Graph does not support to be started together with RL Training, and automatically switch to dynamic graph!" ) if self.device_config is not None and self.device_config.device_type != "cuda": self.graph_opt_config.use_cudagraph = False - logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!") + llm_logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!") if self.model_config.enable_mm and self.graph_opt_config.use_cudagraph: self.cache_config.enable_prefix_caching = False - logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!") + llm_logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!") if self.scheduler_config.splitwise_role == "mixed": self.model_config.moe_phase = MoEPhase(phase="prefill") @@ -1747,11 +1758,11 @@ def print(self): """ print all config """ - logger.info("=================== Configuration Information ===============") + llm_logger.info("=================== Configuration Information ===============") for k, v in self.__dict__.items(): if k == "generation_config" and v is not None: for gck, gcv in v.to_dict().items(): - logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) + llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) elif ( k == "cache_config" or k == "model_config" @@ -1762,22 +1773,30 @@ def print(self): if v is not None: v.print() else: - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") def init_cache_info(self): """ initialize cache info """ - # TODO: group the splitiwse params - # There are two methods for splitwise deployment: - # 1. v0 splitwise_scheduler or dp_scheduler - # 2. 
v1 local_scheduler + router + # TODO: group the splitiwse params, remove code of v0 + # v0 requires prefill and decode in one node and it uses local scheduler + # v1 supports prefill and decode in multi node and it uses splitwise or dp scheduler + # v2 supports prefill and decode in multi node and it uses router and local scheduler self.splitwise_version = None - if self.scheduler_config.name in ("splitwise", "dp"): + if self.scheduler_config.name == "local" and (self.router_config is None or self.router_config.router is None): self.splitwise_version = "v0" - elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router: + elif self.scheduler_config.name in ("splitwise", "dp"): self.splitwise_version = "v1" + elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router: + self.splitwise_version = "v2" + else: + raise ValueError( + f"Unsupported scheduler mode, scheduler_name: {self.scheduler_config.name}, " + f"router_config: {self.router_config}" + ) + llm_logger.info(f"splitwise_version: {self.splitwise_version}") if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)): engine_worker_queue_port = self.parallel_config.engine_worker_queue_port @@ -1807,7 +1826,7 @@ def init_cache_info(self): "port": connector_port, "rdma_port": self.cache_config.rdma_comm_ports, } - logger.info(f"disaggregate_info: {self.disaggregate_info}") + llm_logger.info(f"disaggregate_info: {self.disaggregate_info}") if self.router_config: self.register_info = { @@ -1820,7 +1839,7 @@ def init_cache_info(self): "device_ids": self.local_device_ids, "transfer_protocol": self.cache_config.cache_transfer_protocol.split(","), } - logger.info(f"register_info: {self.register_info}") + llm_logger.info(f"register_info: {self.register_info}") def read_from_config(self): """ @@ -1831,7 +1850,7 @@ def reset_value(cls, value_name, key): if hasattr(cls, key): value = getattr(cls, key) setattr(cls, value_name, value) - logger.info(f"Reset parameter {value_name} = {value} from configuration.") + llm_logger.info(f"Reset parameter {value_name} = {value} from configuration.") reset_value(self.cache_config, "block_size", "infer_model_block_size") reset_value( From 0c349ac5cafac9435adc4cd8ef973819d7a9586d Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun Date: Mon, 17 Nov 2025 15:02:45 +0800 Subject: [PATCH 2/5] [Polish] Polish some logger --- fastdeploy/scheduler/config.py | 10 ++++------ fastdeploy/splitwise/internal_adapter_utils.py | 16 ++++++++-------- fastdeploy/transformer_utils/config.py | 8 +++----- fastdeploy/worker/model_runner_base.py | 3 --- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/fastdeploy/scheduler/config.py b/fastdeploy/scheduler/config.py index 83ee476e467..894baf44611 100644 --- a/fastdeploy/scheduler/config.py +++ b/fastdeploy/scheduler/config.py @@ -16,9 +16,7 @@ import redis -from fastdeploy.utils import get_logger, llm_logger - -config_logger = get_logger("config", "config.log") +from fastdeploy.utils import llm_logger from .dp_scheduler import DPScheduler from .global_scheduler import GlobalScheduler @@ -86,10 +84,10 @@ def print(self): """ Print the current configuration to logs. 
""" - config_logger.info("LocalScheduler Configuration Information :") + llm_logger.info("LocalScheduler Configuration Information :") for k, v in self.__dict__.items(): - config_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - config_logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class DPLocalSchedulerConfig(LocalSchedulerConfig): diff --git a/fastdeploy/splitwise/internal_adapter_utils.py b/fastdeploy/splitwise/internal_adapter_utils.py index 30aa74d7ded..ba0b9aa901e 100644 --- a/fastdeploy/splitwise/internal_adapter_utils.py +++ b/fastdeploy/splitwise/internal_adapter_utils.py @@ -25,8 +25,6 @@ from fastdeploy.metrics.metrics import get_filtered_metrics, main_process_metrics from fastdeploy.utils import envs, get_logger -logger = get_logger("internal_adapter_utils", "internal_adapter_utils.log") - class InternalAdapter: def __init__(self, cfg, engine, dp_rank): @@ -40,6 +38,8 @@ def __init__(self, cfg, engine, dp_rank): target=self._recv_external_module_control_instruct, daemon=True ) self.recv_external_instruct_thread.start() + self.logger = get_logger("internal_adapter_utils", "internal_adapter_utils.log") + if cfg.scheduler_config.splitwise_role != "mixed": self.response_external_instruct_thread = threading.Thread( target=self._response_external_module_control_instruct, daemon=True @@ -78,12 +78,12 @@ def _recv_external_module_control_instruct(self): if task is None: time.sleep(0.001) continue - logger.info(f"dprank {self.dp_rank} Recieve control task: {task}") + self.logger.info(f"dprank {self.dp_rank} Recieve control task: {task}") task_id_str = task["task_id"] if task["cmd"] == "get_payload": payload_info = self._get_current_server_info() result = {"task_id": task_id_str, "result": payload_info} - logger.debug(f"Response for task: {task_id_str}") + self.logger.debug(f"Response for task: {task_id_str}") with self.response_lock: self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result) @@ -93,14 +93,14 @@ def _recv_external_module_control_instruct(self): extra_register_func=lambda reg: main_process_metrics.register_all(reg, workers=1), ) result = {"task_id": task_id_str, "result": metrics_text} - logger.debug(f"Response for task: {task_id_str}") + self.logger.debug(f"Response for task: {task_id_str}") with self.response_lock: self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result) elif task["cmd"] == "connect_rdma": self.engine.engine_worker_queue.put_connect_rdma_task(task) except Exception as e: - logger.error(f"handle_control_cmd got error: {e}, {traceback.format_exc()!s}") + self.logger.error(f"handle_control_cmd got error: {e}, {traceback.format_exc()!s}") def _response_external_module_control_instruct(self): while True: @@ -109,10 +109,10 @@ def _response_external_module_control_instruct(self): if result_data: task_id_str = result_data["task_id"] result = {"task_id": task_id_str, "result": result_data} - logger.info(f"Response for task: {task_id_str}") + self.logger.info(f"Response for task: {task_id_str}") with self.response_lock: self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result) else: time.sleep(0.001) except Exception as e: - logger.error(f"_handle_connect_rdma_results got error: {e}, {traceback.format_exc() !s}") + self.logger.error(f"_handle_connect_rdma_results got error: {e}, {traceback.format_exc() !s}") diff --git 
a/fastdeploy/transformer_utils/config.py b/fastdeploy/transformer_utils/config.py index 4eed4745849..576f7286dd1 100644 --- a/fastdeploy/transformer_utils/config.py +++ b/fastdeploy/transformer_utils/config.py @@ -12,9 +12,7 @@ RevisionNotFoundError, ) -from fastdeploy.utils import get_logger - -logger = get_logger("transformer_config", "transformer_config.log") +from fastdeploy.utils import llm_logger def file_or_path_exists(model, config_name): @@ -80,10 +78,10 @@ def get_hf_file_to_dict(file_name: str, model: Union[str, Path], revision: Optio except huggingface_hub.errors.OfflineModeIsEnabled: return None except (RepositoryNotFoundError, RevisionNotFoundError, EntryNotFoundError, LocalEntryNotFoundError) as e: - logger.debug("File or repository not found in hf_hub_download", e) + llm_logger.debug("File or repository not found in hf_hub_download", e) return None except HfHubHTTPError as e: - logger.warning( + llm_logger.warning( "Cannot connect to Hugging Face Hub. Skipping file " "download for '%s':", file_name, exc_info=e ) return None diff --git a/fastdeploy/worker/model_runner_base.py b/fastdeploy/worker/model_runner_base.py index 699182576b2..4d5f7b197b2 100644 --- a/fastdeploy/worker/model_runner_base.py +++ b/fastdeploy/worker/model_runner_base.py @@ -19,11 +19,8 @@ from paddle import nn from fastdeploy.config import FDConfig -from fastdeploy.utils import get_logger from fastdeploy.worker.output import ModelRunnerOutput -logger = get_logger("model_runner_base", "model_runner_base.log") - class ModelRunnerBase(ABC): """ From 0794f8e4d8520419e0371dfe378a4cdff6a15833 Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:06:26 +0800 Subject: [PATCH 3/5] Update fastdeploy/__init__.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- fastdeploy/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py index a13a37cde62..ec44a27ae41 100644 --- a/fastdeploy/__init__.py +++ b/fastdeploy/__init__.py @@ -33,7 +33,8 @@ if envs.FD_DEBUG != 1: import logging - pf_logger.logger.setLevel(logging.ERROR) + # Set paddleformers logger to WARNING to suppress INFO logs but still show warnings and errors. + pf_logger.logger.setLevel(logging.WARNING) import warnings warnings.filterwarnings("ignore", module="paddleformers") From d6568c033c59518c0c374cd830fdeba8c8fed06b Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:06:57 +0800 Subject: [PATCH 4/5] Update fastdeploy/config.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- fastdeploy/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 8c05b90b84b..3b5b7a946d4 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1320,7 +1320,7 @@ def reset(self, num_gpu_blocks): if self.prefill_kvcache_block_num < self.max_block_num_per_seq: available_cache_tokens = self.prefill_kvcache_block_num * self.block_size console_logger.error( - f"The current KV Cache can only support caching {available_cache_tokens} tokens, which is less than the set max_model_len={self.model_cfg.max_model_len}. Please deploy this model under on GPUs with larger memory or reduce your `max_model_len` to {available_cache_tokens} or less." 
+ f"The current KV Cache can only support caching {available_cache_tokens} tokens, which is less than the set max_model_len={self.model_cfg.max_model_len}. Please deploy this model on GPUs with larger memory or reduce your `max_model_len` to {available_cache_tokens} or less." ) raise RuntimeError("Resource is not sufficient.") From 07eb3cd21d9b8d7a828f8b043ba7693f9c4d0e16 Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:07:14 +0800 Subject: [PATCH 5/5] Update fastdeploy/config.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- fastdeploy/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 3b5b7a946d4..0ac86622e87 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1780,7 +1780,7 @@ def init_cache_info(self): """ initialize cache info """ - # TODO: group the splitiwse params, remove code of v0 + # TODO: group the splitwise params, remove code of v0 # v0 requires prefill and decode in one node and it uses local scheduler # v1 supports prefill and decode in multi node and it uses splitwise or dp scheduler # v2 supports prefill and decode in multi node and it uses router and local scheduler
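
Reviewer note: below is a minimal, self-contained sketch of the KV cache sufficiency check that patches 1, 4 and 5 polish, for readers who want to see the new failure path outside the diff context. The class is a simplified stand-in, not FastDeploy's actual CacheConfig; field names follow the hunks above, and the real code also logs the message through console_logger.error before raising, whereas the sketch folds it into the exception.

# Simplified stand-in for the patched CacheConfig.reset() check; not the real class.
class _CacheConfigSketch:
    def __init__(self, total_block_num, max_block_num_per_seq, block_size, max_model_len, kv_cache_ratio=0.75):
        self.total_block_num = total_block_num
        self.max_block_num_per_seq = max_block_num_per_seq
        self.block_size = block_size
        self.max_model_len = max_model_len
        self.kv_cache_ratio = kv_cache_ratio

    def check_kv_cache(self):
        # Mirrors the replacement of the old assert: compute how many tokens the
        # allocated prefill blocks can actually hold, then fail with a readable
        # message instead of a bare assertion error.
        prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
        if prefill_kvcache_block_num < self.max_block_num_per_seq:
            available_cache_tokens = prefill_kvcache_block_num * self.block_size
            raise RuntimeError(
                f"The current KV Cache can only support caching {available_cache_tokens} tokens, "
                f"which is less than the set max_model_len={self.max_model_len}. Please deploy this "
                f"model on GPUs with larger memory or reduce your `max_model_len` to "
                f"{available_cache_tokens} or less."
            )

# Example: 100 blocks of 64 tokens with kv_cache_ratio=0.75 gives 4800 cacheable tokens,
# so a model length needing 120 blocks (max_model_len=7680) triggers the new error.
if __name__ == "__main__":
    cfg = _CacheConfigSketch(total_block_num=100, max_block_num_per_seq=120, block_size=64, max_model_len=7680)
    try:
        cfg.check_kv_cache()
    except RuntimeError as e:
        print(e)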