From c036146598399f2a9d4e5d4bde3aaf26af647535 Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun Date: Mon, 17 Nov 2025 14:48:34 +0800 Subject: [PATCH 1/5] [Polish] Polish error message --- fastdeploy/__init__.py | 4 +- fastdeploy/config.py | 129 +++++++++++++++++++++++------------------ 2 files changed, 77 insertions(+), 56 deletions(-) diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py index 9c26737a199..a13a37cde62 100644 --- a/fastdeploy/__init__.py +++ b/fastdeploy/__init__.py @@ -33,8 +33,10 @@ if envs.FD_DEBUG != 1: import logging - pf_logger.logger.setLevel(logging.INFO) + pf_logger.logger.setLevel(logging.ERROR) + import warnings + warnings.filterwarnings("ignore", module="paddleformers") try: import use_triton_in_paddle diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 5ec3df934ac..8c05b90b84b 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -20,7 +20,7 @@ import os from dataclasses import field from enum import Enum -from typing import Any, Dict, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union import paddle import paddle.distributed as dist @@ -33,9 +33,13 @@ from fastdeploy.platforms import current_platform from fastdeploy.scheduler import SchedulerConfig from fastdeploy.transformer_utils.config import get_pooling_config -from fastdeploy.utils import ceil_div, check_unified_ckpt, get_host_ip, get_logger - -logger = get_logger("config", "config.log") +from fastdeploy.utils import ( + ceil_div, + check_unified_ckpt, + console_logger, + get_host_ip, + llm_logger, +) TaskOption = Literal["auto", "generate", "embedding", "embed"] @@ -314,9 +318,9 @@ def reset_config_value(key, value): if not hasattr(self, key.lower()): if os.getenv(key, None): value = eval(os.getenv(key)) - logger.info(f"Get parameter `{key}` = {value} from environment.") + llm_logger.info(f"Get parameter `{key}` = {value} from environment.") else: - logger.info(f"Parameter `{key}` will use default value {value}.") + llm_logger.info(f"Parameter `{key}` will use default value {value}.") setattr(self, key.lower(), value) reset_config_value("COMPRESSION_RATIO", 1.0) @@ -334,10 +338,10 @@ def read_model_config(self): ) elif "torch_dtype" in self.model_config: self.model_format = "torch" - logger.info("The model format is Hugging Face") + llm_logger.info("The model format is Hugging Face") elif "dtype" in self.model_config: self.model_format = "paddle" - logger.info("The model format is Paddle") + llm_logger.info("The model format is Paddle") else: raise ValueError( "Unknown model format. Please ensure your config.json contains " @@ -400,7 +404,7 @@ def _get_runner_type( runner_type = self._get_default_runner_type(architectures) if runner_type != "generate": - logger.info( + llm_logger.info( "Resolved `--runner auto` to `--runner %s`. " "Pass the value explicitly to silence this message.", runner_type, ) @@ -419,7 +423,7 @@ def _get_convert_type( convert_type = self._get_default_convert_type(architectures, runner_type) if convert_type != "none": - logger.info( + llm_logger.info( "Resolved `--convert auto` to `--convert %s`. " "Pass the value explicitly to silence this message.", convert_type, ) @@ -512,10 +516,10 @@ def print(self): """ Print all configuration information. 
""" - logger.info("Model Configuration Information :") + llm_logger.info("Model Configuration Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class ParallelConfig: @@ -559,7 +563,7 @@ def __init__( setattr(self, key, value) if isinstance(self.engine_worker_queue_port, str): self.engine_worker_queue_port = [int(port) for port in self.engine_worker_queue_port.split(",")] - logger.info(f"engine_worker_queue_port: {self.engine_worker_queue_port}") + llm_logger.info(f"engine_worker_queue_port: {self.engine_worker_queue_port}") elif isinstance(self.engine_worker_queue_port, int): self.engine_worker_queue_port = [self.engine_worker_queue_port] # currently, the expert parallel size is equal data parallel size @@ -586,7 +590,7 @@ def __init__( and self.expert_parallel_size > 1 and self.tensor_parallel_size > 1 ) - logger.info(f"use_sequence_parallel_moe: {self.use_sequence_parallel_moe}") + llm_logger.info(f"use_sequence_parallel_moe: {self.use_sequence_parallel_moe}") def set_communicate_group(self): # different tp group id @@ -606,7 +610,7 @@ def set_communicate_group(self): dist.collective._set_custom_gid(self.data_parallel_size + tp_gid_offset) self.ep_group = dist.new_group(range(self.expert_parallel_size)) dist.collective._set_custom_gid(None) - logger.info( + llm_logger.info( f"data_parallel_size: {self.data_parallel_size}, tensor_parallel_size: {self.tensor_parallel_size}, expert_parallel_size: {self.expert_parallel_size}, data_parallel_rank: {self.data_parallel_rank}, tensor_parallel_rank: {self.tensor_parallel_rank}, expert_parallel_rank: {self.expert_parallel_rank}, tp_group: {self.tp_group}." ) @@ -615,10 +619,10 @@ def print(self): print all config """ - logger.info("Parallel Configuration Information :") + llm_logger.info("Parallel Configuration Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class SpeculativeConfig: @@ -729,10 +733,10 @@ def print(self): print all config """ - logger.info("Speculative Decoding Configuration Information :") + llm_logger.info("Speculative Decoding Configuration Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") def check_legality_parameters( self, @@ -752,7 +756,7 @@ def check_legality_parameters( if self.method in ["mtp", "hybrid_mtp_ngram"]: if self.num_speculative_tokens < self.num_model_steps: - logger.warning( + llm_logger.warning( f"Get num_model_steps > num_speculative_tokens. 
Reset num_speculative_tokens to {self.num_model_steps}" ) self.num_speculative_tokens = self.num_model_steps @@ -864,7 +868,7 @@ def init_with_cudagrpah_size(self, max_capture_size: int = 0) -> None: self.cudagraph_capture_sizes = [size for size in self.cudagraph_capture_sizes if size <= max_capture_size] dedup_sizes = list(set(self.cudagraph_capture_sizes)) if len(dedup_sizes) < len(self.cudagraph_capture_sizes): - logger.info( + llm_logger.info( ("cudagraph sizes specified by model runner" " %s is overridden by config %s"), self.cudagraph_capture_sizes, dedup_sizes, @@ -1298,7 +1302,7 @@ def postprocess(self, num_total_tokens, number_of_tasks): block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size self.total_block_num = block_num * number_of_tasks self.prefill_kvcache_block_num = self.total_block_num - logger.info(f"Doing profile, the total_block_num:{self.total_block_num}") + llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}") def reset(self, num_gpu_blocks): """ @@ -1309,23 +1313,26 @@ def reset(self, num_gpu_blocks): self.prefill_kvcache_block_num = self.total_block_num else: self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio) - logger.info( + llm_logger.info( f"Reset block num, the total_block_num:{self.total_block_num}," f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}" ) - assert ( - self.prefill_kvcache_block_num >= self.max_block_num_per_seq - ), f"current block number :{self.prefill_kvcache_block_num} should be greater than or equal to current model len needed minimum block number :{self.max_block_num_per_seq}" + if self.prefill_kvcache_block_num < self.max_block_num_per_seq: + available_cache_tokens = self.prefill_kvcache_block_num * self.block_size + console_logger.error( + f"The current KV Cache can only support caching {available_cache_tokens} tokens, which is less than the set max_model_len={self.model_cfg.max_model_len}. Please deploy this model under on GPUs with larger memory or reduce your `max_model_len` to {available_cache_tokens} or less." 
+ ) + raise RuntimeError("Resource is not sufficient.") def print(self): """ print all config """ - logger.info("Cache Configuration Information :") + llm_logger.info("Cache Configuration Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class RouterConfig: @@ -1388,19 +1395,19 @@ def _load_from_version_file(self, file_path: str = None): elif line.startswith("CXX compiler version:"): self.compiler_version = line.split(":")[1].strip() except FileNotFoundError: - logger.info(f"Warning: Version file not found at {file_path}") + llm_logger.info(f"Warning: Version file not found at {file_path}") except Exception as e: - logger.info(f"Warning: Could not read version file - {e!s}") + llm_logger.info(f"Warning: Could not read version file - {e!s}") def print(self): """ print all config """ - logger.info("Fasedeploy Commit Information :") + llm_logger.info("Fasedeploy Commit Information :") for k, v in self.__dict__.items(): - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class StructuredOutputsConfig: @@ -1453,6 +1460,7 @@ def __init__( use_warmup: bool = False, limit_mm_per_prompt: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, + innode_prefill_ports: Optional[List[int]] = None, max_num_partial_prefills: int = 1, max_long_partial_prefills: int = 1, long_prefill_token_threshold: int = 0, @@ -1516,10 +1524,13 @@ def __init__( self.limit_mm_per_prompt = limit_mm_per_prompt self.mm_processor_kwargs = mm_processor_kwargs self.use_warmup = use_warmup + self.innode_prefill_ports = innode_prefill_ports self.max_num_partial_prefills = max_num_partial_prefills self.max_long_partial_prefills = max_long_partial_prefills self.long_prefill_token_threshold = long_prefill_token_threshold + self._str_to_list("innode_prefill_ports", int) + if envs.FD_FOR_TORCH_MODEL_FORMAT: self.model_config.model_format = "torch" @@ -1597,7 +1608,7 @@ def postprocess(self): and self.structured_outputs_config.guided_decoding_backend == "auto" ): if current_platform.is_xpu() or self.speculative_config.method is not None: - logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") + llm_logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.") self.structured_outputs_config.guided_decoding_backend = "off" else: self.structured_outputs_config.guided_decoding_backend = "xgrammar" @@ -1607,7 +1618,7 @@ def postprocess(self): self.cache_config.max_encoder_cache = self.scheduler_config.max_num_batched_tokens elif self.cache_config.max_encoder_cache != 0: if self.cache_config.max_encoder_cache < self.scheduler_config.max_num_batched_tokens: - logger.warning( + llm_logger.warning( f"max_encoder_cache{self.cache_config.max_encoder_cache} is less than " f"max_num_batched_tokens{self.scheduler_config.max_num_batched_tokens}, " f"set to max_num_batched_tokens." 
@@ -1621,16 +1632,16 @@ def postprocess(self): self.graph_opt_config.use_cudagraph = self.graph_opt_config.cudagraph_only_prefill if self.load_config is not None and self.load_config.dynamic_load_weight is True: self.graph_opt_config.graph_opt_level = 0 - logger.info( + llm_logger.info( "Static Graph does not support to be started together with RL Training, and automatically switch to dynamic graph!" ) if self.device_config is not None and self.device_config.device_type != "cuda": self.graph_opt_config.use_cudagraph = False - logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!") + llm_logger.info(f"CUDAGraph only support on GPU, current device type is {self.device_config.device_type}!") if self.model_config.enable_mm and self.graph_opt_config.use_cudagraph: self.cache_config.enable_prefix_caching = False - logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!") + llm_logger.info("Multi-modal models do not support prefix caching when using CUDAGraph!") if self.scheduler_config.splitwise_role == "mixed": self.model_config.moe_phase = MoEPhase(phase="prefill") @@ -1747,11 +1758,11 @@ def print(self): """ print all config """ - logger.info("=================== Configuration Information ===============") + llm_logger.info("=================== Configuration Information ===============") for k, v in self.__dict__.items(): if k == "generation_config" and v is not None: for gck, gcv in v.to_dict().items(): - logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) + llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) elif ( k == "cache_config" or k == "model_config" @@ -1762,22 +1773,30 @@ def print(self): if v is not None: v.print() else: - logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") def init_cache_info(self): """ initialize cache info """ - # TODO: group the splitiwse params - # There are two methods for splitwise deployment: - # 1. v0 splitwise_scheduler or dp_scheduler - # 2. 
v1 local_scheduler + router + # TODO: group the splitiwse params, remove code of v0 + # v0 requires prefill and decode in one node and it uses local scheduler + # v1 supports prefill and decode in multi node and it uses splitwise or dp scheduler + # v2 supports prefill and decode in multi node and it uses router and local scheduler self.splitwise_version = None - if self.scheduler_config.name in ("splitwise", "dp"): + if self.scheduler_config.name == "local" and (self.router_config is None or self.router_config.router is None): self.splitwise_version = "v0" - elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router: + elif self.scheduler_config.name in ("splitwise", "dp"): self.splitwise_version = "v1" + elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router: + self.splitwise_version = "v2" + else: + raise ValueError( + f"Unsupported scheduler mode, scheduler_name: {self.scheduler_config.name}, " + f"router_config: {self.router_config}" + ) + llm_logger.info(f"splitwise_version: {self.splitwise_version}") if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)): engine_worker_queue_port = self.parallel_config.engine_worker_queue_port @@ -1807,7 +1826,7 @@ def init_cache_info(self): "port": connector_port, "rdma_port": self.cache_config.rdma_comm_ports, } - logger.info(f"disaggregate_info: {self.disaggregate_info}") + llm_logger.info(f"disaggregate_info: {self.disaggregate_info}") if self.router_config: self.register_info = { @@ -1820,7 +1839,7 @@ def init_cache_info(self): "device_ids": self.local_device_ids, "transfer_protocol": self.cache_config.cache_transfer_protocol.split(","), } - logger.info(f"register_info: {self.register_info}") + llm_logger.info(f"register_info: {self.register_info}") def read_from_config(self): """ @@ -1831,7 +1850,7 @@ def reset_value(cls, value_name, key): if hasattr(cls, key): value = getattr(cls, key) setattr(cls, value_name, value) - logger.info(f"Reset parameter {value_name} = {value} from configuration.") + llm_logger.info(f"Reset parameter {value_name} = {value} from configuration.") reset_value(self.cache_config, "block_size", "infer_model_block_size") reset_value( From 0c349ac5cafac9435adc4cd8ef973819d7a9586d Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun Date: Mon, 17 Nov 2025 15:02:45 +0800 Subject: [PATCH 2/5] [Polish] Polish some logger --- fastdeploy/scheduler/config.py | 10 ++++------ fastdeploy/splitwise/internal_adapter_utils.py | 16 ++++++++-------- fastdeploy/transformer_utils/config.py | 8 +++----- fastdeploy/worker/model_runner_base.py | 3 --- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/fastdeploy/scheduler/config.py b/fastdeploy/scheduler/config.py index 83ee476e467..894baf44611 100644 --- a/fastdeploy/scheduler/config.py +++ b/fastdeploy/scheduler/config.py @@ -16,9 +16,7 @@ import redis -from fastdeploy.utils import get_logger, llm_logger - -config_logger = get_logger("config", "config.log") +from fastdeploy.utils import llm_logger from .dp_scheduler import DPScheduler from .global_scheduler import GlobalScheduler @@ -86,10 +84,10 @@ def print(self): """ Print the current configuration to logs. 
""" - config_logger.info("LocalScheduler Configuration Information :") + llm_logger.info("LocalScheduler Configuration Information :") for k, v in self.__dict__.items(): - config_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - config_logger.info("=============================================================") + llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) + llm_logger.info("=============================================================") class DPLocalSchedulerConfig(LocalSchedulerConfig): diff --git a/fastdeploy/splitwise/internal_adapter_utils.py b/fastdeploy/splitwise/internal_adapter_utils.py index 30aa74d7ded..ba0b9aa901e 100644 --- a/fastdeploy/splitwise/internal_adapter_utils.py +++ b/fastdeploy/splitwise/internal_adapter_utils.py @@ -25,8 +25,6 @@ from fastdeploy.metrics.metrics import get_filtered_metrics, main_process_metrics from fastdeploy.utils import envs, get_logger -logger = get_logger("internal_adapter_utils", "internal_adapter_utils.log") - class InternalAdapter: def __init__(self, cfg, engine, dp_rank): @@ -40,6 +38,8 @@ def __init__(self, cfg, engine, dp_rank): target=self._recv_external_module_control_instruct, daemon=True ) self.recv_external_instruct_thread.start() + self.logger = get_logger("internal_adapter_utils", "internal_adapter_utils.log") + if cfg.scheduler_config.splitwise_role != "mixed": self.response_external_instruct_thread = threading.Thread( target=self._response_external_module_control_instruct, daemon=True @@ -78,12 +78,12 @@ def _recv_external_module_control_instruct(self): if task is None: time.sleep(0.001) continue - logger.info(f"dprank {self.dp_rank} Recieve control task: {task}") + self.logger.info(f"dprank {self.dp_rank} Recieve control task: {task}") task_id_str = task["task_id"] if task["cmd"] == "get_payload": payload_info = self._get_current_server_info() result = {"task_id": task_id_str, "result": payload_info} - logger.debug(f"Response for task: {task_id_str}") + self.logger.debug(f"Response for task: {task_id_str}") with self.response_lock: self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result) @@ -93,14 +93,14 @@ def _recv_external_module_control_instruct(self): extra_register_func=lambda reg: main_process_metrics.register_all(reg, workers=1), ) result = {"task_id": task_id_str, "result": metrics_text} - logger.debug(f"Response for task: {task_id_str}") + self.logger.debug(f"Response for task: {task_id_str}") with self.response_lock: self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result) elif task["cmd"] == "connect_rdma": self.engine.engine_worker_queue.put_connect_rdma_task(task) except Exception as e: - logger.error(f"handle_control_cmd got error: {e}, {traceback.format_exc()!s}") + self.logger.error(f"handle_control_cmd got error: {e}, {traceback.format_exc()!s}") def _response_external_module_control_instruct(self): while True: @@ -109,10 +109,10 @@ def _response_external_module_control_instruct(self): if result_data: task_id_str = result_data["task_id"] result = {"task_id": task_id_str, "result": result_data} - logger.info(f"Response for task: {task_id_str}") + self.logger.info(f"Response for task: {task_id_str}") with self.response_lock: self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result) else: time.sleep(0.001) except Exception as e: - logger.error(f"_handle_connect_rdma_results got error: {e}, {traceback.format_exc() !s}") + self.logger.error(f"_handle_connect_rdma_results got error: {e}, {traceback.format_exc() !s}") diff --git 
a/fastdeploy/transformer_utils/config.py b/fastdeploy/transformer_utils/config.py index 4eed4745849..576f7286dd1 100644 --- a/fastdeploy/transformer_utils/config.py +++ b/fastdeploy/transformer_utils/config.py @@ -12,9 +12,7 @@ RevisionNotFoundError, ) -from fastdeploy.utils import get_logger - -logger = get_logger("transformer_config", "transformer_config.log") +from fastdeploy.utils import llm_logger def file_or_path_exists(model, config_name): @@ -80,10 +78,10 @@ def get_hf_file_to_dict(file_name: str, model: Union[str, Path], revision: Optio except huggingface_hub.errors.OfflineModeIsEnabled: return None except (RepositoryNotFoundError, RevisionNotFoundError, EntryNotFoundError, LocalEntryNotFoundError) as e: - logger.debug("File or repository not found in hf_hub_download", e) + llm_logger.debug("File or repository not found in hf_hub_download", e) return None except HfHubHTTPError as e: - logger.warning( + llm_logger.warning( "Cannot connect to Hugging Face Hub. Skipping file " "download for '%s':", file_name, exc_info=e ) return None diff --git a/fastdeploy/worker/model_runner_base.py b/fastdeploy/worker/model_runner_base.py index 699182576b2..4d5f7b197b2 100644 --- a/fastdeploy/worker/model_runner_base.py +++ b/fastdeploy/worker/model_runner_base.py @@ -19,11 +19,8 @@ from paddle import nn from fastdeploy.config import FDConfig -from fastdeploy.utils import get_logger from fastdeploy.worker.output import ModelRunnerOutput -logger = get_logger("model_runner_base", "model_runner_base.log") - class ModelRunnerBase(ABC): """ From 0794f8e4d8520419e0371dfe378a4cdff6a15833 Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:06:26 +0800 Subject: [PATCH 3/5] Update fastdeploy/__init__.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- fastdeploy/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py index a13a37cde62..ec44a27ae41 100644 --- a/fastdeploy/__init__.py +++ b/fastdeploy/__init__.py @@ -33,7 +33,8 @@ if envs.FD_DEBUG != 1: import logging - pf_logger.logger.setLevel(logging.ERROR) + # Set paddleformers logger to WARNING to suppress INFO logs but still show warnings and errors. + pf_logger.logger.setLevel(logging.WARNING) import warnings warnings.filterwarnings("ignore", module="paddleformers") From d6568c033c59518c0c374cd830fdeba8c8fed06b Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:06:57 +0800 Subject: [PATCH 4/5] Update fastdeploy/config.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- fastdeploy/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 8c05b90b84b..3b5b7a946d4 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1320,7 +1320,7 @@ def reset(self, num_gpu_blocks): if self.prefill_kvcache_block_num < self.max_block_num_per_seq: available_cache_tokens = self.prefill_kvcache_block_num * self.block_size console_logger.error( - f"The current KV Cache can only support caching {available_cache_tokens} tokens, which is less than the set max_model_len={self.model_cfg.max_model_len}. Please deploy this model under on GPUs with larger memory or reduce your `max_model_len` to {available_cache_tokens} or less." 
+ f"The current KV Cache can only support caching {available_cache_tokens} tokens, which is less than the set max_model_len={self.model_cfg.max_model_len}. Please deploy this model on GPUs with larger memory or reduce your `max_model_len` to {available_cache_tokens} or less." ) raise RuntimeError("Resource is not sufficient.") From 07eb3cd21d9b8d7a828f8b043ba7693f9c4d0e16 Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:07:14 +0800 Subject: [PATCH 5/5] Update fastdeploy/config.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- fastdeploy/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 3b5b7a946d4..0ac86622e87 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1780,7 +1780,7 @@ def init_cache_info(self): """ initialize cache info """ - # TODO: group the splitiwse params, remove code of v0 + # TODO: group the splitwise params, remove code of v0 # v0 requires prefill and decode in one node and it uses local scheduler # v1 supports prefill and decode in multi node and it uses splitwise or dp scheduler # v2 supports prefill and decode in multi node and it uses router and local scheduler
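
Reviewer note: below is a minimal, self-contained sketch of the KV cache sufficiency check that patches 1, 4 and 5 polish, for readers who want to see the new failure path outside the diff context. The class is a simplified stand-in, not FastDeploy's actual CacheConfig; field names follow the hunks above, and the real code also logs the message through console_logger.error before raising, whereas the sketch folds it into the exception.

# Simplified stand-in for the patched CacheConfig.reset() check; not the real class.
class _CacheConfigSketch:
    def __init__(self, total_block_num, max_block_num_per_seq, block_size, max_model_len, kv_cache_ratio=0.75):
        self.total_block_num = total_block_num
        self.max_block_num_per_seq = max_block_num_per_seq
        self.block_size = block_size
        self.max_model_len = max_model_len
        self.kv_cache_ratio = kv_cache_ratio

    def check_kv_cache(self):
        # Mirrors the replacement of the old assert: compute how many tokens the
        # allocated prefill blocks can actually hold, then fail with a readable
        # message instead of a bare assertion error.
        prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
        if prefill_kvcache_block_num < self.max_block_num_per_seq:
            available_cache_tokens = prefill_kvcache_block_num * self.block_size
            raise RuntimeError(
                f"The current KV Cache can only support caching {available_cache_tokens} tokens, "
                f"which is less than the set max_model_len={self.max_model_len}. Please deploy this "
                f"model on GPUs with larger memory or reduce your `max_model_len` to "
                f"{available_cache_tokens} or less."
            )

# Example: 100 blocks of 64 tokens with kv_cache_ratio=0.75 gives 4800 cacheable tokens,
# so a model length needing 120 blocks (max_model_len=7680) triggers the new error.
if __name__ == "__main__":
    cfg = _CacheConfigSketch(total_block_num=100, max_block_num_per_seq=120, block_size=64, max_model_len=7680)
    try:
        cfg.check_kv_cache()
    except RuntimeError as e:
        print(e)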