
Commit 7f06d60

Merge branch 'develop' into new_add_prompt_logprobs_online
2 parents b948a52 + 5c8c2d4 commit 7f06d60


51 files changed: +4856 −625 lines changed

.github/workflows/ci_xpu.yml

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ concurrency:
 
 jobs:
   CI_XPU:
+    timeout-minutes: 60
     runs-on: [self-hosted, XPU-P800-8Card]
     steps:
       - name: Print current runner name

custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc

Lines changed: 1 addition & 1 deletion
@@ -441,7 +441,7 @@ std::vector<paddle::Tensor> MoeExpertFFN(
     const std::string& quant_method,
     const int hadamard_blocksize,
     const int valid_token_num) {
-  if (ffn_in.numel() == 0) {
+  if (ffn_in.numel() == 0 || valid_token_num == 0) {
     paddle::Tensor ffn2_out =
         paddle::empty_like(ffn_in, paddle::DataType::BFLOAT16);
     return {ffn2_out};
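The change extends the early-exit guard so the kernel also returns an empty output when valid_token_num is 0, not only when the input tensor has no elements. A minimal Python sketch of the same guard (an illustrative stand-in, not the C++ kernel; the real FFN computation is omitted):

    import paddle

    def moe_expert_ffn_sketch(ffn_in: paddle.Tensor, valid_token_num: int) -> paddle.Tensor:
        # Early exit mirrors the C++ guard: nothing to compute if the input is
        # empty or if no tokens are routed to this expert group.
        if int(ffn_in.numel()) == 0 or valid_token_num == 0:
            return paddle.empty_like(ffn_in, dtype=paddle.bfloat16)
        raise NotImplementedError("real expert FFN computation omitted in this sketch")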

fastdeploy/cache_manager/ops.py

Lines changed: 93 additions & 63 deletions
@@ -1,77 +1,107 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
 import paddle
 
 from fastdeploy.platforms import current_platform
 
-if current_platform.is_cuda():
-    from fastdeploy.model_executor.ops.gpu import (
-        cuda_host_alloc,
-        cuda_host_free,
-        get_data_ptr_ipc,
-        get_output_kv_signal,
-        ipc_sent_key_value_cache_by_remote_ptr,
-        ipc_sent_key_value_cache_by_remote_ptr_block_sync,
-        set_data_ipc,
-        share_external_data,
-        swap_cache_all_layers,
-        unset_data_ipc,
-    )
-
-    memory_allocated = paddle.device.cuda.memory_allocated
-
-    def get_peer_mem_addr(*args, **kwargs):
-        raise RuntimeError("CUDA no need of get_peer_mem_addr!")
-
-elif current_platform.is_xpu():
-    from fastdeploy.model_executor.ops.xpu import (
-        cuda_host_alloc,
-        cuda_host_free,
-        get_output_kv_signal,
-        get_peer_mem_addr,
-        set_data_ipc,
-        share_external_data,
-        swap_cache_all_layers,
-    )
-
-    unset_data_ipc = None
-    memory_allocated = paddle.device.xpu.memory_allocated
-
-    def get_data_ptr_ipc(*args, **kwargs):
-        raise RuntimeError("XPU get_data_ptr_ipc UNIMPLENENTED!")
-
-    def ipc_sent_key_value_cache_by_remote_ptr(*args, **kwargs):
-        raise RuntimeError("XPU ipc_sent_key_value_cache_by_remote_ptr UNIMPLENENTED")
-
-    def ipc_sent_key_value_cache_by_remote_ptr_block_sync(*args, **kwargs):
-        raise RuntimeError("XPU No ipc_sent_key_value_cache_by_remote_ptr UNIMPLENENTED")
-
-else:
-    raise RuntimeError("Prefix cache ops only supported CUDA nor XPU platform ")
-
-
-def set_device(device):
+try:
     if current_platform.is_cuda():
-        paddle.set_device(f"gpu:{device}")
+        from fastdeploy.model_executor.ops.gpu import (
+            cuda_host_alloc,
+            cuda_host_free,
+            get_data_ptr_ipc,
+            get_output_kv_signal,
+            ipc_sent_key_value_cache_by_remote_ptr,
+            ipc_sent_key_value_cache_by_remote_ptr_block_sync,
+            set_data_ipc,
+            share_external_data,
+            swap_cache_all_layers,
+            unset_data_ipc,
+        )
+
+        memory_allocated = paddle.device.cuda.memory_allocated
+
+        def get_peer_mem_addr(*args, **kwargs):
+            raise RuntimeError("CUDA no need of get_peer_mem_addr!")
+
     elif current_platform.is_xpu():
-        paddle.set_device(f"xpu:{device}")
-    else:
-        raise RuntimeError("No supported platform")
+        from fastdeploy.model_executor.ops.xpu import (
+            cuda_host_alloc,
+            cuda_host_free,
+            get_output_kv_signal,
+            get_peer_mem_addr,
+            set_data_ipc,
+            share_external_data,
+            swap_cache_all_layers,
+        )
 
+        unset_data_ipc = None
+        memory_allocated = paddle.device.xpu.memory_allocated
 
-def share_external_data_(cache, cache_name, cache_shape, use_ipc):
-    if current_platform.is_cuda():
-        cache = share_external_data(cache, cache_name, cache_shape)
-    elif current_platform.is_xpu():
-        cache = share_external_data(cache, cache_name, cache_shape, use_ipc)
-    else:
-        raise RuntimeError("No supported platform")
-    return cache
+        def get_data_ptr_ipc(*args, **kwargs):
+            raise RuntimeError("XPU get_data_ptr_ipc UNIMPLENENTED!")
 
+        def ipc_sent_key_value_cache_by_remote_ptr(*args, **kwargs):
+            raise RuntimeError("XPU ipc_sent_key_value_cache_by_remote_ptr UNIMPLENENTED")
+
+        def ipc_sent_key_value_cache_by_remote_ptr_block_sync(*args, **kwargs):
+            raise RuntimeError("XPU No ipc_sent_key_value_cache_by_remote_ptr UNIMPLENENTED")
 
-def get_all_visible_devices():
-    if current_platform.is_xpu():
-        return "XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
     else:
-        return "CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
+        raise RuntimeError("Prefix cache ops only supported CUDA nor XPU platform ")
+
+    def set_device(device):
+        if current_platform.is_cuda():
+            paddle.set_device(f"gpu:{device}")
+        elif current_platform.is_xpu():
+            paddle.set_device(f"xpu:{device}")
+        else:
+            raise RuntimeError("No supported platform")
+
+    def share_external_data_(cache, cache_name, cache_shape, use_ipc):
+        if current_platform.is_cuda():
+            cache = share_external_data(cache, cache_name, cache_shape)
+        elif current_platform.is_xpu():
+            cache = share_external_data(cache, cache_name, cache_shape, use_ipc)
+        else:
+            raise RuntimeError("No supported platform")
+        return cache
+
+    def get_all_visible_devices():
+        if current_platform.is_xpu():
+            return "XPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
+        else:
+            return "CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
+
+except:
+    cuda_host_alloc = None
+    cuda_host_free = None
+    set_data_ipc = None
+    share_external_data_ = None
+    swap_cache_all_layers = None
+    unset_data_ipc = None
+    set_device = None
+    memory_allocated = None
+    get_output_kv_signal = None
+    get_data_ptr_ipc = None
+    ipc_sent_key_value_cache_by_remote_ptr = None
+    ipc_sent_key_value_cache_by_remote_ptr_block_sync = None
+    get_peer_mem_addr = None
+    get_all_visible_devices = None
 
 
 __all__ = [
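The whole platform-dispatch block is now wrapped in try/except, so importing fastdeploy.cache_manager.ops degrades gracefully (all op symbols become None) instead of raising when the platform-specific ops are unavailable. A minimal, self-contained sketch of that pattern, with a hypothetical ops module standing in for the real extension:

    # Hypothetical module name; stands in for fastdeploy.model_executor.ops.gpu / .xpu.
    try:
        from my_platform_ops import host_alloc, host_free  # assumed platform-specific ops
    except Exception:  # extension not built for this platform: degrade to None
        host_alloc = None
        host_free = None

    def allocate_pinned(nbytes: int):
        # Callers must handle the degraded case explicitly.
        if host_alloc is None:
            raise RuntimeError("pinned host allocation is unavailable on this platform")
        return host_alloc(nbytes)

Note that the committed code uses a bare except:, which also swallows KeyboardInterrupt and SystemExit; the sketch narrows it to Exception.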

fastdeploy/config.py

Lines changed: 48 additions & 18 deletions
@@ -180,12 +180,12 @@ def __init__(
     ):
         self.model = ""
         self.is_quantized = False
+        self.is_moe_quantized = False
         self.max_model_len = 0
         self.dtype = "bfloat16"
         self.enable_logprob = False
         self.max_logprobs = 20
         self.logprobs_mode = "raw_logprobs"
-        self.enable_redundant_experts = False
         self.redundant_experts_num = 0
         self.seed = 0
         self.quantization = None
@@ -1159,20 +1159,54 @@ class EPLBConfig:
 
     def __init__(
         self,
+        args,
     ):
-        self.enable_redundant_experts = envs.FD_ENABLE_REDUNDANT_EXPERTS
-        self.redundant_experts_num = envs.FD_REDUNDANT_EXPERTS_NUM
-        self.redundant_expert_ip_shm_size = envs.FD_REDUNDANT_EXPERT_IP_SHM_SIZE
-        self.redundant_expert_meta_dir = envs.FD_REDUNDANT_EXPERT_META_DIR
-        self.redundant_expert_api_user = envs.FD_REDUNDANT_EXPERT_API_USER
-        self.redundant_expert_api_password = envs.FD_REDUNDANT_EXPERT_API_PASSWORD
-        self.redundant_expert_eplb_strategy = envs.FD_REDUNDANT_EXPERT_EPLB_STRATEGY
-        self.redundant_expert_dump_workload_interval = envs.FD_REDUNDANT_EXPERT_DUMP_WORKLOAD_INTERVAL
-        self.redundant_expert_async_load_model_shmem_size_gb = envs.FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB
-        self.redundant_expert_enable_schedule_cordon = envs.FD_REDUNDANT_EXPERT_ENABLE_SCHEDULE_CORDON
-        self.model_use_safetensors = envs.FD_MODEL_USE_SAFETENSORS
-        self.model_use_offline_quant = envs.FD_MODEL_USE_OFFLINE_QUANT
-        self.moe_quant_type = envs.FD_MOE_QUANT_TYPE
+        if args is None:
+            args = {}
+
+        # enable eplb
+        self.enable_eplb: bool = False
+        # redundant experts num
+        self.redundant_experts_num: int = 0
+        # expert ip shm size
+        self.redundant_expert_ip_shm_size: int = 1024
+        # expert meta dir
+        self.redundant_expert_meta_dir: str = "/tmp/redundant_expert_meta"
+        # expert api user and password
+        self.redundant_expert_api_user: str = ""
+        self.redundant_expert_api_password: str = ""
+        # expert eplb strategy
+        self.redundant_expert_eplb_strategy: str = ""
+        # expert dump workload interval
+        self.redundant_expert_dump_workload_interval: int = 10
+        # expert async load model shmem size gb
+        self.redundant_expert_async_load_model_shmem_size_gb: int = 0
+        # expert enable schedule cordon
+        self.redundant_expert_enable_schedule_cordon: bool = True
+        # model use safetensors
+        self.model_use_safetensors: bool = True
+        # model use offline quant
+        self.model_use_offline_quant: bool = True
+        # moe quant type
+        self.moe_quant_type: str = "w4a8"
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+
+    def to_json_string(self):
+        """
+        Convert eplb_config to json string.
+        """
+        return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
+
+    def print(self):
+        """
+        Print all configuration information.
+        """
+        logger.info("EPLB Configuration Information :")
+        for k, v in self.__dict__.items():
+            logger.info("{:<20}:{:<6}{}".format(k, "", v))
+        logger.info("=============================================================")
 
 
 class CacheConfig:
@@ -1601,10 +1635,6 @@ def postprocess(self):
         else:
             self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len
 
-        self.scheduler_config.max_chunk_len = (
-            self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_extra_num_batched_tokens
-        )
-
         if self.long_prefill_token_threshold == 0:
            self.long_prefill_token_threshold = int(self.model_config.max_model_len * 0.04)
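EPLBConfig now takes a plain dict and overrides only the attributes it already defines, instead of reading FD_* environment variables. A standalone sketch of that defaults-plus-overrides-plus-serialize pattern (simplified to a subset of the fields from the diff; not the FastDeploy class itself):

    import json

    class EPLBConfigSketch:
        """Simplified stand-in for the defaults-plus-overrides pattern in EPLBConfig."""

        def __init__(self, args=None):
            if args is None:
                args = {}
            # Defaults mirror a subset of the fields added in the diff.
            self.enable_eplb: bool = False
            self.redundant_experts_num: int = 0
            self.moe_quant_type: str = "w4a8"
            # Known keys override defaults; unknown keys are silently ignored.
            for key, value in args.items():
                if hasattr(self, key):
                    setattr(self, key, value)

        def to_json_string(self) -> str:
            return json.dumps({k: v for k, v in self.__dict__.items() if v is not None})

    cfg = EPLBConfigSketch({"enable_eplb": True, "redundant_experts_num": 2, "unknown_key": 1})
    print(cfg.to_json_string())
    # {"enable_eplb": true, "redundant_experts_num": 2, "moe_quant_type": "w4a8"}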

fastdeploy/engine/args_utils.py

Lines changed: 36 additions & 4 deletions
@@ -467,6 +467,16 @@ class EngineArgs:
     Url for router server, such as `0.0.0.0:30000`.
     """
 
+    enable_eplb: bool = False
+    """
+    Flag to enable eplb
+    """
+
+    eplb_config: Optional[Dict[str, Any]] = None
+    """
+    Configuration for eplb.
+    """
+
     def __post_init__(self):
         """
         Post-initialization processing to set default tokenizer if not provided.
@@ -523,7 +533,7 @@ def __post_init__(self):
                 f"= {expected_ports}, but got {len(self.rdma_comm_ports)}."
             )
 
-        if not current_platform.is_cuda() and not current_platform.is_xpu():
+        if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_maca()):
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
        if self.guided_decoding_backend != "off":
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
@@ -850,6 +860,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.enable_expert_parallel,
             help="Enable expert parallelism.",
         )
+        parallel_group.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            default=EngineArgs.enable_eplb,
+            help="Enable eplb.",
+        )
+        parallel_group.add_argument(
+            "--eplb-config",
+            type=json.loads,
+            default=EngineArgs.eplb_config,
+            help="Config of eplb.",
+        )
 
         # Load group
         load_group = parser.add_argument_group("Load Configuration")
@@ -1126,7 +1148,7 @@ def create_speculative_config(self) -> SpeculativeConfig:
 
     def create_scheduler_config(self) -> SchedulerConfig:
         """
-        Create and retuan a SchedulerConfig object based on the current settings.
+        Create and return a SchedulerConfig object based on the current settings.
         """
         prefix = "scheduler_"
         prefix_len = len(prefix)
@@ -1173,13 +1195,22 @@ def create_early_stop_config(self) -> EarlyStopConfig:
                 early_stop_args[k] = v
         return EarlyStopConfig(early_stop_args)
 
+    def create_eplb_config(self) -> EPLBConfig:
+        """
+        Create and retuan an EPLBConfig object based on the current settings.
+        """
+        eplb_args = asdict(self)
+        if self.eplb_config is not None:
+            for k, v in self.eplb_config.items():
+                eplb_args[k] = v
+        eplb_args["enable_eplb"] = self.enable_eplb
+        return EPLBConfig(eplb_args)
+
     def create_engine_config(self, port_availability_check=True) -> FDConfig:
         """
         Create and return a Config object based on the current settings.
         """
         all_dict = asdict(self)
-        eplb_cfg = EPLBConfig()
-        all_dict["enable_redundant_experts"] = eplb_cfg.enable_redundant_experts
         model_cfg = ModelConfig(all_dict)
 
         # XPU currently disable prefix cache for VL model
@@ -1221,6 +1252,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
         scheduler_cfg = self.create_scheduler_config()
         graph_opt_cfg = self.create_graph_optimization_config()
         plas_attention_config = self.create_plas_attention_config()
+        eplb_cfg = self.create_eplb_config()
         router_config = RouterConfig(all_dict)
 
         early_stop_cfg = self.create_early_stop_config()
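The two new flags follow standard argparse semantics: --enable-eplb is a boolean switch and --eplb-config is parsed with type=json.loads. A small standalone sketch of that behavior, with FastDeploy's FlexibleArgumentParser simplified to plain argparse for illustration:

    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument("--enable-eplb", action="store_true", default=False, help="Enable eplb.")
    parser.add_argument("--eplb-config", type=json.loads, default=None, help="Config of eplb.")

    # e.g. --eplb-config '{"redundant_experts_num": 2, "moe_quant_type": "w4a8"}'
    args = parser.parse_args(["--enable-eplb", "--eplb-config", '{"redundant_experts_num": 2}'])
    print(args.enable_eplb)   # True
    print(args.eplb_config)   # {'redundant_experts_num': 2}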

fastdeploy/engine/async_llm.py

Lines changed: 1 addition & 0 deletions
@@ -833,6 +833,7 @@ def _start_worker_service(self):
             f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
             f" --logprobs_mode {self.cfg.model_config.logprobs_mode}"
             f" --max_logprobs {self.cfg.model_config.max_logprobs}"
+            f" --eplb_config '{self.cfg.eplb_config.to_json_string()}'"
         )
 
         worker_store_true_flag = {
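The worker launch command therefore carries the EPLB settings as a single-quoted JSON blob. A sketch of the serialize/quote/parse round trip (the shlex and json steps here are illustrative; this is not FastDeploy's worker-side parsing code):

    import json
    import shlex

    # Engine side: serialize the config (mirrors eplb_config.to_json_string() in the diff).
    eplb_json = json.dumps({"enable_eplb": True, "redundant_experts_num": 2})
    cmd_fragment = f" --eplb_config '{eplb_json}'"

    # Worker side: shell-style tokenization recovers the JSON string, json.loads the dict.
    tokens = shlex.split(cmd_fragment)       # ['--eplb_config', '{"enable_eplb": true, ...}']
    recovered = json.loads(tokens[1])
    print(recovered["redundant_experts_num"])  # 2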
