PaddlePaddle
diff --git a/‎.github/workflows/_unit_test_coverage.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/_unit_test_coverage.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/remove-skip-ci-labels.yml‎
Lines changed: 53 additions & 0 deletions b/‎.github/workflows/remove-skip-ci-labels.yml‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎benchmarks/yaml/eb45-vl-28b-thinking-128k-wint8.yaml‎
Lines changed: 8 additions & 0 deletions b/‎benchmarks/yaml/eb45-vl-28b-thinking-128k-wint8.yaml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎benchmarks/yaml/eb45-vl-28b-thinking-32k-wint8.yaml‎
Lines changed: 8 additions & 0 deletions b/‎benchmarks/yaml/eb45-vl-28b-thinking-32k-wint8.yaml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎custom_ops/gpu_ops/helper.h‎
Lines changed: 5 additions & 0 deletions b/‎custom_ops/gpu_ops/helper.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎custom_ops/xpu_ops/build.sh‎
Lines changed: 24 additions & 4 deletions b/‎custom_ops/xpu_ops/build.sh‎
Lines changed: 24 additions & 4 deletions
diff --git a/‎fastdeploy/config.py‎
Lines changed: 2 additions & 0 deletions b/‎fastdeploy/config.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎fastdeploy/engine/args_utils.py‎
Lines changed: 11 additions & 0 deletions b/‎fastdeploy/engine/args_utils.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎fastdeploy/engine/common_engine.py‎
Lines changed: 27 additions & 50 deletions b/‎fastdeploy/engine/common_engine.py‎
Lines changed: 27 additions & 50 deletions
diff --git a/‎fastdeploy/engine/request.py‎
Lines changed: 4 additions & 0 deletions b/‎fastdeploy/engine/request.py‎
Lines changed: 4 additions & 0 deletions
@@ -43,6 +43,7 @@ jobs:
     runs-on: [self-hosted, GPU-h1z1-2Cards]
     timeout-minutes: 90
     needs: check_cov_skip
+    if: needs.check_cov_skip.outputs.can-skip != 'true'
     outputs:
       diff_cov_file_url: ${{ steps.cov_upload.outputs.diff_cov_file_url }}
       unittest_failed_url: ${{ steps.cov_upload.outputs.unittest_failed_url }}
@@ -319,7 +320,7 @@ jobs:
           echo "All tests passed"
 
       - name: Verify Code Coverage Threshold (80%)
-        if: ${{ github.event_name == 'pull_request' && (needs.check_cov_skip.outputs['can-skip'] != 'true') }}
+        if: ${{ github.event_name == 'pull_request' }}
         shell: bash
         run: |
           cd FastDeploy
 
@@ -0,0 +1,53 @@
+name: Remove Skip-CI Labels
+
+on:
+  pull_request_target:
+    types: [synchronize]
+
+permissions:
+  pull-requests: write
+
+jobs:
+  remove-skip-ci-labels:
+    name: Remove skip-ci labels on new commits
+    runs-on: ubuntu-latest
+    steps:
+      - name: Get PR labels
+        id: get-labels
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const { data: labels } = await github.rest.issues.listLabelsOnIssue({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number
+            });
+
+            const skipCiLabels = labels.map(label => label.name).filter(name => name.startsWith('skip-ci:'));
+
+            console.log('Found skip-ci labels:', skipCiLabels);
+            core.setOutput('skip-ci-labels', JSON.stringify(skipCiLabels));
+            core.setOutput('has-skip-ci-labels', skipCiLabels.length > 0 ? 'true' : 'false');
+
+      - name: Remove skip-ci labels
+        if: steps.get-labels.outputs.has-skip-ci-labels == 'true'
+        uses: actions/github-script@v7
+        env:
+          # Hand the JSON over via the environment: interpolating it into the script text
+          # lets a label name containing a quote break or rewrite the JavaScript source.
+          SKIP_CI_LABELS: ${{ steps.get-labels.outputs.skip-ci-labels }}
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const skipCiLabels = JSON.parse(process.env.SKIP_CI_LABELS);
+            for (const label of skipCiLabels) {
+              console.log(`Removing label: ${label}`);
+              await github.rest.issues.removeLabel({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                name: label
+              });
+            }
+            console.log(`Successfully removed ${skipCiLabels.length} skip-ci label(s)`);
@@ -0,0 +1,8 @@
+max_model_len: 131072
+tensor_parallel_size: 1
+quantization: wint8
+max_num_seqs: 32
+reasoning_parser: ernie-45-vl-thinking
+tool_call_parser: ernie-45-vl-thinking
+load_choices: "default_v1"
+mm-processor-kwargs: '{"image_max_pixels": 12845056 }'
@@ -0,0 +1,8 @@
+max_model_len: 32768
+tensor_parallel_size: 1
+quantization: wint8
+max_num_seqs: 32
+reasoning_parser: ernie-45-vl-thinking
+tool_call_parser: ernie-45-vl-thinking
+load_choices: "default_v1"
+mm-processor-kwargs: '{"image_max_pixels": 12845056 }'
@@ -725,6 +725,10 @@ inline void launchWithPdlWhenEnabled(KernelFn kernelFn,
                                      size_t dynamicShmSize,
                                      cudaStream_t stream,
                                      Args &&...args) {
+#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
+  (*kernelFn)<<<grid, block, dynamicShmSize, stream>>>(
+      std::forward<Args>(args)...);
+#else
   cudaLaunchConfig_t kernelConfig;
   kernelConfig.gridDim = grid;
   kernelConfig.blockDim = block;
@@ -738,5 +742,6 @@ inline void launchWithPdlWhenEnabled(KernelFn kernelFn,
   kernelConfig.numAttrs = 1;
 
   cudaLaunchKernelEx(&kernelConfig, kernelFn, std::forward<Args>(args)...);
+#endif
 }
 #endif
@@ -26,8 +26,28 @@ SYSTEM_VERSION=`${python} -c "import platform; print(platform.system().lower())"
 PROCESSOR_VERSION=`${python} -c "import platform; print(platform.processor())"`
 WHEEL_NAME="fastdeploy_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
 
+# Add compatibility for modern python packaging methods
+WHEEL_MODERN_NAME="fastdeploy_ops"
+
 ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
-mkdir -p ${OPS_TMP_DIR}/${WHEEL_NAME}/libs
-cp ${XVLLM_PATH}/xft_blocks/so/libxft_blocks.so ${OPS_TMP_DIR}/${WHEEL_NAME}/libs/
-cp ${XVLLM_PATH}/infer_ops/so/libapiinfer.so ${OPS_TMP_DIR}/${WHEEL_NAME}/libs/
-patchelf --set-rpath '$ORIGIN/libs' ${OPS_TMP_DIR}/${WHEEL_NAME}/fastdeploy_ops_pd_.so
+
+# Handle directory compatibility between modern and legacy naming:
+# prefer the modern directory and fall back to the legacy egg-style one.
+if [ -d "${OPS_TMP_DIR}/${WHEEL_MODERN_NAME}" ]; then
+    echo -e "${GREEN}[Info]${NONE} Ready to use ops from modern directory ${WHEEL_MODERN_NAME}"
+    TARGET_DIR="${OPS_TMP_DIR}/${WHEEL_MODERN_NAME}"
+elif [ -d "${OPS_TMP_DIR}/${WHEEL_NAME}" ]; then
+    echo -e "${YELLOW}[Warning]${NONE} ${WHEEL_NAME} directory exists. This is a deprecated packaging and distribution method."
+    TARGET_DIR="${OPS_TMP_DIR}/${WHEEL_NAME}"
+else
+    # Nothing was installed: stop here instead of creating an empty legacy
+    # directory and failing later in patchelf with a less helpful message.
+    echo -e "${RED}[Error]${NONE} Neither modern nor legacy directory found in ${OPS_TMP_DIR}"
+    exit 1
+fi
+
+# Bundle the runtime libraries next to the ops library and point its rpath at them.
+mkdir -p "${TARGET_DIR}/libs"
+cp "${XVLLM_PATH}/xft_blocks/so/libxft_blocks.so" "${TARGET_DIR}/libs/"
+cp "${XVLLM_PATH}/infer_ops/so/libapiinfer.so" "${TARGET_DIR}/libs/"
+patchelf --set-rpath '$ORIGIN/libs' "${TARGET_DIR}/fastdeploy_ops_pd_.so"
@@ -557,6 +557,8 @@ def __init__(
         self.use_internode_ll_two_stage: bool = False
         # disable sequence parallel moe
         self.disable_sequence_parallel_moe: bool = False
+        # enable async download features
+        self.enable_async_download_features: bool = False
 
         self.pod_ip: str = None
         # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
 
@@ -467,6 +467,11 @@ class EngineArgs:
     Url for router server, such as `0.0.0.0:30000`.
     """
 
+    enable_async_download_features: bool = False
+    """
+    Flag to enable async download features. Default is False (disabled).
+    """
+
     def __post_init__(self):
         """
         Post-initialization processing to set default tokenizer if not provided.
@@ -849,6 +854,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.enable_expert_parallel,
             help="Enable expert parallelism.",
         )
+        parallel_group.add_argument(
+            "--enable-async-download-features",
+            action="store_true",
+            default=EngineArgs.enable_async_download_features,
+            help="Enable async download features.",
+        )
 
         # Load group
         load_group = parser.add_argument_group("Load Configuration")
 
@@ -51,14 +51,7 @@
 from fastdeploy.splitwise.splitwise_connector import SplitwiseConnector
 from fastdeploy.trace.constants import LoggingEventName
 from fastdeploy.trace.trace_logger import print as trace_print
-from fastdeploy.utils import (
-    EngineError,
-    check_download_links,
-    envs,
-    get_logger,
-    init_bos_client,
-    llm_logger,
-)
+from fastdeploy.utils import EngineError, envs, get_logger, llm_logger
 
 try:
     TokenProcessor = load_token_processor_plugins()
@@ -808,7 +801,7 @@ def _fetch_request():
                             else:
                                 raise
                 # 2. Schedule requests
-                tasks = self.resource_manager.schedule()
+                tasks, error_tasks = self.resource_manager.schedule()
 
                 # 3. Send to engine
                 if tasks:
@@ -833,7 +826,16 @@ def _fetch_request():
                         trace_print(LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", ""))
                         trace_print(LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", ""))
                     self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz))
-                else:
+
+                # 4. Response error tasks
+                if error_tasks:
+                    for request_id, failed in error_tasks:
+                        if failed is None:
+                            llm_logger.warning(f"Request {request_id} has no error, skip sending error response.")
+                            continue
+                        self._send_error_response(request_id, failed)
+
+                if not tasks and not error_tasks:
                     time.sleep(0.005)
 
             except RuntimeError as e:
@@ -909,24 +911,6 @@ def _insert_zmq_task_to_scheduler(self):
                             self.llm_logger.error(f"Receive request error: {err_msg}")
                             results.append((request.request_id, err_msg))
 
-                    if self._has_features_info(request) and err_msg is None:
-                        if self.bos_client is None:
-                            self.bos_client = init_bos_client()
-
-                        download_urls = []
-                        inputs = request.multimodal_inputs
-                        if inputs.get("video_feature_urls") is not None:
-                            download_urls.extend(inputs.get("video_feature_urls"))
-                        if inputs.get("image_feature_urls") is not None:
-                            download_urls.extend(inputs.get("image_feature_urls"))
-                        if inputs.get("audio_feature_urls") is not None:
-                            download_urls.extend(inputs.get("audio_feature_urls"))
-
-                        err_msg = check_download_links(self.bos_client, download_urls)
-                        if err_msg:
-                            llm_logger.error(f"Receive request {request.request_id} download error: {err_msg}")
-                            results.append((request.request_id, err_msg))
-
                     if err_msg is None:
                         insert_task.append(request)
 
@@ -948,21 +932,27 @@ def _insert_zmq_task_to_scheduler(self):
                         main_process_metrics.num_requests_waiting.inc(1)
                         continue
 
-                    error_result = RequestOutput(
-                        request_id=request_id,
-                        finished=True,
-                        error_code=500,
-                        error_msg=failed,
-                    )
-                    # Since the request is not in scheduler
-                    # Send result by zmq directly
-                    self.send_response_server.send_response(request_id, [error_result])
+                    self._send_error_response(request_id, failed)
             except Exception as e:
                 self.llm_logger.error(
                     f"Error happened while receiving new request from zmq, details={e}, "
                     f"traceback={traceback.format_exc()}"
                 )
 
+    def _send_error_response(self, request_id, error_msg, error_code: int = 500):
+        """Log the error and send a finished RequestOutput carrying it to the client."""
+        llm_logger.error(
+            f"Send error response to client, request_id: {request_id}, error_msg: {error_msg}, error_code: {error_code}"
+        )
+        error_result = RequestOutput(
+            request_id=request_id,
+            finished=True,
+            error_code=error_code,
+            error_msg=error_msg,
+        )
+        # The request is not in the scheduler, so send the result by zmq directly.
+        self.send_response_server.send_response(request_id, [error_result])
+
     def _decode_token(self, token_ids, req_id, is_end):
         delta_text = ""
         if envs.FD_ENABLE_RETURN_TEXT:
@@ -977,19 +967,6 @@ def _decode_token(self, token_ids, req_id, is_end):
                 del self.data_processor.decode_status[req_id]
         return delta_text, token_ids
 
-    def _has_features_info(self, task):
-        inputs = task.multimodal_inputs
-        if inputs is None or len(inputs) == 0:
-            return False
-
-        if (
-            (inputs.get("video_feature_urls") is not None and len(inputs["video_feature_urls"]) > 0)
-            or (inputs.get("image_feature_urls") is not None and len(inputs["image_feature_urls"]) > 0)
-            or (inputs.get("audio_feature_urls") is not None and len(inputs["audio_feature_urls"]) > 0)
-        ):
-            return True
-        return False
-
     def _zmq_send_generated_tokens(self):
         """
         Recieve output for zmq
 
@@ -168,6 +168,10 @@ def __init__(
         # dp
         self.dp_rank = dp_rank
 
+        self.async_process_futures = []
+        self.error_message = None
+        self.error_code = None
+
     @classmethod
     def from_dict(cls, d: dict):
         data_processor_logger.debug(f"{d}")