1 change: 1 addition & 0 deletions docs/requirements-docs.txt
@@ -7,3 +7,4 @@ sphinx-togglebutton
myst-parser
msgspec
sphinx-substitution-extensions
snowballstemmer<3.0.0
2 changes: 2 additions & 0 deletions docs/source/conf.py
@@ -72,6 +72,8 @@
# This value should be updated when cut down release.
'pip_vllm_ascend_version': "0.7.3rc2",
'pip_vllm_version': "0.7.3",
# The matching MindIE Turbo for vLLM Ascend
'pip_mindie_turbo_version': "2.0rc1",
# CANN image tag
'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10",
}
17 changes: 17 additions & 0 deletions docs/source/installation.md
@@ -195,6 +195,16 @@ The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed in `/v

:::::

## (Optional) Install MindIE Turbo

Install MindIE Turbo for performance acceleration:

```{code-block} bash
:substitutions:

pip install mindie_turbo==|pip_mindie_turbo_version|
```
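
After installation, a quick sanity check is to confirm that the package imports (this only verifies that the wheel is importable, not that acceleration is active):

```bash
python -c "import mindie_turbo"
```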

## Extra information

### Verify installation
@@ -254,3 +264,10 @@ Prompt: 'The president of the United States is', Generated text: ' a very import
Prompt: 'The capital of France is', Generated text: ' Paris. The oldest part of the city is Saint-Germain-des-Pr'
Prompt: 'The future of AI is', Generated text: ' not bright\n\nThere is no doubt that the evolution of AI will have a huge'
```
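
Output like the above typically comes from a small offline-inference script along these lines (a sketch only; the model and sampling settings are illustrative, not necessarily the ones used earlier in this guide):

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Illustrative settings; any model supported by vLLM Ascend works here.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16)
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```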

### Compile Enhancement

You can get more performance gains by optimizing Python and torch-npu with the Bisheng compiler. Please follow these official tutorials:

- [Optimizing Python with Bisheng](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0063.html)
- [Optimizing torch-npu with Bisheng](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0058.html)
15 changes: 14 additions & 1 deletion docs/source/quick_start.md
@@ -33,6 +33,15 @@ docker run --rm \

The default workdir is `/workspace`; vLLM and vLLM Ascend code are placed in `/vllm-workspace` and installed in [development mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) (`pip install -e`), so developers can pick up code changes immediately without reinstalling.

## (Optional) Install MindIE Turbo

Install MindIE Turbo for performance acceleration:

```{code-block} bash
:substitutions:

pip install mindie_turbo==|pip_mindie_turbo_version|
```

## Usage

You can use the ModelScope mirror to speed up downloads:
@@ -130,4 +139,8 @@ INFO: Application shutdown complete.

Finally, you can exit container by using `ctrl-D`.
::::
:::::

### Performance enhancement environment variables in MindIE Turbo

Currently, some performance enhancement features in MindIE Turbo are restricted to specific scenarios, so environment variables are used to control whether they are enabled. For the related environment variables, see the MindIE Turbo [official documentation](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-turbo-0010.html).
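
As an illustration of the pattern (the variable name below is hypothetical; substitute the real names from the documentation above):

```bash
# Hypothetical flag name, for illustration only -- see the MindIE Turbo docs for the real ones.
export MINDIE_TURBO_EXAMPLE_FEATURE=1
# ...then start vLLM as usual in the same shell.
```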
8 changes: 8 additions & 0 deletions docs/source/tutorials/multi_node.md
@@ -30,6 +30,14 @@ docker run --rm \
-it quay.io/ascend/vllm-ascend:|vllm_ascend_version| bash
```

(Optional) Install MindIE Turbo for performance acceleration:

```{code-block} bash
:substitutions:

pip install mindie_turbo==|pip_mindie_turbo_version|
```

Choose one machine as the head node and the others as worker nodes, then start Ray on each machine:

:::{note}
8 changes: 8 additions & 0 deletions docs/source/tutorials/multi_npu.md
@@ -27,6 +27,14 @@ docker run --rm \
-it $IMAGE bash
```

(Optional) Install MindIE Turbo for performance acceleration:

```{code-block} bash
:substitutions:

pip install mindie_turbo==|pip_mindie_turbo_version|
```

Setup environment variables:

```bash
23 changes: 22 additions & 1 deletion docs/source/tutorials/single_npu.md
@@ -26,6 +26,14 @@ docker run --rm \
-it $IMAGE bash
```

(Optional) Install MindIE Turbo for performance acceleration:

```{code-block} bash
:substitutions:

pip install mindie_turbo==|pip_mindie_turbo_version|
```

Setup environment variables:

```bash
@@ -90,7 +98,20 @@ docker run --rm \
-p 8000:8000 \
-e VLLM_USE_MODELSCOPE=True \
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
-it $IMAGE \
-it $IMAGE bash
```

(Optional) Install MindIE Turbo for performance acceleration:

```{code-block} bash
:substitutions:

pip install mindie_turbo==|pip_mindie_turbo_version|
```

Run the following command to start the vLLM server:

```bash
vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
```
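
Once the server is up, you can exercise the OpenAI-compatible endpoint from another shell (a minimal check; adjust the prompt and token budget as needed):

```bash
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-7B-Instruct",
        "prompt": "The future of AI is",
        "max_tokens": 64
    }'
```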

23 changes: 22 additions & 1 deletion docs/source/tutorials/single_npu_multimodal.md
@@ -26,6 +26,14 @@ docker run --rm \
-it $IMAGE bash
```

(Optional) Install MindIE Turbo for performance acceleration:

```{code-block} bash
:substitutions:

pip install mindie_turbo==|pip_mindie_turbo_version|
```

Setup environment variables:

```bash
@@ -143,7 +151,20 @@ docker run --rm \
-p 8000:8000 \
-e VLLM_USE_MODELSCOPE=True \
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
-it $IMAGE \
-it $IMAGE bash
```

(Optional) Install MindIE Turbo for performance acceleration:

```{code-block} bash
:substitutions:

pip install mindie_turbo==|pip_mindie_turbo_version|
```

Run the following command to start the vLLM server:

```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct --dtype bfloat16 --max_model_len 16384 --max-num-batched-tokens 16384
```
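
Once the server is up, you can send a multimodal request through the OpenAI-compatible chat endpoint (a sketch; the image URL is a placeholder):

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-VL-7B-Instruct",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/demo.jpg"}},
                {"type": "text", "text": "Describe this image."}
            ]
        }],
        "max_tokens": 64
    }'
```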

55 changes: 36 additions & 19 deletions docs/source/user_guide/suppoted_features.md
@@ -1,21 +1,38 @@
# Feature Support

| Feature | Supported | CI Coverage | Guidance Document | Current Status | Next Step |
|--------------------------|-----------|-------------|-------------------|---------------------------|--------------------|
| Chunked Prefill | ❌ | | | NA | Rely on CANN 8.1 NNAL package release |
| Automatic Prefix Caching | ✅ | | | Basic functions available | Rely on CANN 8.1 NNAL package release |
| LoRA | ❌ | | | NA | Plan in 2025.06.30 |
| Prompt adapter | ❌ | | | NA | Plan in 2025.06.30 |
| Speculative decoding | ✅ | | | Basic functions available | Need fully test |
| Pooling | ✅ | | | Basic functions available(Bert) | Need fully test and add more models support|
| Enc-dec | ❌ | | | NA | Plan in 2025.06.30|
| Multi Modality | ✅ | | ✅ | Basic functions available(LLaVA/Qwen2-vl/Qwen2-audio/internVL)| Improve perforamance, and add more models support |
| LogProbs | ✅ | | | Basic functions available | Need fully test |
| Prompt logProbs | ✅ | | | Basic functions available | Need fully test |
| Async output | ✅ | | | Basic functions available | Need fully test |
| Multi step scheduler | ✅ | | | Basic functions available | Need fully test, Find more details at [<u> Blog </u>](https://blog.vllm.ai/2024/09/05/perf-update.html#batch-scheduling-multiple-steps-ahead-pr-7000), [<u> RFC </u>](https://github.com/vllm-project/vllm/issues/6854) and [<u>issue</u>](https://github.com/vllm-project/vllm/pull/7000) |
| Best of | ✅ | | | Basic functions available | Need fully test |
| Beam search | ✅ | | | Basic functions available | Need fully test |
| Guided Decoding | ✅ | | | Basic functions available | Find more details at the [<u>issue</u>](https://github.com/vllm-project/vllm-ascend/issues/177) |
| Tensor Parallel | ✅ | | | Basic functions available | Need fully test |
| Pipeline Parallel | ✅ | | | Basic functions available | Need fully test |
The feature support principle of vLLM Ascend is: **aligned with vLLM**. We are also actively collaborating with the community to accelerate support.

vLLM Ascend provides functional support for most vLLM features, and usage is the same as in vLLM except for some limitations.

```{note}
MindIE Turbo is an optional performance optimization plugin. Find more information about the feature support of MindIE Turbo here(UPDATE_ME_AS_A_LINK).
```

| Feature | vLLM Ascend | MindIE Turbo | Notes |
|-------------------------------|----------------|-----------------|------------------------------------------------------------------------|
| V1Engine | 🔵 Experimental| 🔵 Experimental| Will enhance in v0.8.x |
| Chunked Prefill | 🟢 Functional | 🟢 Functional | / |
| Automatic Prefix Caching      | 🟢 Functional  | 🟢 Functional   | Usage limits: [#732](https://github.com/vllm-project/vllm-ascend/issues/732) |
| LoRA | 🟢 Functional | 🟢 Functional | / |
| Prompt adapter | 🟡 Planned | 🟡 Planned | / |
| Speculative decoding          | 🟢 Functional  | 🟢 Functional   | Usage limits: [#734](https://github.com/vllm-project/vllm-ascend/issues/734) |
| Pooling | 🟢 Functional | 🟢 Functional | / |
| Enc-dec | 🟡 Planned | 🟡 Planned | / |
| Multi Modality | 🟢 Functional | 🟢 Functional | / |
| LogProbs | 🟢 Functional | 🟢 Functional | / |
| Prompt logProbs | 🟢 Functional | 🟢 Functional | / |
| Async output | 🟢 Functional | 🟢 Functional | / |
| Multi step scheduler | 🟢 Functional | 🟢 Functional | / |
| Best of | 🟢 Functional | 🟢 Functional | / |
| Beam search | 🟢 Functional | 🟢 Functional | / |
| Guided Decoding | 🟢 Functional | 🟢 Functional | / |
| Tensor Parallel | 🟢 Functional | ⚡Optimized | / |
| Pipeline Parallel | 🟢 Functional | ⚡Optimized | / |
| Expert Parallel | 🟡 Planned | 🟡 Planned | Will support in v0.8.x |
| Data Parallel | 🟡 Planned | 🟡 Planned | Will support in v0.8.x |
| Prefill Decode Disaggregation | 🟢 Functional | 🟢 Functional | todo |
| Quantization | 🟡 Planned | 🟢 Functional | Will support in v0.8.x |
| Graph Mode | 🟡 Planned | 🟡 Planned | Will support in v0.8.x |
| Sleep Mode                    | 🟢 Functional  | 🟢 Functional   | Usage limits: [#733](https://github.com/vllm-project/vllm-ascend/issues/733) |
| MTP                           | 🟢 Functional  | 🟢 Functional   | Usage limits: [#734](https://github.com/vllm-project/vllm-ascend/issues/734) |
| Custom Scheduler              | 🟢 Functional  | 🟢 Functional   | Usage limits: [#788](https://github.com/vllm-project/vllm-ascend/issues/788) |
2 changes: 1 addition & 1 deletion setup.py
@@ -368,7 +368,7 @@ def _read_requirements(filename: str) -> List[str]:
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass=cmdclass,
extras_require={},
extras_require={"mindie_turbo": ["mindie-turbo==2.0rc1"]},
entry_points={
"vllm.platform_plugins": ["ascend = vllm_ascend:register"],
"vllm.general_plugins":
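
With this extra defined, MindIE Turbo can presumably also be pulled in through pip extras when installing vLLM Ascend (a sketch; assumes the published distribution exposes the extra under this name):

```bash
pip install "vllm-ascend[mindie_turbo]"
```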
6 changes: 4 additions & 2 deletions vllm_ascend/utils.py
@@ -20,7 +20,9 @@
from vllm.logger import logger


def try_register_lib(lib_name: str, lib_info: str = ""):
def try_register_lib(lib_name: str,
lib_info: str = "",
exception_info: str = ""):
import importlib
import importlib.util
try:
@@ -30,4 +32,4 @@ def try_register_lib(lib_name: str, lib_info: str = ""):
if lib_info:
logger.info(lib_info)
except Exception:
pass
logger.warning_once(exception_info)
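
For context, the updated helper roughly looks like this (a sketch reconstructed from the visible hunks; the hidden lines are assumed to match the existing implementation):

```python
from vllm.logger import logger


def try_register_lib(lib_name: str,
                     lib_info: str = "",
                     exception_info: str = ""):
    # Try to import lib_name if it is installed; log lib_info on success.
    # If the import fails despite the package being found, log exception_info once.
    import importlib
    import importlib.util
    try:
        module_spec = importlib.util.find_spec(lib_name)
        if module_spec is not None:
            importlib.import_module(lib_name)
            if lib_info:
                logger.info(lib_info)
    except Exception:
        logger.warning_once(exception_info)
```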
3 changes: 2 additions & 1 deletion vllm_ascend/worker/worker.py
@@ -77,7 +77,8 @@ def __init__(
# Try to import mindie_turbo to accelerate vLLM inference.
try_register_lib(
"mindie_turbo",
"MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
"MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo.",
"MindIE Turbo is installed but unable to `import mindie_turbo`, skip MindIE Turbo acceleration."
)
# distribute related config
self.parallel_config.rank = rank
3 changes: 2 additions & 1 deletion vllm_ascend/worker/worker_v1.py
@@ -77,7 +77,8 @@ def __init__(self,
# Try to import mindie_turbo to accelerate vLLM inference.
try_register_lib(
"mindie_turbo",
"MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo."
"MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo.",
"MindIE Turbo is installed but unable to `import mindie_turbo`, skip MindIE Turbo acceleration."
)

if self.cache_config.cache_dtype == "auto":
Expand Down