Commit 0c38f41

[Feat] Implement primal full graph with limited scenario (vllm-project#1503)
This pull request introduces full-graph capture, replacing the previous piecewise-graph approach. Key improvements include:

* **Reduced dispatch latency:** By capturing the entire model execution graph at once, we minimize overhead compared to multiple smaller captures.
* **Stabilized multi-GPU performance:** Eliminates throughput fluctuations during the `MODEL_EXECUTE` phase across multiple cards.
* **Stream resource savings:** Consolidating graph captures frees up streams, allowing more graphs to be captured concurrently.

**Known issues:**

1. Capturing larger or more numerous graphs increases GPU memory usage, which can lead to OOM errors or inference hangs.
2. The new paged-attention implementation relies on the FIA operator, which in certain workloads is slower than the previous approach, resulting in a regression in end-to-end throughput.

There may be other undiscovered corner cases. This PR is the first in a planned series; we will continue to iterate on and address any remaining issues in subsequent submissions.

```python
compilation_config={
    "full_cuda_graph": True,
},
```

---------

Signed-off-by: Yizhou Liu <[email protected]>
1 parent 9260910 commit 0c38f41
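For reference, enabling the new path from user code only requires passing the compilation config above to `LLM`. A minimal sketch, assuming a small placeholder model and reusing the capture sizes from the e2e test below (both are illustrative choices, not recommendations from this PR):

```python
from vllm import LLM, SamplingParams

# Minimal sketch: opt in to full-graph capture on Ascend NPU.
# "Qwen/Qwen2.5-0.5B-Instruct" is a placeholder model; the capture sizes
# mirror tests/e2e/singlecard/test_aclgraph.py.
llm = LLM(
    "Qwen/Qwen2.5-0.5B-Instruct",
    max_model_len=1024,
    compilation_config={
        "full_cuda_graph": True,
        "cudagraph_capture_sizes": [1, 4, 16, 64, 256],
    },
)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32, temperature=0.0))
print(outputs[0].outputs[0].text)
```

Given the known issues above, memory headroom matters more with full-graph capture enabled, so trimming the capture-size list is a reasonable first mitigation if OOM occurs.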

File tree: 8 files changed (+315, -31 lines)


tests/e2e/singlecard/test_aclgraph.py

Lines changed: 12 additions & 1 deletion
```diff
@@ -36,9 +36,12 @@
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("full_graph", [False])
 def test_models_with_aclgraph(
     model: str,
     max_tokens: int,
+    full_graph: bool,
+    monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     prompts = [
         "Hello, my name is", "The president of the United States is",
@@ -48,7 +51,15 @@ def test_models_with_aclgraph(
     sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
     # TODO: change to use vllmrunner when the registry of custom op is solved
     # while running pytest
-    vllm_model = LLM(model, max_model_len=1024)
+    if full_graph:
+        vllm_model = LLM(model,
+                         compilation_config={
+                             "full_cuda_graph": True,
+                             "cudagraph_capture_sizes":
+                             [1, 4, 16, 64, 256]
+                         })
+    else:
+        vllm_model = LLM(model, max_model_len=1024)
     vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
     del vllm_model
     torch.npu.empty_cache()
```

vllm_ascend/attention/attention_v1.py

Lines changed: 137 additions & 23 deletions
```diff
@@ -24,12 +24,15 @@
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionLayer, AttentionType)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.config import get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.utils import direct_register_custom_op
 from vllm.v1.core.sched.output import SchedulerOutput
 
+from vllm_ascend.attention.utils import \
+    AscendCommonAttentionMetadata as CommonAttentionMetadata
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
+from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, get_graph_params, is_310p,
                                nd_to_nz_2d, nd_to_nz_spec)
 from vllm_ascend.worker.npu_input_batch import InputBatch
 
@@ -132,7 +135,7 @@ class AscendMetadata:
     # tokens + new tokens (is None if it is a decoding).
     # (batch_size,)
     seq_lens: torch.Tensor = None
-
+    seq_lens_list: list
     query_start_loc: torch.Tensor = None
     query_lens: torch.Tensor = None
     # Maximum query length in the batch (None for decoding).
@@ -167,6 +170,7 @@ def build(self,
               num_reqs,
               num_actual_tokens,
               max_query_len,
+              common_attn_metadata: CommonAttentionMetadata,
               enable_dbo_across_dp: bool = False,
               is_only_prefill: bool = False):
 
@@ -175,15 +179,16 @@ def build(self,
         block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
             block_table[:num_reqs])
 
-        query_lens = self.runner.query_lens
-        seq_lens = self.runner.seq_lens_cpu[:num_reqs]
-        slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
-            self.runner.device, non_blocking=True)
+        query_start_loc = common_attn_metadata.query_start_loc
+        seq_lens = common_attn_metadata.seq_lens
+        # TODO: Refactor these two param to common metadata in runners,
+        # preparing for the hybrid KV groups feature
+        query_lens = common_attn_metadata.query_lens if common_attn_metadata.query_lens is not None else self.runner.query_lens
+        seq_lens_list = common_attn_metadata.seq_lens_list if common_attn_metadata.seq_lens_list is not None else self.runner.seq_lens_list
+
+        slot_mapping = self.runner.slot_mapping[:num_actual_tokens]
         attn_mask = self.runner.attn_mask
         attn_state = self.runner.attn_state
-        query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
-        query_start_loc = query_start_loc_cpu.to(self.runner.device,
-                                                 non_blocking=True)
 
         if is_310p():
             if attn_state == AscendAttentionState.PrefillNoCache:
@@ -201,6 +206,7 @@ def build(self,
             query_start_loc=query_start_loc,
             query_lens=query_lens,
             seq_lens=seq_lens,
+            seq_lens_list=seq_lens_list,
             max_query_len=max_query_len,
             slot_mapping=slot_mapping,
             attn_mask=attn_mask,
@@ -209,6 +215,34 @@ def build(self,
             is_only_prefill=is_only_prefill)
         return attn_metadata
 
+    def build_dummy_metadata(self, num_actual_tokens, num_reqs,
+                             num_scheduled_tokens, attn_state):
+        if attn_state == AscendAttentionState.DecodeOnly:
+            # NOTE: We only need to pay attention to seq_lens_list and block_table here
+            common_attn_metadata = CommonAttentionMetadata(seq_lens_list=[2] *
+                                                           num_reqs)
+
+            block_table = self.runner.input_batch.block_table[0].block_table
+            block_table[:num_reqs, 0] = torch.arange(1,
+                                                     num_reqs + 1,
+                                                     device=block_table.device,
+                                                     dtype=block_table.dtype)
+
+            attn_metadata = self.build(
+                num_reqs=num_reqs,
+                num_actual_tokens=num_actual_tokens,
+                max_query_len=num_scheduled_tokens.max(),
+                common_prefix_len=0,
+                common_attn_metadata=common_attn_metadata,
+            )
+        else:
+            raise NotImplementedError(
+                "Currently we only support building dummy metadata for DecodeOnly state"
+            )
+
+        attn_metadata.attn_state = attn_state
+        return attn_metadata
+
 
 class AscendAttentionBackendImpl(AttentionImpl):
 
@@ -245,6 +279,10 @@ def __init__(
         self.key_cache = None
         self.value_cache = None
 
+        vllm_config = get_current_vllm_config()
+        self.full_graph = vllm_config.compilation_config.full_cuda_graph
+        self.block_size = vllm_config.cache_config.block_size
+
     def forward(
         self,
         layer: AttentionLayer,
@@ -369,20 +407,96 @@ def forward(
                 scale_value=self.scale,
                 out=output)
         elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
-            if is_310p():
-                # # seq_lens_tensor needs to be transferred to the device for 310P
-                attn_metadata.seq_lens = \
-                    attn_metadata.seq_lens.to(device=query.device)
-            torch_npu._npu_paged_attention(
-                query=query,
-                key_cache=self.key_cache,
-                value_cache=self.value_cache,
-                num_kv_heads=self.num_kv_heads,
-                num_heads=self.num_heads,
-                scale_value=self.scale,
-                block_table=attn_metadata.block_tables,
-                context_lens=attn_metadata.seq_lens,
-                out=output)
+            if self.full_graph:
+                graph_params = get_graph_params()
+                q = query.view(num_tokens, -1, self.hidden_size)
+                k = self.key_cache.view(  # type: ignore
+                    -1, self.block_size,
+                    self.num_kv_heads * self.head_size)
+                v = self.value_cache.view(  # type: ignore
+                    -1, self.block_size,
+                    self.num_kv_heads * self.head_size)
+                actual_seq_lens = attn_metadata.seq_lens_list
+                attn_args = {
+                    "query": q,
+                    "key": k,
+                    "value": v,
+                    "actual_seq_lengths_kv": actual_seq_lens,
+                    "block_table": attn_metadata.block_tables,
+                    "num_heads": self.num_heads,
+                    "scale": self.scale,
+                    "input_layout": "BSH",
+                    "num_key_value_heads": self.num_kv_heads,
+                    "block_size": self.block_size,
+                }
+
+                # Prepare tensors for attention output
+                # TODO: Refactor this to step-level instead of layer-level
+                attn_output = torch.empty(num_tokens,
+                                          1,
+                                          self.hidden_size,
+                                          dtype=output.dtype,
+                                          device=output.device)
+                softmax_lse = torch.empty(num_tokens,
+                                          dtype=output.dtype,
+                                          device=output.device)
+
+                # Get workspace from cache or calculate it if not present.
+                workspace = graph_params.workspaces.get(num_tokens)
+                if workspace is None:
+                    workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace(
+                        **attn_args)
+                    graph_params.workspaces[num_tokens] = workspace
+
+                forward_context = get_forward_context()
+                if not forward_context.capturing:
+                    # Execute attention kernel directly in non-capturing mode
+                    torch.ops.npu.npu_fused_infer_attention_score.out(
+                        workspace=workspace,
+                        out=[attn_output, softmax_lse],
+                        **attn_args)
+                else:
+                    # Handle graph capturing mode
+                    stream = torch_npu.npu.current_stream()
+
+                    event = torch.npu.ExternalEvent()
+                    event.wait(stream)
+                    event.reset(stream)
+                    graph_params.events[num_tokens].append(event)
+
+                    graph_params.attn_params[num_tokens].append(
+                        (q, k, v, actual_seq_lens,
+                         attn_metadata.block_tables, self.num_heads,
+                         self.scale, self.num_kv_heads, attn_output,
+                         softmax_lse))
+
+                    torch.npu.graph_task_group_begin(stream)
+                    torch.ops.npu.npu_fused_infer_attention_score.out(
+                        workspace=workspace,
+                        out=[attn_output, softmax_lse],
+                        **attn_args)
+                    handle = torch.npu.graph_task_group_end(stream)
+                    graph_params.handles[num_tokens].append(handle)
+
+                # Reshape output to match the expected format
+                output.copy_(
+                    attn_output.view(num_tokens, self.num_heads,
+                                     self.head_size))
+            else:
+                if is_310p():
+                    # seq_lens_tensor needs to be transferred to the device for 310P
+                    attn_metadata.seq_lens = \
+                        attn_metadata.seq_lens.to(device=query.device)
+                torch_npu._npu_paged_attention(
+                    query=query,
+                    key_cache=self.key_cache,
+                    value_cache=self.value_cache,
+                    num_kv_heads=self.num_kv_heads,
+                    num_heads=self.num_heads,
+                    scale_value=self.scale,
+                    block_table=attn_metadata.block_tables,
+                    context_lens=attn_metadata.seq_lens,
+                    out=output)
         # Normal V1 situation.
         else:
             # use chunked prefill for head size 192 scenario, like deepseek
```
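All of the full-graph bookkeeping above is keyed by `num_tokens`: the FIA workspace is cached per shape, and during capture each attention layer appends its task-group handle, an `ExternalEvent`, and the tensors/arguments of its FIA call so they can be patched at replay time. The container returned by `get_graph_params()` comes from `vllm_ascend.utils`, whose diff is not shown in this excerpt; a minimal sketch of how such a container could be organized, assuming plain dictionaries keyed by captured batch size, is:

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple


@dataclass
class GraphParamsSketch:
    """Illustrative stand-in for the container behind get_graph_params().

    One slot per captured batch size (num_tokens). Each attention layer
    appends its handle, event, and FIA arguments in capture order, so a
    single layer index lines up across the three lists when
    update_attn_params() walks them after replay.
    """
    workspaces: Dict[int, Any] = field(default_factory=dict)
    handles: Dict[int, List[Any]] = field(default_factory=dict)
    events: Dict[int, List[Any]] = field(default_factory=dict)
    attn_params: Dict[int, List[Tuple]] = field(default_factory=dict)


def init_graph_params_sketch(capture_sizes: List[int]) -> GraphParamsSketch:
    # Mirrors what set_graph_params(aclgraph_capture_sizes) presumably does:
    # pre-create empty per-shape slots for every size that will be captured.
    params = GraphParamsSketch()
    for size in capture_sizes:
        params.handles[size] = []
        params.events[size] = []
        params.attn_params[size] = []
    return params
```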

vllm_ascend/attention/mla_v1.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -17,6 +17,8 @@
 from vllm_ascend import envs
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm_ascend.attention.utils import \
+    AscendCommonAttentionMetadata as CommonAttentionMetadata
 from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
 from vllm_ascend.multistream.context import get_multistream_comm_context
 from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
```

vllm_ascend/attention/utils.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+
+@dataclass
+class AscendCommonAttentionMetadata:
+    """
+    Attention metadata attributes that can be shared by layers in different KV
+    cache groups and thus having different block table.
+    """
+
+    query_start_loc: torch.Tensor = None
+    """(batch_size + 1,), the start location of each request in query Tensor"""
+    seq_lens: Optional[torch.Tensor] = None
+    """(batch_size,), the length of each request including both computed tokens
+    and newly scheduled tokens"""
+    query_lens: Optional[torch.Tensor] = None
+    """(batch_size,), the length of each request including only the newly
+    scheduled tokens"""
+    seq_lens_list: Optional[list] = None
+    """(num_input_tokens,), note that this is specifically for FIA kernel"""
```

vllm_ascend/compilation/piecewise_backend.py

Lines changed: 52 additions & 0 deletions
```diff
@@ -28,9 +28,13 @@
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.monitor import end_monitoring_torch_compile
 from vllm.config import VllmConfig
+from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 from vllm.utils import weak_ref_tensors
 
+from vllm_ascend.attention.attention_v1 import AscendAttentionState
+from vllm_ascend.utils import get_graph_params, set_graph_params
+
 
 @dataclasses.dataclass
 class ConcreteSizeEntry:
@@ -95,6 +99,10 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
 
         self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
 
+        if self.compilation_config.full_cuda_graph:
+            self.update_stream = torch.npu.Stream()
+            set_graph_params(self.aclgraph_capture_sizes)
+
         # the entries for different shapes that we need to either
         # compile or capture aclgraph
         self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
@@ -116,7 +124,40 @@ def check_for_ending_compilation(self):
             self.vllm_backend.compiler_manager.save_to_file()
         end_monitoring_torch_compile(self.vllm_config)
 
+    def update_attn_params(self, graph_params, forward_context, runtime_shape):
+        for layer_idx in range(len(graph_params.handles[runtime_shape])):
+            query, key, value, actual_seq_lens, block_table, num_heads, scale, num_kv_heads, output, softmax_lse = graph_params.attn_params[
+                runtime_shape][layer_idx]
+            block_table = forward_context.attn_metadata.block_tables
+            actual_seq_lens = forward_context.attn_metadata.seq_lens_list
+
+            with torch.npu.stream(self.update_stream):
+                torch.npu.graph_task_update_begin(
+                    self.update_stream,
+                    graph_params.handles[runtime_shape][layer_idx])
+                torch.ops.npu.npu_fused_infer_attention_score.out(
+                    query,
+                    key,
+                    value,
+                    workspace=graph_params.workspaces[runtime_shape],
+                    actual_seq_lengths_kv=actual_seq_lens,
+                    block_table=block_table,
+                    num_heads=num_heads,
+                    scale=scale,
+                    input_layout="BSH",
+                    num_key_value_heads=num_kv_heads,
+                    block_size=128,
+                    out=[output, softmax_lse],
+                )
+                torch.npu.graph_task_update_end(self.update_stream)
+
+            graph_params.events[runtime_shape][layer_idx].record(
+                self.update_stream)
+
     def __call__(self, *args) -> Any:
+        forward_context = get_forward_context()
+        graph_params = get_graph_params()
+
         if not self.first_run_finished:
             self.first_run_finished = True
             self.check_for_ending_compilation()
@@ -127,6 +168,11 @@ def __call__(self, *args) -> Any:
             # we don't need to do anything for this shape
             return self.compiled_graph_for_general_shape(*args)
 
+        if (getattr(forward_context.attn_metadata, "attn_state",
+                    None) != AscendAttentionState.DecodeOnly
+                and self.compilation_config.full_cuda_graph):
+            return self.compiled_graph_for_general_shape(*args)
+
         entry = self.concrete_size_entries[runtime_shape]
 
         if entry.runnable is None:
@@ -189,6 +235,7 @@ def __call__(self, *args) -> Any:
                 patch("torch.npu.empty_cache", lambda: None))
 
             # mind-exploding: carefully manage the reference and memory.
+            forward_context.capturing = True
            with torch.npu.graph(aclgraph, pool=self.graph_pool):
                # `output` is managed by pytorch's aclgraph pool
                output = entry.runnable(*args)
@@ -222,4 +269,9 @@ def __call__(self, *args) -> Any:
             )
 
         entry.aclgraph.replay()
+
+        if self.compilation_config.full_cuda_graph:
+            self.update_attn_params(graph_params, forward_context,
+                                    runtime_shape)
+
         return entry.output
```
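Taken together, the `__call__` changes mean a captured full graph is only replayed for decode-only batches at a pre-captured size; any other batch falls back to the general compiled graph, and after each replay `update_attn_params` re-issues every recorded FIA call on a side stream with the current step's block tables and sequence lengths, then records the per-layer events the captured graph waits on. A small, framework-free distillation of the dispatch rule (names are illustrative, not part of the PR):

```python
from typing import Optional, Sequence


def use_captured_full_graph(attn_state: Optional[str],
                            full_cuda_graph: bool,
                            runtime_shape: int,
                            capture_sizes: Sequence[int]) -> bool:
    """Illustrative distillation of the dispatch logic in __call__ above.

    A captured full graph is replayed only when full-graph mode is on, the
    batch is a decode-only step (the single supported attention state in
    this PR), and the runtime shape was one of the captured sizes.
    """
    if not full_cuda_graph:
        return False
    if attn_state != "DecodeOnly":
        return False
    return runtime_shape in capture_sizes


# With capture sizes [1, 4, 16, 64, 256], a prefill batch still takes the
# general graph, while a decode batch of 64 requests replays the capture.
assert not use_captured_full_graph("PrefillNoCache", True, 64, [1, 4, 16, 64, 256])
assert use_captured_full_graph("DecodeOnly", True, 64, [1, 4, 16, 64, 256])
```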

vllm_ascend/platform.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -163,8 +163,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
                 "using only ACL Graph mode")
             compilation_config.use_inductor = False
-            compilation_config.splitting_ops.extend(
-                ["vllm.unified_ascend_attention_with_output"])
+            if not compilation_config.full_cuda_graph:
+                compilation_config.splitting_ops.extend(
+                    ["vllm.unified_ascend_attention_with_output"])
             update_aclgraph_sizes(vllm_config)
 
         if parallel_config and parallel_config.worker_cls == "auto":
```
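The net effect is that attention is only registered as a splitting op in piecewise mode; with `full_cuda_graph` enabled, `vllm.unified_ascend_attention_with_output` stays inside the captured graph. A rough sketch of the resulting behavior, using a bare stand-in for vLLM's `CompilationConfig` so the field handling stays simple:

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class CompilationConfigSketch:
    """Simplified stand-in for vllm.config.CompilationConfig (assumption)."""
    full_cuda_graph: bool = False
    use_inductor: bool = True
    splitting_ops: List[str] = field(default_factory=list)


def check_and_update_config_sketch(cfg: CompilationConfigSketch) -> None:
    # Mirrors the platform.py hunk above: inductor is always disabled on NPU,
    # and attention is split out of the graph only in piecewise mode.
    cfg.use_inductor = False
    if not cfg.full_cuda_graph:
        cfg.splitting_ops.extend(["vllm.unified_ascend_attention_with_output"])


piecewise = CompilationConfigSketch(full_cuda_graph=False)
full = CompilationConfigSketch(full_cuda_graph=True)
check_and_update_config_sketch(piecewise)
check_and_update_config_sketch(full)
assert "vllm.unified_ascend_attention_with_output" in piecewise.splitting_ops
assert full.splitting_ops == []
```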
