Commit 0eee271

support async_scheduling for spec-decode
Signed-off-by: Ronald1995 <[email protected]>
1 parent ba09652 commit 0eee271

File tree

9 files changed: +431 -93 lines changed

vllm/config/speculative.py

Lines changed: 16 additions & 19 deletions
@@ -3,7 +3,7 @@
 
 import ast
 import hashlib
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, Literal, get_args
 
 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -30,28 +30,23 @@
 
 logger = init_logger(__name__)
 
-SpeculativeMethod = Literal[
-    "ngram",
-    "eagle",
-    "eagle3",
-    "medusa",
-    "mlp_speculator",
-    "draft_model",
-    "deepseek_mtp",
-    "ernie_mtp",
-    "qwen3_next_mtp",
-    "mimo_mtp",
-    "longcat_flash_mtp",
-    "mtp",
-]
-MTP_MODEL_TYPES = (
+MTPModelTypes = Literal[
     "deepseek_mtp",
     "mimo_mtp",
     "glm4_moe_mtp",
     "ernie_mtp",
     "qwen3_next_mtp",
     "longcat_flash_mtp",
-)
+    "mtp",
+]
+EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]
+SpeculativeMethod = Literal[
+    "ngram",
+    "medusa",
+    "mlp_speculator",
+    "draft_model",
+    EagleModelTypes,
+]
 
 
 @config
@@ -224,7 +219,7 @@ def __post_init__(self):
         # can not be detected, it will be considered as the "draft_model" by
         # default.
 
-        if self.method in MTP_MODEL_TYPES:
+        if self.method in get_args(MTPModelTypes) and self.method != "mtp":
            logger.warning(
                "method `%s` is deprecated and replaced with mtp.", self.method
            )
@@ -338,7 +333,9 @@ def __post_init__(self):
                self.method = "medusa"
            elif self.draft_model_config.hf_config.model_type == "mlp_speculator":
                self.method = "mlp_speculator"
-            elif self.draft_model_config.hf_config.model_type in MTP_MODEL_TYPES:
+            elif self.draft_model_config.hf_config.model_type in get_args(
+                MTPModelTypes
+            ):
                self.method = "mtp"
                if self.num_speculative_tokens > 1:
                    logger.warning(
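
Note on the typing refactor above: nested Literal types are flattened (PEP 586), so get_args(EagleModelTypes) also yields every MTP variant, which is what the membership checks in __post_init__ and in arg_utils.py rely on. A minimal sketch of that behavior, using a shortened member list rather than the full set defined in speculative.py:

from typing import Literal, get_args

# Shortened stand-ins for the aliases defined above (illustration only).
MTPModelTypes = Literal["deepseek_mtp", "mimo_mtp", "mtp"]
EagleModelTypes = Literal["eagle", "eagle3", MTPModelTypes]

# Nested Literal types are flattened, so get_args() exposes every variant,
# including the MTP ones nested inside EagleModelTypes.
print(get_args(EagleModelTypes))
# ('eagle', 'eagle3', 'deepseek_mtp', 'mimo_mtp', 'mtp')

# Membership checks like `self.method in get_args(MTPModelTypes)` therefore
# work directly on plain strings.
assert "deepseek_mtp" in get_args(EagleModelTypes)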

vllm/engine/arg_utils.py

Lines changed: 19 additions & 6 deletions
@@ -69,6 +69,7 @@
 from vllm.config.observability import DetailedTraceModules
 from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
 from vllm.config.scheduler import SchedulerPolicy
+from vllm.config.speculative import EagleModelTypes
 from vllm.config.utils import get_field
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
@@ -1465,13 +1466,25 @@ def create_engine_config(
                 "Async scheduling is not supported with pipeline-parallel-size > 1."
             )
 
-        # Currently, async scheduling does not support speculative decoding.
-        # TODO(woosuk): Support it.
+        # Currently, async scheduling only supports EAGLE/MTP kinds of
+        # speculative decoding.
+        # TODO(woosuk): Support other kinds of speculative decoding.
         if self.speculative_config is not None:
-            raise ValueError(
-                "Currently, speculative decoding is not supported with "
-                "async scheduling."
-            )
+            if self.speculative_config.get("method") not in get_args(
+                EagleModelTypes
+            ):
+                raise ValueError(
+                    "Currently, async scheduling is only supported "
+                    "with EAGLE/MTP kinds of speculative decoding."
+                )
+            elif self.speculative_config.get("disable_padded_drafter_batch"):
+                raise ValueError(
+                    "Async scheduling for EAGLE/MTP speculative decoding "
+                    "is enabled, but disable_padded_drafter_batch=True is "
+                    "not supported in this case. Please set "
+                    "disable_padded_drafter_batch=False to keep padded "
+                    "drafter batches enabled."
+                )
 
         # Forward the deprecated CLI args to the EPLB config.
         if self.num_redundant_experts is not None:
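
Taken on its own, the new check boils down to two guards: the configured method must be one of the EAGLE/MTP variants, and padded drafter batches must stay enabled. A standalone sketch of that logic; the helper name and the shortened EagleModelTypes are made up for illustration, while the dict-style config access mirrors the diff:

from typing import Literal, get_args

EagleModelTypes = Literal["eagle", "eagle3", "deepseek_mtp", "mtp"]  # shortened


def check_async_scheduling_spec_config(speculative_config: dict | None) -> None:
    """Hypothetical helper mirroring the validation in create_engine_config."""
    if speculative_config is None:
        # Async scheduling without speculative decoding needs no extra checks.
        return
    if speculative_config.get("method") not in get_args(EagleModelTypes):
        raise ValueError(
            "Currently, async scheduling is only supported with "
            "EAGLE/MTP kinds of speculative decoding."
        )
    if speculative_config.get("disable_padded_drafter_batch"):
        raise ValueError(
            "disable_padded_drafter_batch=True is not supported with "
            "async scheduling; please set it to False."
        )


# Accepted: an EAGLE3 config with padded drafter batches left enabled.
check_async_scheduling_spec_config({"method": "eagle3"})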

vllm/v1/core/sched/async_scheduler.py

Lines changed: 59 additions & 7 deletions
@@ -5,6 +5,7 @@
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.request import Request, RequestStatus
+from vllm.v1.spec_decode.metrics import SpecDecodingStats
 
 logger = init_logger(__name__)
 
@@ -15,15 +16,25 @@ def _update_after_schedule(
         scheduler_output: SchedulerOutput,
     ) -> None:
         super()._update_after_schedule(scheduler_output)
+        spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens
         for req_id in scheduler_output.num_scheduled_tokens:
             request = self.requests[req_id]
+            cur_num_spec_tokens = len(spec_decode_tokens.get(req_id, []))
             if (
                 request.num_computed_tokens
-                == request.num_tokens + request.num_output_placeholders
+                == request.num_tokens
+                + request.num_output_placeholders
+                + cur_num_spec_tokens
             ):
-                # The request will generate a new token in this scheduling step.
-                # TODO(woosuk): Support speculative decoding.
-                request.num_output_placeholders += 1
+                # The request will generate a new token plus num_spec_tokens
+                # in this scheduling step.
+                request.num_output_placeholders += 1 + cur_num_spec_tokens
+                # Add placeholders for the new tokens in spec_token_ids.
+                # The actual token ids are not known yet, so -1 is used as
+                # a placeholder and the length of spec_token_ids is set to
+                # self.num_spec_tokens. The actual spec token ids are
+                # filled in by the worker process.
+                request.spec_token_ids = [-1] * self.num_spec_tokens
 
     def _update_request_with_output(
         self,
@@ -34,9 +45,13 @@ def _update_request_with_output(
         new_token_ids, stopped = super()._update_request_with_output(
             request, new_token_ids
         )
-
-        # Update the number of output placeholders.
-        request.num_output_placeholders -= len(new_token_ids)
+        # num_output_placeholders == 0 happens when a request is preempted:
+        # a preempted request is put back on the waiting queue and its
+        # num_output_placeholders is reset to 0, so there is no need to
+        # revert num_output_placeholders in that case.
+        if request.num_output_placeholders > 0:
+            # Update the number of output placeholders.
+            request.num_output_placeholders -= len(new_token_ids)
         assert request.num_output_placeholders >= 0
 
         # Cache the new tokens. Preempted requests should be skipped.
@@ -45,3 +60,40 @@ def _update_request_with_output(
             request, request.num_computed_tokens - request.num_output_placeholders
         )
         return new_token_ids, stopped
+
+    def _update_computed_tokens(
+        self,
+        request: Request,
+        num_draft_tokens: int,
+        num_accepted: int,
+        num_rejected: int,
+        spec_decoding_stats: SpecDecodingStats | None,
+    ):
+        """Update the computed tokens for each request, which is necessary
+        for spec decoding. In the sync scheduler, we only need to revert
+        num_computed_tokens by num_rejected tokens, but in the async
+        scheduler we also need to revert num_output_placeholders by
+        num_rejected tokens.
+        """
+        # num_computed_tokens == 0 happens when a request is preempted:
+        # a preempted request is put back on the waiting queue and its
+        # num_computed_tokens is reset to 0,
+        # so there is no need to revert num_computed_tokens in that case.
+        if request.num_computed_tokens > 0:
+            # When spec decoding is enabled, num_output_placeholders
+            # is increased by num_spec_tokens in _update_after_schedule.
+            # Update num_output_placeholders here to reflect the actual
+            # number of accepted output tokens.
+            request.num_output_placeholders -= num_rejected
+            # num_computed_tokens represents the number of tokens
+            # processed in the current step, considering scheduled
+            # tokens and rejections. If some tokens are rejected,
+            # num_computed_tokens is decreased by the number of rejected
+            # tokens.
+            request.num_computed_tokens -= num_rejected
+        spec_decoding_stats = self.make_spec_decoding_stats(
+            spec_decoding_stats,
+            num_draft_tokens=num_draft_tokens,
+            num_accepted_tokens=num_accepted,
+        )
+        return spec_decoding_stats
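
The bookkeeping above can be traced with a toy example: with num_spec_tokens = 2, the scheduler optimistically reserves 1 + 2 output placeholders and writes -1 placeholders into spec_token_ids at schedule time; when the verification result comes back with one draft rejected, both num_output_placeholders and num_computed_tokens are rolled back by that one rejected token. A self-contained sketch with plain ints (no vLLM types; all numbers are made up):

class ToyRequest:
    """Minimal stand-in for vllm.v1.request.Request, for illustration only."""

    def __init__(self) -> None:
        self.num_output_placeholders = 0
        self.num_computed_tokens = 103  # pretend this schedule advanced it already
        self.spec_token_ids: list[int] = []


NUM_SPEC_TOKENS = 2
req = ToyRequest()

# _update_after_schedule: reserve placeholders for 1 new token + the drafts,
# and pad spec_token_ids with -1 because the real draft ids are only
# produced later, in the worker process.
cur_num_spec_tokens = 2
req.num_output_placeholders += 1 + cur_num_spec_tokens
req.spec_token_ids = [-1] * NUM_SPEC_TOKENS

# _update_computed_tokens: one of the two drafts was rejected, so both
# counters are rolled back by num_rejected.
num_draft_tokens, num_accepted = 2, 1
num_rejected = num_draft_tokens - num_accepted
req.num_output_placeholders -= num_rejected
req.num_computed_tokens -= num_rejected

assert req.num_output_placeholders == 2  # the bonus token + 1 accepted draft
assert req.num_computed_tokens == 102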

vllm/v1/core/sched/output.py

Lines changed: 5 additions & 0 deletions
@@ -174,3 +174,8 @@ class SchedulerOutput:
 
     # KV Cache Connector metadata.
     kv_connector_metadata: KVConnectorMetadata | None = None
+
+    # Total number of speculative scheduled tokens for all requests.
+    # This is needed when async_scheduling and speculative decoding are
+    # used together.
+    total_num_scheduled_spec_tokens: int = 0

vllm/v1/core/sched/scheduler.py

Lines changed: 37 additions & 11 deletions
@@ -198,7 +198,7 @@ def schedule(self) -> SchedulerOutput:
         encoder_compute_budget = self.max_num_encoder_input_tokens
         # Spec decode-related.
         scheduled_spec_decode_tokens: dict[str, list[int]] = {}
-
+        total_num_spec_tokens = 0
         # For logging.
         scheduled_timestamp = time.monotonic()
 
@@ -285,7 +285,11 @@ def schedule(self) -> SchedulerOutput:
                 self.encoder_cache_manager.free(preempted_req)
                 preempted_req.status = RequestStatus.PREEMPTED
                 preempted_req.num_computed_tokens = 0
+                preempted_req.num_output_placeholders = 0
                 preempted_req.num_preemptions += 1
+                # Neither sync nor async scheduling uses spec_token_ids while
+                # a request sits in the waiting queue, so just clear it here.
+                preempted_req.spec_token_ids.clear()
                 if self.log_stats:
                     preempted_req.record_event(
                         EngineCoreEventType.PREEMPTED, scheduled_timestamp
@@ -311,9 +315,13 @@ def schedule(self) -> SchedulerOutput:
             # Speculative decode related.
             if request.spec_token_ids:
                 num_scheduled_spec_tokens = (
-                    num_new_tokens + request.num_computed_tokens - request.num_tokens
+                    num_new_tokens
+                    + request.num_computed_tokens
+                    - request.num_tokens
+                    - request.num_output_placeholders
                 )
                 if num_scheduled_spec_tokens > 0:
+                    total_num_spec_tokens += num_scheduled_spec_tokens
                     # Trim spec_token_ids list to num_scheduled_spec_tokens.
                     del request.spec_token_ids[num_scheduled_spec_tokens:]
                     scheduled_spec_decode_tokens[request.request_id] = (
@@ -631,6 +639,7 @@ def schedule(self) -> SchedulerOutput:
             free_encoder_mm_hashes=self.encoder_cache_manager.get_freed_mm_hashes(),
             structured_output_request_ids=structured_output_request_ids,
             grammar_bitmask=grammar_bitmask,
+            total_num_scheduled_spec_tokens=total_num_spec_tokens,
         )
 
         # NOTE(Kuntai): this function is designed for multiple purposes:
@@ -962,16 +971,12 @@ def update_from_output(
                 num_draft_tokens = len(scheduled_spec_token_ids)
                 num_accepted = len(generated_token_ids) - 1
                 num_rejected = num_draft_tokens - num_accepted
-                # num_computed_tokens represents the number of tokens
-                # processed in the current step, considering scheduled
-                # tokens and rejections. If some tokens are rejected,
-                # num_computed_tokens is decreased by the number of rejected
-                # tokens.
-                request.num_computed_tokens -= num_rejected
-                spec_decoding_stats = self.make_spec_decoding_stats(
+                spec_decoding_stats = self._update_computed_tokens(
+                    request,
+                    num_draft_tokens,
+                    num_accepted,
+                    num_rejected,
                     spec_decoding_stats,
-                    num_draft_tokens=num_draft_tokens,
-                    num_accepted_tokens=num_accepted,
                 )
 
             stopped = False
@@ -1085,6 +1090,27 @@ def update_from_output(
 
         return engine_core_outputs
 
+    def _update_computed_tokens(
+        self,
+        request: Request,
+        num_draft_tokens: int,
+        num_accepted: int,
+        num_rejected: int,
+        spec_decoding_stats: SpecDecodingStats | None,
+    ):
+        # num_computed_tokens represents the number of tokens
+        # processed in the current step, considering scheduled
+        # tokens and rejections. If some tokens are rejected,
+        # num_computed_tokens is decreased by the number of rejected
+        # tokens.
+        request.num_computed_tokens -= num_rejected
+        spec_decoding_stats = self.make_spec_decoding_stats(
+            spec_decoding_stats,
+            num_draft_tokens=num_draft_tokens,
+            num_accepted_tokens=num_accepted,
+        )
+        return spec_decoding_stats
+
     def _update_request_with_output(
         self,
         request: Request,
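
The extra `- request.num_output_placeholders` term exists because, under async scheduling, num_computed_tokens has already been advanced past the placeholder outputs reserved by a previous step, so the old expression would over-count by exactly that amount. A quick algebraic check with made-up numbers, assuming num_new_tokens for a running request is sized as num_tokens_with_spec + num_output_placeholders - num_computed_tokens:

# Made-up steady-state numbers for one running request with 2 draft tokens.
num_tokens = 100             # tokens whose ids are already known
spec_token_ids = [-1, -1]    # placeholder drafts written by the async scheduler
num_output_placeholders = 3  # 1 pending output + 2 pending drafts
num_computed_tokens = 103    # already advanced past those placeholders

num_tokens_with_spec = num_tokens + len(spec_token_ids)
num_new_tokens = num_tokens_with_spec + num_output_placeholders - num_computed_tokens

num_scheduled_spec_tokens = (
    num_new_tokens
    + num_computed_tokens
    - num_tokens
    - num_output_placeholders
)
# The placeholder terms cancel, leaving exactly the number of draft tokens.
assert num_scheduled_spec_tokens == len(spec_token_ids) == 2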

vllm/v1/engine/core.py

Lines changed: 5 additions & 1 deletion
@@ -198,6 +198,7 @@ def __init__(
         self.step_fn = (
             self.step if self.batch_queue is None else self.step_with_batch_queue
         )
+        self.async_scheduling = vllm_config.scheduler_config.async_scheduling
 
     def _initialize_kv_caches(
         self, vllm_config: VllmConfig
@@ -330,7 +331,10 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0
 
     def post_step(self, model_executed: bool) -> None:
-        if self.use_spec_decode and model_executed:
+        # When async scheduling is used, the draft token ids can't be taken
+        # in advance here; they are updated in the worker process instead,
+        # so there is no need to update them in this method.
+        if self.use_spec_decode and model_executed and not self.async_scheduling:
             # Take the draft token ids.
             draft_token_ids = self.model_executor.take_draft_token_ids()
             if draft_token_ids is not None:

vllm/v1/spec_decode/eagle.py

Lines changed: 20 additions & 6 deletions
@@ -185,6 +185,7 @@ def __init__(
             device=device,
             dtype=torch.int32,
         ).repeat(max_batch_size, 1)
+        self.use_async_scheduling = self.vllm_config.scheduler_config.async_scheduling
 
     def _get_positions(self, num_tokens: int):
         if self.uses_mrope:
@@ -387,14 +388,27 @@ def propose(
             positions += 1
             exceeds_max_model_len = positions >= self.max_model_len
             clamped_positions = torch.where(exceeds_max_model_len, 0, positions)
+            # When use_async_scheduling is enabled, avoid in-place operations
+            # here, since these tensors may be modified by the main model's
+            # next-step `prepare_input` before this step finishes.
+            if self.use_async_scheduling:
+                # Increment the sequence lengths.
+                common_attn_metadata.seq_lens = common_attn_metadata.seq_lens + 1
+                common_attn_metadata.seq_lens_cpu = (
+                    common_attn_metadata.seq_lens_cpu + 1
+                )
+                # For the requests that exceed the max model length, we set the
+                # sequence length to 1 to minimize their overheads in attention.
 
-            # Increment the sequence lengths.
-            common_attn_metadata.seq_lens += 1
-            common_attn_metadata.seq_lens_cpu += 1
-            # For the requests that exceed the max model length, we set the
-            # sequence length to 1 to minimize their overheads in attention.
+                common_attn_metadata.seq_lens = common_attn_metadata.seq_lens.masked_fill(exceeds_max_model_len, 1)
+            else:
+                # Increment the sequence lengths.
+                common_attn_metadata.seq_lens += 1
+                common_attn_metadata.seq_lens_cpu += 1
+                # For the requests that exceed the max model length, we set the
+                # sequence length to 1 to minimize their overheads in attention.
 
-            common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
+                common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
 
             common_attn_metadata.num_computed_tokens_cpu = (
                 common_attn_metadata.seq_lens_cpu - 1
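
The reason the async path switches to out-of-place ops is aliasing: the seq_lens tensors may still be read (or rewritten) by the main model's prepare_input for the next step, so mutating them in place from the drafter would corrupt that state. A minimal PyTorch sketch of the difference, independent of any vLLM types:

import torch

# Pretend `shared` is a metadata buffer the main model will read next step.
shared = torch.tensor([5, 7, 9])
seq_lens = shared

# In-place update: the shared buffer is mutated as well.
seq_lens += 1
assert torch.equal(shared, torch.tensor([6, 8, 10]))

# Out-of-place update: a new tensor is created; the shared buffer is untouched.
shared = torch.tensor([5, 7, 9])
seq_lens = shared
seq_lens = seq_lens + 1
assert torch.equal(shared, torch.tensor([5, 7, 9]))
assert torch.equal(seq_lens, torch.tensor([6, 8, 10]))

# The same distinction holds for masked_fill_ (in-place) vs masked_fill,
# which returns a new tensor and must be assigned back to take effect.
exceeds_max_model_len = torch.tensor([False, True, False])
seq_lens = seq_lens.masked_fill(exceeds_max_model_len, 1)
assert torch.equal(seq_lens, torch.tensor([6, 1, 10]))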

vllm/v1/worker/gpu_input_batch.py

Lines changed: 5 additions & 0 deletions
@@ -45,6 +45,11 @@ class CachedRequestState:
 
     lora_request: LoRARequest | None = None
     prompt_embeds: torch.Tensor | None = None
+    # These are used when both async_scheduling and spec_decode are enabled.
+    prev_num_draft_len: int = 0
+    prev_sampled_tokens: torch.Tensor | None = None
+    prev_draft_tokens: torch.Tensor | None = None
+    resumed_from_preemption: bool = False
 
     def __post_init__(self):
         self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
