4 changes: 4 additions & 0 deletions docs/source/user_guide/configuration/additional_config.md
@@ -53,6 +53,8 @@ The details of each config option are as follows:
| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
| `max_long_partial_prefills` | Union[int, float] | `float('inf')` | the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
| `long_prefill_token_threshold` | Union[int, float] | `False` | a request is considered long if the prompt is longer than this number of tokens. |
Reviewer comment (Contributor, severity: high):
The default value for `long_prefill_token_threshold` is documented as `False`, which is inconsistent with its type `Union[int, float]` and the implementation. In the code, its default is `None`, and it's then either set to `float('inf')` or calculated based on `max_model_len` if `max_long_partial_prefills` is set. Using `False` here is misleading for users. Please update the default value to `None` to align with the implementation.

Suggested change
| `long_prefill_token_threshold` | Union[int, float] | `False` | a request is considered long if the prompt is longer than this number of tokens. |
| `long_prefill_token_threshold` | Union[int, float] | `None` | a request is considered long if the prompt is longer than this number of tokens. |
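
As a quick illustration of the fallback this comment describes (values here are hypothetical, not from the PR): when `max_long_partial_prefills` is set but `long_prefill_token_threshold` is not, the threshold is derived as 4% of `max_model_len`, per the `__post_init__` logic in the diff below.

```python
# Hypothetical illustration of the documented fallback.
max_model_len = 32768                                      # placeholder value
long_prefill_token_threshold = int(max_model_len * 0.04)   # 1310 tokens
```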


ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.

@@ -73,6 +75,8 @@ An example of additional configuration is as follows:
"ascend_scheduler_config": {
"enabled": True,
"enable_chunked_prefill": True,
"max_long_partial_prefills": 1,
"long_prefill_token_threshold": 4096,
},
"refresh": False,
}
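
For context, a minimal sketch of how the example above could be supplied at engine construction time. It assumes the `additional_config` keyword is forwarded to the engine arguments as described in the vllm-ascend documentation; the model name is a placeholder.

```python
# Sketch only: passing the additional configuration to an offline LLM instance.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            "enable_chunked_prefill": True,
            "max_long_partial_prefills": 1,
            "long_prefill_token_threshold": 4096,
        },
        "refresh": False,
    },
)
```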
17 changes: 16 additions & 1 deletion vllm_ascend/core/schedule_config.py
@@ -16,14 +16,16 @@
#

from dataclasses import dataclass, fields
from typing import Type, Union
from typing import Optional, Type, Union

from vllm.config import SchedulerConfig


@dataclass
class AscendSchedulerConfig(SchedulerConfig):
enable_chunked_prefill: bool = False
max_long_partial_prefills: Optional[Union[int, float]] = None
long_prefill_token_threshold: Optional[Union[int, float]] = None
policy: str = "fcfs"
num_scheduler_steps: int = 1
scheduler_cls: Union[str, Type[object]] = (
@@ -41,6 +43,8 @@ def initialize_from_config(
}
# Override default values into original SchedulerConfig
scheduler_config["enable_chunked_prefill"] = False
scheduler_config["max_long_partial_prefills"] = None
scheduler_config["long_prefill_token_threshold"] = None
scheduler_config["policy"] = "fcfs"
scheduler_config["num_scheduler_steps"] = 1
scheduler_config["scheduler_cls"] = (
@@ -55,6 +59,17 @@ def __post_init__(self) -> None:
self.max_num_encoder_input_tokens = self.max_num_batched_tokens
self.encoder_cache_size = self.max_num_batched_tokens
self.chunked_prefill_enabled = self.enable_chunked_prefill
# concurrent partial prefills. Default is inf
if self.max_long_partial_prefills is None:
self.max_long_partial_prefills = float('inf')
self.long_prefill_token_threshold = float('inf')
else:
if self.long_prefill_token_threshold is None:
self.long_prefill_token_threshold = \
int(self.max_model_len * 0.04)
Reviewer comment on lines +67 to +69 (Contributor, severity: critical):
The default value calculation for `long_prefill_token_threshold` can result in 0 if `self.max_model_len` is small (less than 25). This will cause the assertion `self.long_prefill_token_threshold > 0` on line 72 to fail, leading to a crash on startup. To prevent this, ensure the calculated default value is at least 1.

            if self.long_prefill_token_threshold is None:
                self.long_prefill_token_threshold = max(1, int(self.max_model_len * 0.04))
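
A short worked example of the edge case raised here, using a hypothetical `max_model_len` of 20:

```python
# Hypothetical edge case: a very small max_model_len.
max_model_len = 20
threshold = int(max_model_len * 0.04)               # 0 -> would trip the assert below
safe_threshold = max(1, int(max_model_len * 0.04))  # 1 -> satisfies the assert
```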


assert (self.max_long_partial_prefills > 0)
assert (self.long_prefill_token_threshold > 0)
if self.policy != "fcfs":
raise NotImplementedError(
f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
10 changes: 10 additions & 0 deletions vllm_ascend/core/scheduler.py
@@ -75,6 +75,10 @@ def schedule(self) -> SchedulerOutput:
# and put back at the head of the waiting queue later
skipped_waiting_requests: deque[Request] = deque()

# Skip long prompt requests in prefill stage.
# long_prefill_budget is float('inf') if not used.
long_prefill_budget = self.vllm_config.scheduler_config.max_long_partial_prefills

# Schedule prefill requests first.
while self.waiting and token_budget > 0:
if len(self.running) == self.max_num_running_reqs:
@@ -173,6 +177,11 @@ def skip_cur_request():
skip_cur_request()
continue

if num_new_tokens > self.vllm_config.scheduler_config.long_prefill_token_threshold \
and long_prefill_budget <= 0:
skip_cur_request()
continue

new_blocks = self.kv_cache_manager.allocate_slots(
request,
num_new_tokens + num_external_computed_tokens,
@@ -222,6 +231,7 @@ def skip_cur_request():
# Update request info.
num_scheduled_tokens[request.request_id] = num_new_tokens
token_budget -= num_new_tokens
long_prefill_budget -= 1
Reviewer comment (Contributor, severity: critical):
The `long_prefill_budget` is intended to limit the number of concurrent long prompt prefills. However, it is currently decremented for every scheduled prefill request, regardless of its length. This will cause the budget to be consumed by short requests, incorrectly preventing subsequent long requests from being scheduled. The budget should only be decremented for long requests.

Suggested change
long_prefill_budget -= 1
if num_new_tokens > self.vllm_config.scheduler_config.long_prefill_token_threshold:
long_prefill_budget -= 1
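
To see why this matters, a hedged sketch with hypothetical numbers (`long_prefill_token_threshold = 4096`, `max_long_partial_prefills = 1`): under the current code a short prefill exhausts the budget, so a later long request is skipped even though no long prefill is in flight.

```python
# Hypothetical scheduling order: one short request, then one long request.
long_prefill_token_threshold = 4096
long_prefill_budget = 1  # max_long_partial_prefills

for num_new_tokens in [512, 8000]:
    if num_new_tokens > long_prefill_token_threshold and long_prefill_budget <= 0:
        print(f"skipped: {num_new_tokens}-token request")  # the 8000-token request is skipped
        continue
    # Current code decrements unconditionally, so the 512-token request
    # consumes the budget; the suggested fix decrements only for long requests.
    long_prefill_budget -= 1
```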

request.status = RequestStatus.RUNNING
request.num_computed_tokens = num_computed_tokens
# Count the number of prefix cached tokens.