Skip to content

Commit 3ca4480

Browse files
authored
[V1] Fix assert failure when finishing a batch (#62)
* [V1] Fix assert failure when finishing a batch Signed-off-by: Wallas Santos <[email protected]> * update Dockerfile to use current latest vllm Signed-off-by: Wallas Santos <[email protected]> * fix dockerfile Signed-off-by: Wallas Santos <[email protected]> * fix: dockerfile build Signed-off-by: Wallas Santos <[email protected]> * disable stats for test and warn users Signed-off-by: Wallas Santos <[email protected]> * Add workaround for requests that does not fit in warmup shapes Revert "disable stats for test and warn users" This reverts commit 03cc587. Signed-off-by: Wallas Santos <[email protected]> * fix: change dummy token id Signed-off-by: Wallas Santos <[email protected]> * fix: more workarounds Signed-off-by: Wallas Santos <[email protected]> * fix linting Signed-off-by: Wallas Santos <[email protected]> * feat: upgrade vllm Signed-off-by: Wallas Santos <[email protected]> * trying to fix docker build Signed-off-by: Wallas Santos <[email protected]> --------- Signed-off-by: Wallas Santos <[email protected]>
1 parent e355aa7 commit 3ca4480

File tree

6 files changed

+39
-8
lines changed

6 files changed

+39
-8
lines changed

Dockerfile.spyre

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ RUN pip install torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cp
2323
# Install uv
2424
RUN pip install uv
2525
# Install the plugin in a new venv, along with dev deps to test with
26+
ENV VLLM_TARGET_DEVICE=empty
2627
RUN cd /workspace/vllm-spyre \
2728
&& uv venv .venv --system-site-packages \
2829
&& source .venv/bin/activate \
29-
&& VLLM_TARGET_DEVICE=empty uv pip install -v -e . --system \
30+
&& uv pip install -v -e . --system \
3031
&& uv sync --frozen --group dev
3132
ENV VLLM_PLUGINS=spyre
3233

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ override-dependencies = [
3939
]
4040

4141
[tool.uv.sources]
42-
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.0" }
42+
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.3" }
4343

4444
[tool.ruff]
4545
# Allow lines to be as long as 80.

tests/spyre_util.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,10 +163,14 @@ def generate_spyre_vllm_output(model: str, prompts: List[str],
163163
vllm_outputs = vllm_model.generate(prompts, sampling_params)
164164

165165
results = []
166+
166167
for req_output in vllm_outputs:
167168
result = {}
168169
result['text'] = req_output.outputs[0].text
169-
result['token_ids'] = tuple(req_output.outputs[0].token_ids)
170+
# TODO: Workaround for V1, if request does not fit in a warmup shape
171+
# token_ids may be filled with -1.
172+
token_ids = [t for t in req_output.outputs[0].token_ids if t >= 0]
173+
result['token_ids'] = tuple(token_ids)
170174
result['tokens'] = tuple([
171175
req_output.outputs[0].logprobs[i][t].decoded_token
172176
for i, t in enumerate(result['token_ids'])

tests/test_spyre_online.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from tests.spyre_util import (RemoteOpenAIServer, get_spyre_backend_list,
55
get_spyre_model_list)
6+
from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON
67

78

89
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -71,4 +72,11 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version):
7172
max_tokens=25)
7273

7374
assert len(completion.choices) == 1
75+
76+
# TODO: V0 and V1 have slightly different behavior for requests
77+
# that do not fit in a warmup shape
78+
7479
assert len(completion.choices[0].text) == 0
80+
if vllm_version == 'V1':
81+
assert completion.choices[0].stop_reason == \
82+
NO_WARMUP_FIT_STOP_REASON

vllm_spyre/v1/core/scheduler.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
logger = init_logger(__name__)
2424

25+
NO_WARMUP_FIT_STOP_REASON = "Request did not fit any warmup shape"
26+
2527

2628
class SpyreScheduler(Scheduler):
2729
"""Small extension of the V1 scheduler that adds constraints for Sypre:
@@ -185,11 +187,13 @@ def _reject_from_queue(self,
185187
for request in rejected_requests:
186188
queue.remove(request)
187189
reject_outputs.append(
188-
EngineCoreOutput(request.request_id,
189-
new_token_ids=[],
190-
finish_reason=FinishReason.ABORT,
191-
stop_reason="Request did not fit any warmup "
192-
"shape"))
190+
EngineCoreOutput(
191+
request.request_id,
192+
# TODO: FIXME
193+
# Dummy token to prevent stats collection crash
194+
new_token_ids=[-1],
195+
finish_reason=FinishReason.ABORT,
196+
stop_reason=NO_WARMUP_FIT_STOP_REASON))
193197
request.status = RequestStatus.FINISHED_ABORTED
194198
self._free_request(request)
195199
self.rejected_requests.remove(request.request_id)

vllm_spyre/v1/worker/spyre_model_runner.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,20 @@ def execute_model(
305305

306306
t0 = time.time()
307307

308+
# TODO: change to EMPTY_MODEL_RUNNER_OUTPUT, right now this
309+
# will be a breaking change, or clumsy to make retrocompatible
310+
# with conditional import
311+
if not scheduler_output.total_num_scheduled_tokens:
312+
# Return an empty ModelRunnerOutput if there's no work to do.
313+
return ModelRunnerOutput(
314+
req_ids=[],
315+
req_id_to_index={},
316+
sampled_token_ids=[],
317+
spec_token_ids=None,
318+
logprobs=None,
319+
prompt_logprobs_dict={},
320+
)
321+
308322
self._update_states(scheduler_output)
309323

310324
model_input = self.prepare_model_input(scheduler_output)

0 commit comments

Comments
 (0)