Skip to content

Commit 3ca4480

Browse files
authored
[V1] Fix assert failure when finishing a batch (#62)
* [V1] Fix assert failure when finishing a batch Signed-off-by: Wallas Santos <[email protected]> * update Dockerfile to use current latest vllm Signed-off-by: Wallas Santos <[email protected]> * fix dockerfile Signed-off-by: Wallas Santos <[email protected]> * fix: dockerfile build Signed-off-by: Wallas Santos <[email protected]> * disable stats for test and warn users Signed-off-by: Wallas Santos <[email protected]> * Add workaround for requests that does not fit in warmup shapes Revert "disable stats for test and warn users" This reverts commit 03cc587. Signed-off-by: Wallas Santos <[email protected]> * fix: change dummy token id Signed-off-by: Wallas Santos <[email protected]> * fix: more workarounds Signed-off-by: Wallas Santos <[email protected]> * fix linting Signed-off-by: Wallas Santos <[email protected]> * feat: upgrade vllm Signed-off-by: Wallas Santos <[email protected]> * trying to fix docker build Signed-off-by: Wallas Santos <[email protected]> --------- Signed-off-by: Wallas Santos <[email protected]>
1 parent e355aa7 commit 3ca4480

File tree

6 files changed

+39
-8
lines changed

6 files changed

+39
-8
lines changed

Dockerfile.spyre

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ RUN pip install torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cp
2323
# Install uv
2424
RUN pip install uv
2525
# Install the plugin in a new venv, along with dev deps to test with
26+
ENV VLLM_TARGET_DEVICE=empty
2627
RUN cd /workspace/vllm-spyre \
2728
&& uv venv .venv --system-site-packages \
2829
&& source .venv/bin/activate \
29-
&& VLLM_TARGET_DEVICE=empty uv pip install -v -e . --system \
30+
&& uv pip install -v -e . --system \
3031
&& uv sync --frozen --group dev
3132
ENV VLLM_PLUGINS=spyre
3233

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ override-dependencies = [
3939
]
4040

4141
[tool.uv.sources]
42-
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.0" }
42+
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.3" }
4343

4444
[tool.ruff]
4545
# Allow lines to be as long as 80.

tests/spyre_util.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,10 +163,14 @@ def generate_spyre_vllm_output(model: str, prompts: List[str],
163163
vllm_outputs = vllm_model.generate(prompts, sampling_params)
164164

165165
results = []
166+
166167
for req_output in vllm_outputs:
167168
result = {}
168169
result['text'] = req_output.outputs[0].text
169-
result['token_ids'] = tuple(req_output.outputs[0].token_ids)
170+
# TODO: Workaround for V1, if request does not fit in a warmup shape
171+
# token_ids may be filled with -1.
172+
token_ids = [t for t in req_output.outputs[0].token_ids if t >= 0]
173+
result['token_ids'] = tuple(token_ids)
170174
result['tokens'] = tuple([
171175
req_output.outputs[0].logprobs[i][t].decoded_token
172176
for i, t in enumerate(result['token_ids'])

tests/test_spyre_online.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from tests.spyre_util import (RemoteOpenAIServer, get_spyre_backend_list,
55
get_spyre_model_list)
6+
from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON
67

78

89
@pytest.mark.parametrize("model", get_spyre_model_list())
@@ -71,4 +72,11 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version):
7172
max_tokens=25)
7273

7374
assert len(completion.choices) == 1
75+
76+
# TODO: V0 and V1 have slightly different behavior for requests
77+
# that do not fit in a warmup shape
78+
7479
assert len(completion.choices[0].text) == 0
80+
if vllm_version == 'V1':
81+
assert completion.choices[0].stop_reason == \
82+
NO_WARMUP_FIT_STOP_REASON

vllm_spyre/v1/core/scheduler.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
logger = init_logger(__name__)
2424

25+
NO_WARMUP_FIT_STOP_REASON = "Request did not fit any warmup shape"
26+
2527

2628
class SpyreScheduler(Scheduler):
2729
"""Small extension of the V1 scheduler that adds constraints for Sypre:
@@ -185,11 +187,13 @@ def _reject_from_queue(self,
185187
for request in rejected_requests:
186188
queue.remove(request)
187189
reject_outputs.append(
188-
EngineCoreOutput(request.request_id,
189-
new_token_ids=[],
190-
finish_reason=FinishReason.ABORT,
191-
stop_reason="Request did not fit any warmup "
192-
"shape"))
190+
EngineCoreOutput(
191+
request.request_id,
192+
# TODO: FIXME
193+
# Dummy token to prevent stats collection crash
194+
new_token_ids=[-1],
195+
finish_reason=FinishReason.ABORT,
196+
stop_reason=NO_WARMUP_FIT_STOP_REASON))
193197
request.status = RequestStatus.FINISHED_ABORTED
194198
self._free_request(request)
195199
self.rejected_requests.remove(request.request_id)

vllm_spyre/v1/worker/spyre_model_runner.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,20 @@ def execute_model(
305305

306306
t0 = time.time()
307307

308+
# TODO: change to EMPTY_MODEL_RUNNER_OUTPUT, right now this
309+
# will be a breaking change, or clumsy to make retrocompatible
310+
# with conditional import
311+
if not scheduler_output.total_num_scheduled_tokens:
312+
# Return an empty ModelRunnerOutput if there's no work to do.
313+
return ModelRunnerOutput(
314+
req_ids=[],
315+
req_id_to_index={},
316+
sampled_token_ids=[],
317+
spec_token_ids=None,
318+
logprobs=None,
319+
prompt_logprobs_dict={},
320+
)
321+
308322
self._update_states(scheduler_output)
309323

310324
model_input = self.prepare_model_input(scheduler_output)

0 commit comments

Comments
 (0)