Add workaround for requests that do not fit in warmup shapes

wallashss · wallashss · commit 5fb35ad67c8f · 2025-04-04T10:27:50.000-03:00
Revert "disable stats for test and warn users" This reverts commit 03cc587. Signed-off-by: Wallas Santos <wallashss@ibm.com>
diff --git a/tests/spyre_util.py b/tests/spyre_util.py
@@ -64,10 +64,6 @@ def __init__(self,
         env = os.environ.copy()
         if env_dict is not None:
             env.update(env_dict)
-
-        # TODO: Re-enable stats for vllm-spyre plugin
-        # See: https://github.com/vllm-project/vllm-spyre/issues/68
-        vllm_serve_args.append("--disable-log-stats")
         self.proc = subprocess.Popen(
             ["vllm", "serve", model, *vllm_serve_args],
             env=env,
diff --git a/tests/test_spyre_online.py b/tests/test_spyre_online.py
@@ -3,6 +3,7 @@
 
 from tests.spyre_util import (RemoteOpenAIServer, get_spyre_backend_list,
                               get_spyre_model_list)
+from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON
 
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
@@ -71,4 +72,11 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version):
                                                max_tokens=25)
 
         assert len(completion.choices) == 1
-        assert len(completion.choices[0].text) == 0
+
+        # TODO: V0 and V1 have slightly different behavior for requests
+        # that do not fit in a warmup shape
+        if vllm_version == 'V0':
+            assert len(completion.choices[0].text) == 0
+        elif vllm_version == 'V1':
+            assert completion.choices[0].stop_reason == \
+                NO_WARMUP_FIT_STOP_REASON
diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py
@@ -22,6 +22,8 @@
 
 logger = init_logger(__name__)
 
+NO_WARMUP_FIT_STOP_REASON = "Request did not fit any warmup shape"
+
 
 class SpyreScheduler(Scheduler):
     """Small extension of the V1 scheduler that adds constraints for Sypre:
@@ -47,13 +49,6 @@ def __init__(self, *args, **kwargs) -> None:
 
         self.rejected_requests: set[str] = set()
 
-        if self.log_stats:
-            logger.warning_once(
-                "Log stats for V1 is not working properly. Requests that do "
-                "not fit in warmup shapes will crash the engine. "
-                "Pass --disable-log-stats to disable stats and this message. "
-                "See https://github.com/vllm-project/vllm-spyre/issues/68")
-
     def add_request(self, request: Request) -> None:
         """This override rejects requests that fit no warmup shape"""
         if len(
@@ -192,11 +187,13 @@ def _reject_from_queue(self,
         for request in rejected_requests:
             queue.remove(request)
             reject_outputs.append(
-                EngineCoreOutput(request.request_id,
-                                 new_token_ids=[],
-                                 finish_reason=FinishReason.ABORT,
-                                 stop_reason="Request did not fit any warmup "
-                                 "shape"))
+                EngineCoreOutput(
+                    request.request_id,
+                    # TODO: FIXME
+                    # Dummy token prevents stats collection crash
+                    new_token_ids=[0],
+                    finish_reason=FinishReason.ABORT,
+                    stop_reason=NO_WARMUP_FIT_STOP_REASON))
             request.status = RequestStatus.FINISHED_ABORTED
             self._free_request(request)
             self.rejected_requests.remove(request.request_id)