Try fetching stop_reason from EngineOutput before checking the request

bnellnm · bnellnm · commit b80a1b607cf9 · 2025-02-11T19:02:44.000Z
Signed-off-by: Bill Nell <bill@neuralmagic.com>
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
@@ -179,11 +179,14 @@ def process_outputs(
             # in the EngineCore.
             req_state.is_prefilling = not new_token_ids
 
+            stop_reason = engine_core_output.stop_reason
+
             # 2) Detokenize the token ids into text and check for stop
             #    strings.
-            stop_reason = req_state.detokenizer.update(new_token_ids)
-            if stop_reason:
+            stop_string = req_state.detokenizer.update(new_token_ids)
+            if stop_string and finish_reason != FinishReason.STOP:
                 finish_reason = FinishReason.STOP
+                stop_reason = stop_string
 
             # 3) Compute sample and prompt logprobs for request,
             #    if required.