
Commit 460e4fa

refactor(open): use independent server/client
Signed-off-by: Max Wittig <[email protected]>
1 parent fda6e8e commit 460e4fa

File tree

8 files changed, +188 -112 lines changed

tests/entrypoints/openai/test_chat.py

Lines changed: 0 additions & 40 deletions
@@ -432,46 +432,6 @@ async def test_chat_completion_stream_options(
     assert last_completion_tokens == 10


-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
-)
-@pytest.mark.extra_server_args(['--enable-force-include-usage'])
-async def test_chat_with_enable_force_include_usage(client: openai.AsyncOpenAI,
-                                                    model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "What is the capital of France?"
-    }]
-
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_completion_tokens=10,
-        extra_body=dict(min_tokens=10),
-        temperature=0.0,
-        stream=True,
-    )
-    last_completion_tokens = 0
-    async for chunk in stream:
-        if not len(chunk.choices):
-            assert chunk.usage.prompt_tokens >= 0
-            assert last_completion_tokens == 0 or \
-                   chunk.usage.completion_tokens > last_completion_tokens or \
-                   (
-                       not chunk.choices and
-                       chunk.usage.completion_tokens == last_completion_tokens
-                   )
-            assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                                chunk.usage.completion_tokens)
-        else:
-            assert chunk.usage is None
-
-
 @pytest.mark.asyncio
 async def test_structured_outputs_choice_chat(
     client: openai.AsyncOpenAI,
Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import openai
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+
+@pytest.fixture(scope="module")
+def chat_server_with_force_include_usage(request):  # noqa: F811
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "128",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "1",
+        "--enable-force-include-usage",
+        "--port",
+        "55857",
+        "--gpu-memory-utilization",
+        "0.2",
+    ]
+
+    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
+    async with chat_server_with_force_include_usage.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_chat_with_enable_force_include_usage(
+    chat_client_with_force_include_usage: openai.AsyncOpenAI,
+):
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
+
+    stream = await chat_client_with_force_include_usage.chat.completions.create(
+        model="Qwen/Qwen3-0.6B",
+        messages=messages,
+        max_completion_tokens=10,
+        extra_body=dict(min_tokens=10),
+        temperature=0.0,
+        stream=True,
+    )
+    last_completion_tokens = 0
+    async for chunk in stream:
+        if not len(chunk.choices):
+            assert chunk.usage.prompt_tokens >= 0
+            assert (
+                last_completion_tokens == 0
+                or chunk.usage.completion_tokens > last_completion_tokens
+                or (
+                    not chunk.choices
+                    and chunk.usage.completion_tokens == last_completion_tokens
+                )
+            )
+            assert chunk.usage.total_tokens == (
+                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
+            )
+        else:
+            assert chunk.usage is None
+
+
+@pytest.fixture(scope="module")
+def transcription_server_with_force_include_usage():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-num-seqs",
+        "1",
+        "--enforce-eager",
+        "--enable-force-include-usage",
+        "--gpu-memory-utilization",
+        "0.2",
+    ]
+
+    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def transcription_client_with_force_include_usage(
+    transcription_server_with_force_include_usage,
+):
+    async with (
+        transcription_server_with_force_include_usage.get_async_client() as async_client
+    ):
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_transcription_with_enable_force_include_usage(
+    transcription_client_with_force_include_usage, winning_call
+):
+    res = (
+        await transcription_client_with_force_include_usage.audio.transcriptions.create(
+            model="openai/whisper-large-v3-turbo",
+            file=winning_call,
+            language="en",
+            temperature=0.0,
+            stream=True,
+            timeout=30,
+        )
+    )
+
+    async for chunk in res:
+        if not len(chunk.choices):
+            # final usage sent
+            usage = chunk.usage
+            assert isinstance(usage, dict)
+            assert usage["prompt_tokens"] > 0
+            assert usage["completion_tokens"] > 0
+            assert usage["total_tokens"] > 0
+        else:
+            assert not hasattr(chunk, "usage")
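
For context on what these new tests assert: with --enable-force-include-usage, the server attaches usage to the final streaming chunk even when the client never sets stream_options. A minimal client-side sketch of that behavior (the base URL, port, and API key are assumptions for illustration, not part of this commit):

import asyncio

import openai


async def main() -> None:
    # Assumes a vLLM server started with --enable-force-include-usage,
    # serving Qwen/Qwen3-0.6B on localhost:8000 (hypothetical setup).
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=[{"role": "user", "content": "What is the capital of France?"}],
        max_completion_tokens=10,
        stream=True,  # note: no stream_options={"include_usage": True}
    )
    async for chunk in stream:
        if not chunk.choices:
            # Final chunk carries usage because the server flag forces it.
            print(chunk.usage)


asyncio.run(main())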

tests/entrypoints/openai/test_transcription_validation.py

Lines changed: 1 addition & 27 deletions
@@ -27,10 +27,7 @@


 @pytest.fixture(scope="module")
-def server(request):
-    if marker := request.node.get_closest_marker("extra_server_args"):
-        SERVER_ARGS.append(marker.args[0])
-
+def server():
     with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
         yield remote_server

@@ -202,29 +199,6 @@ async def test_stream_options(winning_call, client):
     assert final and continuous


-@pytest.mark.asyncio
-@pytest.mark.extra_server_args(['--enable-force-include-usage'])
-async def test_transcription_with_enable_force_include_usage(
-        client, winning_call):
-    res = await client.audio.transcriptions.create(model=MODEL_NAME,
-                                                   file=winning_call,
-                                                   language="en",
-                                                   temperature=0.0,
-                                                   stream=True,
-                                                   timeout=30)
-
-    async for chunk in res:
-        if not len(chunk.choices):
-            # final usage sent
-            usage = chunk.usage
-            assert isinstance(usage, dict)
-            assert usage['prompt_tokens'] > 0
-            assert usage['completion_tokens'] > 0
-            assert usage['total_tokens'] > 0
-        else:
-            assert not hasattr(chunk, 'usage')
-
-
 @pytest.mark.asyncio
 async def test_sampling_params(mary_had_lamb, client):
     """

vllm/entrypoints/openai/run_batch.py

Lines changed: 36 additions & 25 deletions
@@ -102,12 +102,15 @@ def make_arg_parser(parser: FlexibleArgumentParser):
         "--enable-prompt-tokens-details",
         action="store_true",
         default=False,
-        help="If set to True, enable prompt_tokens_details in usage.")
-    parser.add_argument("--enable-force-include-usage",
-                        action='store_true',
-                        default=False,
-                        help="If set to True, include usage on every request "
-                        "(even when stream_options is not specified)")
+        help="If set to True, enable prompt_tokens_details in usage.",
+    )
+    parser.add_argument(
+        "--enable-force-include-usage",
+        action="store_true",
+        default=False,
+        help="If set to True, include usage on every request "
+        "(even when stream_options is not specified)",
+    )

     return parser

@@ -356,25 +359,33 @@ async def run_batch(
         base_model_paths=base_model_paths,
         lora_modules=None,
     )
-    openai_serving_chat = OpenAIServingChat(
-        engine_client,
-        model_config,
-        openai_serving_models,
-        args.response_role,
-        request_logger=request_logger,
-        chat_template=None,
-        chat_template_content_format="auto",
-        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-        enable_force_include_usage=args.enable_force_include_usage,
-    ) if "generate" in supported_tasks else None
-    openai_serving_embedding = OpenAIServingEmbedding(
-        engine_client,
-        model_config,
-        openai_serving_models,
-        request_logger=request_logger,
-        chat_template=None,
-        chat_template_content_format="auto",
-    ) if "embed" in supported_tasks else None
+    openai_serving_chat = (
+        OpenAIServingChat(
+            engine_client,
+            model_config,
+            openai_serving_models,
+            args.response_role,
+            request_logger=request_logger,
+            chat_template=None,
+            chat_template_content_format="auto",
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
+    openai_serving_embedding = (
+        OpenAIServingEmbedding(
+            engine_client,
+            model_config,
+            openai_serving_models,
+            request_logger=request_logger,
+            chat_template=None,
+            chat_template_content_format="auto",
+        )
+        if "embed" in supported_tasks
+        else None
+    )

     enable_serving_reranking = (
         "classify" in supported_tasks

vllm/entrypoints/openai/serving_chat.py

Lines changed: 0 additions & 1 deletion
@@ -59,7 +59,6 @@
 from vllm.entrypoints.openai.tool_parsers import ToolParser
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
 from vllm.entrypoints.utils import get_max_tokens, should_include_usage
-
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob

vllm/entrypoints/openai/serving_transcription.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ def __init__(
             return_tokens_as_token_ids=return_tokens_as_token_ids,
             task_type="transcribe",
             log_error_stack=log_error_stack,
-            enable_force_include_usage=enable_force_include_usage
+            enable_force_include_usage=enable_force_include_usage,
         )

     async def create_transcription(
@@ -107,7 +107,7 @@ def __init__(
             return_tokens_as_token_ids=return_tokens_as_token_ids,
             task_type="translate",
             log_error_stack=log_error_stack,
-            enable_force_include_usage=enable_force_include_usage
+            enable_force_include_usage=enable_force_include_usage,
         )

     async def create_translation(

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 11 additions & 8 deletions
@@ -246,10 +246,12 @@ async def _create_speech_to_text(
             return self.create_error_response(str(e))

     async def _speech_to_text_stream_generator(
-        self, request: SpeechToTextRequest,
-        list_result_generator: list[AsyncGenerator[RequestOutput,
-                                                   None]], request_id: str,
-        request_metadata: RequestResponseMetadata, audio_duration_s: float,
+        self,
+        request: SpeechToTextRequest,
+        list_result_generator: list[AsyncGenerator[RequestOutput, None]],
+        request_id: str,
+        request_metadata: RequestResponseMetadata,
+        audio_duration_s: float,
         chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
         response_stream_choice_class: type[TranscriptionResponseStreamChoice]
         | type[TranslationResponseStreamChoice],
@@ -262,11 +264,12 @@ async def _speech_to_text_stream_generator(
         completion_tokens = 0
         num_prompt_tokens = 0

-        include_usage = self.enable_force_include_usage or \
-            request.stream_include_usage
-        include_continuous_usage = request.stream_continuous_usage_stats \
-            if include_usage and request.stream_continuous_usage_stats \
+        include_usage = self.enable_force_include_usage or request.stream_include_usage
+        include_continuous_usage = (
+            request.stream_continuous_usage_stats
+            if include_usage and request.stream_continuous_usage_stats
             else False
+        )

         try:
             for result_generator in list_result_generator:
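
The reflowed conditional above keeps the original semantics of the speech-to-text usage flags. A standalone sketch of that logic, with plain booleans standing in for self.enable_force_include_usage and the request's stream fields (the names here are illustrative, not the real attributes):

def usage_flags(
    force_include_usage: bool, stream_include_usage: bool, continuous_stats: bool
) -> tuple[bool, bool]:
    # Mirrors the reformatted expressions in _speech_to_text_stream_generator.
    include_usage = force_include_usage or stream_include_usage
    include_continuous_usage = (
        continuous_stats if include_usage and continuous_stats else False
    )
    return include_usage, include_continuous_usage


assert usage_flags(True, False, False) == (True, False)   # forced by the server flag
assert usage_flags(False, True, True) == (True, True)     # client opted in to both
assert usage_flags(False, False, True) == (False, False)  # continuous stats need include_usage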

vllm/entrypoints/utils.py

Lines changed: 12 additions & 9 deletions
@@ -14,7 +14,11 @@

 from vllm.engine.arg_utils import EngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, StreamOptions
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    CompletionRequest,
+    StreamOptions,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
@@ -240,14 +244,13 @@ def log_non_default_args(args: Namespace | EngineArgs):


 def should_include_usage(
-    stream_options: StreamOptions | None,
-    enable_force_include_usage: bool) -> tuple[bool, bool]:
+    stream_options: StreamOptions | None, enable_force_include_usage: bool
+) -> tuple[bool, bool]:
     if stream_options:
-        include_usage = stream_options.include_usage \
-            or enable_force_include_usage
-        include_continuous_usage = include_usage and \
-            bool(stream_options.continuous_usage_stats)
+        include_usage = stream_options.include_usage or enable_force_include_usage
+        include_continuous_usage = include_usage and bool(
+            stream_options.continuous_usage_stats
+        )
     else:
-        include_usage, include_continuous_usage \
-            = enable_force_include_usage, False
+        include_usage, include_continuous_usage = enable_force_include_usage, False
     return include_usage, include_continuous_usage
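
A hedged sketch of how the tidied should_include_usage helper behaves, assuming a vLLM checkout on PYTHONPATH and the StreamOptions fields referenced in this diff:

from vllm.entrypoints.openai.protocol import StreamOptions
from vllm.entrypoints.utils import should_include_usage

# Without stream_options, usage is emitted only when the server flag forces it.
assert should_include_usage(None, enable_force_include_usage=True) == (True, False)
assert should_include_usage(None, enable_force_include_usage=False) == (False, False)

# With an explicit client opt-in, continuous stats ride on include_usage.
opts = StreamOptions(include_usage=True, continuous_usage_stats=True)
assert should_include_usage(opts, enable_force_include_usage=False) == (True, True)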

0 commit comments
