Commit 77cd6a8

refactor(open): use independent server/client
Signed-off-by: Max Wittig <[email protected]>
1 parent fda6e8e commit 77cd6a8
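
The change drops the `extra_server_args` pytest marker and instead gives the force-include-usage tests their own dedicated server and client fixtures. A minimal sketch of that fixture pattern, assuming the RemoteOpenAIServer helper from tests/utils that the new file imports; the fixture names and server args below are simplified placeholders, not the exact ones added by the commit:

import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer  # test helper already used by these entrypoint tests


@pytest.fixture(scope="module")
def independent_server():
    # Each test module starts its own server with exactly the flags it needs,
    # rather than appending marker-supplied args to a shared SERVER_ARGS list.
    args = ["--enforce-eager", "--enable-force-include-usage"]
    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def independent_client(independent_server):
    # The client fixture is derived from that server, so a test only requests the client.
    async with independent_server.get_async_client() as async_client:
        yield async_client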

File tree

3 files changed (+125, -67 lines)

tests/entrypoints/openai/test_chat.py

Lines changed: 0 additions & 40 deletions
@@ -432,46 +432,6 @@ async def test_chat_completion_stream_options(
         assert last_completion_tokens == 10


-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
-)
-@pytest.mark.extra_server_args(['--enable-force-include-usage'])
-async def test_chat_with_enable_force_include_usage(client: openai.AsyncOpenAI,
-                                                    model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "What is the capital of France?"
-    }]
-
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_completion_tokens=10,
-        extra_body=dict(min_tokens=10),
-        temperature=0.0,
-        stream=True,
-    )
-    last_completion_tokens = 0
-    async for chunk in stream:
-        if not len(chunk.choices):
-            assert chunk.usage.prompt_tokens >= 0
-            assert last_completion_tokens == 0 or \
-                chunk.usage.completion_tokens > last_completion_tokens or \
-                (
-                    not chunk.choices and
-                    chunk.usage.completion_tokens == last_completion_tokens
-                )
-            assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                                chunk.usage.completion_tokens)
-        else:
-            assert chunk.usage is None
-
-
 @pytest.mark.asyncio
 async def test_structured_outputs_choice_chat(
     client: openai.AsyncOpenAI,
Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import openai
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+
+@pytest.fixture(scope="module")
+def chat_server_with_force_include_usage(request): #noqa: F811
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "128",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "1",
+        "--enable-force-include-usage",
+        "--port",
+        "55857",
+        "--gpu-memory-utilization",
+        "0.2"
+    ]
+
+    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args,
+                            auto_port=False) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def chat_client_with_force_include_usage(
+        chat_server_with_force_include_usage):
+    async with chat_server_with_force_include_usage.get_async_client(
+    ) as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_chat_with_enable_force_include_usage(
+        chat_client_with_force_include_usage: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?"
+    }]
+
+    stream = await chat_client_with_force_include_usage.chat.completions.create(
+        model="Qwen/Qwen3-0.6B",
+        messages=messages,
+        max_completion_tokens=10,
+        extra_body=dict(min_tokens=10),
+        temperature=0.0,
+        stream=True,
+    )
+    last_completion_tokens = 0
+    async for chunk in stream:
+        if not len(chunk.choices):
+            assert chunk.usage.prompt_tokens >= 0
+            assert last_completion_tokens == 0 or \
+                chunk.usage.completion_tokens > last_completion_tokens or \
+                (
+                    not chunk.choices and
+                    chunk.usage.completion_tokens == last_completion_tokens
+                )
+            assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                                chunk.usage.completion_tokens)
+        else:
+            assert chunk.usage is None
+
+
+@pytest.fixture(scope="module")
+def transcription_server_with_force_include_usage():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-num-seqs",
+        "1",
+        "--enforce-eager",
+        "--enable-force-include-usage",
+        "--gpu-memory-utilization",
+        "0.2"
+    ]
+
+    with RemoteOpenAIServer("openai/whisper-large-v3-turbo",
+                            args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def transcription_client_with_force_include_usage(
+        transcription_server_with_force_include_usage):
+    async with transcription_server_with_force_include_usage.get_async_client(
+    ) as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_transcription_with_enable_force_include_usage(
+        transcription_client_with_force_include_usage, winning_call):
+    res = await transcription_client_with_force_include_usage \
+        .audio.transcriptions.create(
+            model="openai/whisper-large-v3-turbo",
+            file=winning_call,
+            language="en",
+            temperature=0.0,
+            stream=True,
+            timeout=30)
+
+    async for chunk in res:
+        if not len(chunk.choices):
+            # final usage sent
+            usage = chunk.usage
+            assert isinstance(usage, dict)
+            assert usage['prompt_tokens'] > 0
+            assert usage['completion_tokens'] > 0
+            assert usage['total_tokens'] > 0
+        else:
+            assert not hasattr(chunk, 'usage')

tests/entrypoints/openai/test_transcription_validation.py

Lines changed: 1 addition & 27 deletions
@@ -27,10 +27,7 @@


 @pytest.fixture(scope="module")
-def server(request):
-    if marker := request.node.get_closest_marker("extra_server_args"):
-        SERVER_ARGS.append(marker.args[0])
-
+def server():
     with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
         yield remote_server

@@ -202,29 +199,6 @@ async def test_stream_options(winning_call, client):
     assert final and continuous


-@pytest.mark.asyncio
-@pytest.mark.extra_server_args(['--enable-force-include-usage'])
-async def test_transcription_with_enable_force_include_usage(
-        client, winning_call):
-    res = await client.audio.transcriptions.create(model=MODEL_NAME,
-                                                    file=winning_call,
-                                                    language="en",
-                                                    temperature=0.0,
-                                                    stream=True,
-                                                    timeout=30)
-
-    async for chunk in res:
-        if not len(chunk.choices):
-            # final usage sent
-            usage = chunk.usage
-            assert isinstance(usage, dict)
-            assert usage['prompt_tokens'] > 0
-            assert usage['completion_tokens'] > 0
-            assert usage['total_tokens'] > 0
-        else:
-            assert not hasattr(chunk, 'usage')
-
-
 @pytest.mark.asyncio
 async def test_sampling_params(mary_had_lamb, client):
     """

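A plausible reason for removing the marker-driven server(request) fixture in tests/entrypoints/openai/test_transcription_validation.py above (an assumption; the commit message only states that independent server/client fixtures are used): SERVER_ARGS.append(marker.args[0]) mutates a module-level list, so a flag requested by one test would remain in the args for every server started later in that module. A hypothetical, repo-independent sketch of that pitfall:

# Hypothetical illustration, not code from this repository.
SERVER_ARGS = ["--enforce-eager"]  # shared, module-level defaults


def args_for_test(extra_arg=None):
    if extra_arg:
        SERVER_ARGS.append(extra_arg)  # in-place mutation of the shared list
    return SERVER_ARGS


print(args_for_test("--enable-force-include-usage"))
# ['--enforce-eager', '--enable-force-include-usage']
print(args_for_test())
# the extra flag from the previous call is still present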