Commit 77cd6a8

refactor(open): use independent server/client
Signed-off-by: Max Wittig <[email protected]>
1 parent fda6e8e commit 77cd6a8
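
The change drops the `extra_server_args` pytest marker and instead gives the force-include-usage tests their own dedicated server and client fixtures. A minimal sketch of that fixture pattern, assuming the RemoteOpenAIServer helper from tests/utils that the new file imports; the fixture names and server args below are simplified placeholders, not the exact ones added by the commit:

import pytest
import pytest_asyncio

from ...utils import RemoteOpenAIServer  # test helper already used by these entrypoint tests


@pytest.fixture(scope="module")
def independent_server():
    # Each test module starts its own server with exactly the flags it needs,
    # rather than appending marker-supplied args to a shared SERVER_ARGS list.
    args = ["--enforce-eager", "--enable-force-include-usage"]
    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def independent_client(independent_server):
    # The client fixture is derived from that server, so a test only requests the client.
    async with independent_server.get_async_client() as async_client:
        yield async_client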

File tree

3 files changed (+125, -67 lines)

tests/entrypoints/openai/test_chat.py

Lines changed: 0 additions & 40 deletions
@@ -432,46 +432,6 @@ async def test_chat_completion_stream_options(
         assert last_completion_tokens == 10


-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
-)
-@pytest.mark.extra_server_args(['--enable-force-include-usage'])
-async def test_chat_with_enable_force_include_usage(client: openai.AsyncOpenAI,
-                                                    model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "What is the capital of France?"
-    }]
-
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_completion_tokens=10,
-        extra_body=dict(min_tokens=10),
-        temperature=0.0,
-        stream=True,
-    )
-    last_completion_tokens = 0
-    async for chunk in stream:
-        if not len(chunk.choices):
-            assert chunk.usage.prompt_tokens >= 0
-            assert last_completion_tokens == 0 or \
-                chunk.usage.completion_tokens > last_completion_tokens or \
-                (
-                    not chunk.choices and
-                    chunk.usage.completion_tokens == last_completion_tokens
-                )
-            assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                                chunk.usage.completion_tokens)
-        else:
-            assert chunk.usage is None
-
-
 @pytest.mark.asyncio
 async def test_structured_outputs_choice_chat(
     client: openai.AsyncOpenAI,
Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import openai
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+
+@pytest.fixture(scope="module")
+def chat_server_with_force_include_usage(request): #noqa: F811
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "128",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "1",
+        "--enable-force-include-usage",
+        "--port",
+        "55857",
+        "--gpu-memory-utilization",
+        "0.2"
+    ]
+
+    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args,
+                            auto_port=False) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def chat_client_with_force_include_usage(
+        chat_server_with_force_include_usage):
+    async with chat_server_with_force_include_usage.get_async_client(
+    ) as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_chat_with_enable_force_include_usage(
+        chat_client_with_force_include_usage: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?"
+    }]
+
+    stream = await chat_client_with_force_include_usage.chat.completions.create(
+        model="Qwen/Qwen3-0.6B",
+        messages=messages,
+        max_completion_tokens=10,
+        extra_body=dict(min_tokens=10),
+        temperature=0.0,
+        stream=True,
+    )
+    last_completion_tokens = 0
+    async for chunk in stream:
+        if not len(chunk.choices):
+            assert chunk.usage.prompt_tokens >= 0
+            assert last_completion_tokens == 0 or \
+                chunk.usage.completion_tokens > last_completion_tokens or \
+                (
+                    not chunk.choices and
+                    chunk.usage.completion_tokens == last_completion_tokens
+                )
+            assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
+                                                chunk.usage.completion_tokens)
+        else:
+            assert chunk.usage is None
+
+
+@pytest.fixture(scope="module")
+def transcription_server_with_force_include_usage():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-num-seqs",
+        "1",
+        "--enforce-eager",
+        "--enable-force-include-usage",
+        "--gpu-memory-utilization",
+        "0.2"
+    ]
+
+    with RemoteOpenAIServer("openai/whisper-large-v3-turbo",
+                            args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def transcription_client_with_force_include_usage(
+        transcription_server_with_force_include_usage):
+    async with transcription_server_with_force_include_usage.get_async_client(
+    ) as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_transcription_with_enable_force_include_usage(
+        transcription_client_with_force_include_usage, winning_call):
+    res = await transcription_client_with_force_include_usage \
+        .audio.transcriptions.create(
+            model="openai/whisper-large-v3-turbo",
+            file=winning_call,
+            language="en",
+            temperature=0.0,
+            stream=True,
+            timeout=30)
+
+    async for chunk in res:
+        if not len(chunk.choices):
+            # final usage sent
+            usage = chunk.usage
+            assert isinstance(usage, dict)
+            assert usage['prompt_tokens'] > 0
+            assert usage['completion_tokens'] > 0
+            assert usage['total_tokens'] > 0
+        else:
+            assert not hasattr(chunk, 'usage')

tests/entrypoints/openai/test_transcription_validation.py

Lines changed: 1 addition & 27 deletions
@@ -27,10 +27,7 @@


 @pytest.fixture(scope="module")
-def server(request):
-    if marker := request.node.get_closest_marker("extra_server_args"):
-        SERVER_ARGS.append(marker.args[0])
-
+def server():
     with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
         yield remote_server

@@ -202,29 +199,6 @@ async def test_stream_options(winning_call, client):
     assert final and continuous


-@pytest.mark.asyncio
-@pytest.mark.extra_server_args(['--enable-force-include-usage'])
-async def test_transcription_with_enable_force_include_usage(
-        client, winning_call):
-    res = await client.audio.transcriptions.create(model=MODEL_NAME,
-                                                    file=winning_call,
-                                                    language="en",
-                                                    temperature=0.0,
-                                                    stream=True,
-                                                    timeout=30)
-
-    async for chunk in res:
-        if not len(chunk.choices):
-            # final usage sent
-            usage = chunk.usage
-            assert isinstance(usage, dict)
-            assert usage['prompt_tokens'] > 0
-            assert usage['completion_tokens'] > 0
-            assert usage['total_tokens'] > 0
-        else:
-            assert not hasattr(chunk, 'usage')
-
-
 @pytest.mark.asyncio
 async def test_sampling_params(mary_had_lamb, client):
     """

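A plausible reason for removing the marker-driven server(request) fixture in tests/entrypoints/openai/test_transcription_validation.py above (an assumption; the commit message only states that independent server/client fixtures are used): SERVER_ARGS.append(marker.args[0]) mutates a module-level list, so a flag requested by one test would remain in the args for every server started later in that module. A hypothetical, repo-independent sketch of that pitfall:

# Hypothetical illustration, not code from this repository.
SERVER_ARGS = ["--enforce-eager"]  # shared, module-level defaults


def args_for_test(extra_arg=None):
    if extra_arg:
        SERVER_ARGS.append(extra_arg)  # in-place mutation of the shared list
    return SERVER_ARGS


print(args_for_test("--enable-force-include-usage"))
# ['--enforce-eager', '--enable-force-include-usage']
print(args_for_test())
# the extra flag from the previous call is still present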