
Commit 460e4fa

refactor(open): use independent server/client
Signed-off-by: Max Wittig <[email protected]>
1 parent fda6e8e commit 460e4fa

File tree

8 files changed, +188 -112 lines changed

tests/entrypoints/openai/test_chat.py

Lines changed: 0 additions & 40 deletions
@@ -432,46 +432,6 @@ async def test_chat_completion_stream_options(
     assert last_completion_tokens == 10


-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
-)
-@pytest.mark.extra_server_args(['--enable-force-include-usage'])
-async def test_chat_with_enable_force_include_usage(client: openai.AsyncOpenAI,
-                                                    model_name: str):
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role": "user",
-        "content": "What is the capital of France?"
-    }]
-
-    stream = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_completion_tokens=10,
-        extra_body=dict(min_tokens=10),
-        temperature=0.0,
-        stream=True,
-    )
-    last_completion_tokens = 0
-    async for chunk in stream:
-        if not len(chunk.choices):
-            assert chunk.usage.prompt_tokens >= 0
-            assert last_completion_tokens == 0 or \
-                   chunk.usage.completion_tokens > last_completion_tokens or \
-                   (
-                       not chunk.choices and
-                       chunk.usage.completion_tokens == last_completion_tokens
-                   )
-            assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                                chunk.usage.completion_tokens)
-        else:
-            assert chunk.usage is None
-
-
 @pytest.mark.asyncio
 async def test_structured_outputs_choice_chat(
     client: openai.AsyncOpenAI,
Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import openai
+import pytest
+import pytest_asyncio
+
+from ...utils import RemoteOpenAIServer
+
+
+@pytest.fixture(scope="module")
+def chat_server_with_force_include_usage(request):  # noqa: F811
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "128",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "1",
+        "--enable-force-include-usage",
+        "--port",
+        "55857",
+        "--gpu-memory-utilization",
+        "0.2",
+    ]
+
+    with RemoteOpenAIServer("Qwen/Qwen3-0.6B", args, auto_port=False) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def chat_client_with_force_include_usage(chat_server_with_force_include_usage):
+    async with chat_server_with_force_include_usage.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_chat_with_enable_force_include_usage(
+    chat_client_with_force_include_usage: openai.AsyncOpenAI,
+):
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What is the capital of France?"},
+    ]
+
+    stream = await chat_client_with_force_include_usage.chat.completions.create(
+        model="Qwen/Qwen3-0.6B",
+        messages=messages,
+        max_completion_tokens=10,
+        extra_body=dict(min_tokens=10),
+        temperature=0.0,
+        stream=True,
+    )
+    last_completion_tokens = 0
+    async for chunk in stream:
+        if not len(chunk.choices):
+            assert chunk.usage.prompt_tokens >= 0
+            assert (
+                last_completion_tokens == 0
+                or chunk.usage.completion_tokens > last_completion_tokens
+                or (
+                    not chunk.choices
+                    and chunk.usage.completion_tokens == last_completion_tokens
+                )
+            )
+            assert chunk.usage.total_tokens == (
+                chunk.usage.prompt_tokens + chunk.usage.completion_tokens
+            )
+        else:
+            assert chunk.usage is None
+
+
+@pytest.fixture(scope="module")
+def transcription_server_with_force_include_usage():
+    args = [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-num-seqs",
+        "1",
+        "--enforce-eager",
+        "--enable-force-include-usage",
+        "--gpu-memory-utilization",
+        "0.2",
+    ]
+
+    with RemoteOpenAIServer("openai/whisper-large-v3-turbo", args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def transcription_client_with_force_include_usage(
+    transcription_server_with_force_include_usage,
+):
+    async with (
+        transcription_server_with_force_include_usage.get_async_client() as async_client
+    ):
+        yield async_client
+
+
+@pytest.mark.asyncio
+async def test_transcription_with_enable_force_include_usage(
+    transcription_client_with_force_include_usage, winning_call
+):
+    res = (
+        await transcription_client_with_force_include_usage.audio.transcriptions.create(
+            model="openai/whisper-large-v3-turbo",
+            file=winning_call,
+            language="en",
+            temperature=0.0,
+            stream=True,
+            timeout=30,
+        )
+    )
+
+    async for chunk in res:
+        if not len(chunk.choices):
+            # final usage sent
+            usage = chunk.usage
+            assert isinstance(usage, dict)
+            assert usage["prompt_tokens"] > 0
+            assert usage["completion_tokens"] > 0
+            assert usage["total_tokens"] > 0
+        else:
+            assert not hasattr(chunk, "usage")
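
For context on what these new tests assert: with --enable-force-include-usage, the server attaches usage to the final streaming chunk even when the client never sets stream_options. A minimal client-side sketch of that behavior (the base URL, port, and API key are assumptions for illustration, not part of this commit):

import asyncio

import openai


async def main() -> None:
    # Assumes a vLLM server started with --enable-force-include-usage,
    # serving Qwen/Qwen3-0.6B on localhost:8000 (hypothetical setup).
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    stream = await client.chat.completions.create(
        model="Qwen/Qwen3-0.6B",
        messages=[{"role": "user", "content": "What is the capital of France?"}],
        max_completion_tokens=10,
        stream=True,  # note: no stream_options={"include_usage": True}
    )
    async for chunk in stream:
        if not chunk.choices:
            # Final chunk carries usage because the server flag forces it.
            print(chunk.usage)


asyncio.run(main())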

tests/entrypoints/openai/test_transcription_validation.py

Lines changed: 1 addition & 27 deletions
@@ -27,10 +27,7 @@


 @pytest.fixture(scope="module")
-def server(request):
-    if marker := request.node.get_closest_marker("extra_server_args"):
-        SERVER_ARGS.append(marker.args[0])
-
+def server():
     with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
         yield remote_server

@@ -202,29 +199,6 @@ async def test_stream_options(winning_call, client):
     assert final and continuous


-@pytest.mark.asyncio
-@pytest.mark.extra_server_args(['--enable-force-include-usage'])
-async def test_transcription_with_enable_force_include_usage(
-        client, winning_call):
-    res = await client.audio.transcriptions.create(model=MODEL_NAME,
-                                                   file=winning_call,
-                                                   language="en",
-                                                   temperature=0.0,
-                                                   stream=True,
-                                                   timeout=30)
-
-    async for chunk in res:
-        if not len(chunk.choices):
-            # final usage sent
-            usage = chunk.usage
-            assert isinstance(usage, dict)
-            assert usage['prompt_tokens'] > 0
-            assert usage['completion_tokens'] > 0
-            assert usage['total_tokens'] > 0
-        else:
-            assert not hasattr(chunk, 'usage')
-
-
 @pytest.mark.asyncio
 async def test_sampling_params(mary_had_lamb, client):
     """

vllm/entrypoints/openai/run_batch.py

Lines changed: 36 additions & 25 deletions
@@ -102,12 +102,15 @@ def make_arg_parser(parser: FlexibleArgumentParser):
         "--enable-prompt-tokens-details",
         action="store_true",
         default=False,
-        help="If set to True, enable prompt_tokens_details in usage.")
-    parser.add_argument("--enable-force-include-usage",
-                        action='store_true',
-                        default=False,
-                        help="If set to True, include usage on every request "
-                        "(even when stream_options is not specified)")
+        help="If set to True, enable prompt_tokens_details in usage.",
+    )
+    parser.add_argument(
+        "--enable-force-include-usage",
+        action="store_true",
+        default=False,
+        help="If set to True, include usage on every request "
+        "(even when stream_options is not specified)",
+    )

     return parser

@@ -356,25 +359,33 @@ async def run_batch(
         base_model_paths=base_model_paths,
         lora_modules=None,
     )
-    openai_serving_chat = OpenAIServingChat(
-        engine_client,
-        model_config,
-        openai_serving_models,
-        args.response_role,
-        request_logger=request_logger,
-        chat_template=None,
-        chat_template_content_format="auto",
-        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
-        enable_force_include_usage=args.enable_force_include_usage,
-    ) if "generate" in supported_tasks else None
-    openai_serving_embedding = OpenAIServingEmbedding(
-        engine_client,
-        model_config,
-        openai_serving_models,
-        request_logger=request_logger,
-        chat_template=None,
-        chat_template_content_format="auto",
-    ) if "embed" in supported_tasks else None
+    openai_serving_chat = (
+        OpenAIServingChat(
+            engine_client,
+            model_config,
+            openai_serving_models,
+            args.response_role,
+            request_logger=request_logger,
+            chat_template=None,
+            chat_template_content_format="auto",
+            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+            enable_force_include_usage=args.enable_force_include_usage,
+        )
+        if "generate" in supported_tasks
+        else None
+    )
+    openai_serving_embedding = (
+        OpenAIServingEmbedding(
+            engine_client,
+            model_config,
+            openai_serving_models,
+            request_logger=request_logger,
+            chat_template=None,
+            chat_template_content_format="auto",
+        )
+        if "embed" in supported_tasks
+        else None
+    )

     enable_serving_reranking = (
         "classify" in supported_tasks

vllm/entrypoints/openai/serving_chat.py

Lines changed: 0 additions & 1 deletion
@@ -59,7 +59,6 @@
 from vllm.entrypoints.openai.tool_parsers import ToolParser
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
 from vllm.entrypoints.utils import get_max_tokens, should_include_usage
-
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob

vllm/entrypoints/openai/serving_transcription.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ def __init__(
             return_tokens_as_token_ids=return_tokens_as_token_ids,
             task_type="transcribe",
             log_error_stack=log_error_stack,
-            enable_force_include_usage=enable_force_include_usage
+            enable_force_include_usage=enable_force_include_usage,
         )

     async def create_transcription(
@@ -107,7 +107,7 @@ def __init__(
             return_tokens_as_token_ids=return_tokens_as_token_ids,
             task_type="translate",
             log_error_stack=log_error_stack,
-            enable_force_include_usage=enable_force_include_usage
+            enable_force_include_usage=enable_force_include_usage,
         )

     async def create_translation(

vllm/entrypoints/openai/speech_to_text.py

Lines changed: 11 additions & 8 deletions
@@ -246,10 +246,12 @@ async def _create_speech_to_text(
             return self.create_error_response(str(e))

     async def _speech_to_text_stream_generator(
-        self, request: SpeechToTextRequest,
-        list_result_generator: list[AsyncGenerator[RequestOutput,
-                                                   None]], request_id: str,
-        request_metadata: RequestResponseMetadata, audio_duration_s: float,
+        self,
+        request: SpeechToTextRequest,
+        list_result_generator: list[AsyncGenerator[RequestOutput, None]],
+        request_id: str,
+        request_metadata: RequestResponseMetadata,
+        audio_duration_s: float,
         chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
         response_stream_choice_class: type[TranscriptionResponseStreamChoice]
         | type[TranslationResponseStreamChoice],
@@ -262,11 +264,12 @@ async def _speech_to_text_stream_generator(
         completion_tokens = 0
         num_prompt_tokens = 0

-        include_usage = self.enable_force_include_usage or \
-            request.stream_include_usage
-        include_continuous_usage = request.stream_continuous_usage_stats \
-            if include_usage and request.stream_continuous_usage_stats \
+        include_usage = self.enable_force_include_usage or request.stream_include_usage
+        include_continuous_usage = (
+            request.stream_continuous_usage_stats
+            if include_usage and request.stream_continuous_usage_stats
             else False
+        )

         try:
             for result_generator in list_result_generator:
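
The reflowed conditional above keeps the original semantics of the speech-to-text usage flags. A standalone sketch of that logic, with plain booleans standing in for self.enable_force_include_usage and the request's stream fields (the names here are illustrative, not the real attributes):

def usage_flags(
    force_include_usage: bool, stream_include_usage: bool, continuous_stats: bool
) -> tuple[bool, bool]:
    # Mirrors the reformatted expressions in _speech_to_text_stream_generator.
    include_usage = force_include_usage or stream_include_usage
    include_continuous_usage = (
        continuous_stats if include_usage and continuous_stats else False
    )
    return include_usage, include_continuous_usage


assert usage_flags(True, False, False) == (True, False)   # forced by the server flag
assert usage_flags(False, True, True) == (True, True)     # client opted in to both
assert usage_flags(False, False, True) == (False, False)  # continuous stats need include_usage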

vllm/entrypoints/utils.py

Lines changed: 12 additions & 9 deletions
@@ -14,7 +14,11 @@

 from vllm.engine.arg_utils import EngineArgs
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, StreamOptions
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    CompletionRequest,
+    StreamOptions,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
@@ -240,14 +244,13 @@ def log_non_default_args(args: Namespace | EngineArgs):


 def should_include_usage(
-    stream_options: StreamOptions | None,
-    enable_force_include_usage: bool) -> tuple[bool, bool]:
+    stream_options: StreamOptions | None, enable_force_include_usage: bool
+) -> tuple[bool, bool]:
     if stream_options:
-        include_usage = stream_options.include_usage \
-            or enable_force_include_usage
-        include_continuous_usage = include_usage and \
-            bool(stream_options.continuous_usage_stats)
+        include_usage = stream_options.include_usage or enable_force_include_usage
+        include_continuous_usage = include_usage and bool(
+            stream_options.continuous_usage_stats
+        )
     else:
-        include_usage, include_continuous_usage \
-            = enable_force_include_usage, False
+        include_usage, include_continuous_usage = enable_force_include_usage, False
     return include_usage, include_continuous_usage
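
A hedged sketch of how the tidied should_include_usage helper behaves, assuming a vLLM checkout on PYTHONPATH and the StreamOptions fields referenced in this diff:

from vllm.entrypoints.openai.protocol import StreamOptions
from vllm.entrypoints.utils import should_include_usage

# Without stream_options, usage is emitted only when the server flag forces it.
assert should_include_usage(None, enable_force_include_usage=True) == (True, False)
assert should_include_usage(None, enable_force_include_usage=False) == (False, False)

# With an explicit client opt-in, continuous stats ride on include_usage.
opts = StreamOptions(include_usage=True, continuous_usage_stats=True)
assert should_include_usage(opts, enable_force_include_usage=False) == (True, True)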

0 commit comments
