From 7fb18c88206ddc9bb119ed199af4fee2c8378250 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Sun, 8 Dec 2024 17:26:41 -0800
Subject: [PATCH 01/11] add OpenAI API support for input_audio

---
 tests/entrypoints/openai/test_audio.py | 131 ++++++++++++++++++++++++-
 vllm/entrypoints/chat_utils.py         |  76 ++++++++++++--
 2 files changed, 193 insertions(+), 14 deletions(-)

diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index a74109e2f512..fa08e1f51196 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -154,6 +154,61 @@ async def test_single_chat_session_audio_base64encoded(
     assert message.content is not None and len(message.content) >= 0
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_single_chat_session_input_audio(
+        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
+        base64_encoded_audio: Dict[str, str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        logprobs=True,
+        top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
@@ -211,11 +266,72 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
     assert "".join(chunks) == output
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
+                                          model_name: str, audio_url: str,
+                                          base64_encoded_audio: Dict[str,
+                                                                     str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks: List[str] = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
 async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
-                                 audio_url: str):
+                                 audio_url: str,
+                                 base64_encoded_audio: Dict[str, str]):
 
     messages = [{
         "role":
@@ -227,10 +343,17 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                     "url": audio_url
                 }
             },
+            # {
+            #     "type": "audio_url",
+            #     "audio_url": {
+            #         "url": audio_url
+            #     }
+            # },
             {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
                 }
             },
             {
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index c2054dcbfce0..98667671ab96 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -13,7 +13,8 @@
 # yapf conflicts with isort for this block
 # yapf: disable
 from openai.types.chat import (ChatCompletionAssistantMessageParam,
-                               ChatCompletionContentPartImageParam)
+                               ChatCompletionContentPartImageParam,
+                               ChatCompletionContentPartInputAudioParam)
 from openai.types.chat import (
     ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam)
 from openai.types.chat import (ChatCompletionContentPartRefusalParam,
@@ -92,6 +93,16 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
     audio_url: Required[str]
 
 
+class CustomChatCompletionContentInputAudioParam(TypedDict, total=False):
+    # Same as InputAudio type from https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion_content_part_input_audio_param.py
+    data: Required[str]
+    """Base64 encoded audio data."""
+
+    format: Required[Literal["wav", "mp3"]]
+    """The format of the encoded audio data.
+    Currently supports "wav" and "mp3"."""
+
+
 class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain audio_url.
@@ -105,6 +116,8 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
 ChatCompletionContentPartParam: TypeAlias = Union[
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
+    ChatCompletionContentPartInputAudioParam,
+    CustomChatCompletionContentInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
     CustomChatCompletionContentSimpleImageParam,
     CustomChatCompletionContentSimpleAudioParam,
@@ -519,6 +532,10 @@ def parse_image(self, image_url: str) -> None:
     def parse_audio(self, audio_url: str) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_video(self, video_url: str) -> None:
         raise NotImplementedError
@@ -545,6 +562,15 @@ def parse_audio(self, audio_url: str) -> None:
         placeholder = self._tracker.add("audio", audio)
         self._add_placeholder(placeholder)
 
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        input_audio_data = input_audio.get("data","")
+        input_audio_format = input_audio.get("format","")
+        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
+        audio = get_and_parse_audio(audio_url)
+
+        placeholder = self._tracker.add("audio", audio)
+        self._add_placeholder(placeholder)
+
     def parse_video(self, video_url: str) -> None:
         video = get_and_parse_video(video_url)
 
@@ -574,6 +600,15 @@ def parse_audio(self, audio_url: str) -> None:
         placeholder = self._tracker.add("audio", audio_coro)
         self._add_placeholder(placeholder)
 
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        input_audio_data = input_audio.get("data","")
+        input_audio_format = input_audio.get("format","")
+        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
+        audio_coro = async_get_and_parse_audio(audio_url)
+
+        placeholder = self._tracker.add("audio", audio_coro)
+        self._add_placeholder(placeholder)
+
     def parse_video(self, video_url: str) -> None:
         video = async_get_and_parse_video(video_url)
 
@@ -667,17 +702,22 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 _TextParser = partial(cast, ChatCompletionContentPartTextParam)
 _ImageParser = partial(cast, ChatCompletionContentPartImageParam)
 _AudioParser = partial(cast, ChatCompletionContentPartAudioParam)
+_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 _VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
 
 # Define a mapping from part types to their corresponding parsing functions.
-MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = {
+MM_PARSER_MAP: Dict[str,
+                    Callable[[ChatCompletionContentPartParam],
+                             Union[str, Dict[str,str]]]] = {
     "text":
     lambda part: _TextParser(part).get("text", ""),
     "image_url":
     lambda part: _ImageParser(part).get("image_url", {}).get("url", ""),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""),
+    "input_audio":
+    lambda part: _InputAudioParser(part).get("input_audio", {}),
     "refusal":
     lambda part: _RefusalParser(part).get("refusal", ""),
     "video_url":
@@ -686,7 +726,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 
 
 def _parse_chat_message_content_mm_part(
-        part: ChatCompletionContentPartParam) -> Tuple[str, str]:
+        part: ChatCompletionContentPartParam) -> Tuple[str,
+                                                       Union[str, Dict[str, str]]]:
     """
     Parses a given multi-modal content part based on its type.
 
@@ -717,6 +758,7 @@ def _parse_chat_message_content_mm_part(
         return part_type, content
 
     # Handle missing 'type' but provided direct URL fields.
+    # 'type' is required field by pydanic
     if part_type is None:
         if part.get("image_url") is not None:
             image_params = cast(CustomChatCompletionContentSimpleImageParam,
                                 part)
             return "image_url", image_params.get("image_url", "")
         if part.get("audio_url") is not None:
             audio_params = cast(CustomChatCompletionContentSimpleAudioParam,
                                 part)
             return "audio_url", audio_params.get("audio_url", "")
+        if part.get("input_audio") is not None:
+            input_audio_params = cast(Dict[str, str], part)
+            return "input_audio", input_audio_params
         if part.get("video_url") is not None:
             video_params = cast(CustomChatCompletionContentSimpleVideoParam,
                                 part)
@@ -739,7 +784,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "audio_url", "video_url")
+                                       "audio_url", "input_audio", "video_url")
 
 
 def _parse_chat_message_content_parts(
@@ -795,7 +840,7 @@ def _parse_chat_message_content_part(
     # Handle structured dictionary parts
     part_type, content = _parse_chat_message_content_mm_part(part)
 
-    # if part_type is text/refusal/image_url/audio_url/video_url but
+    # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
     # content is empty, log a warning and skip
     if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
         logger.warning(
@@ -804,18 +849,30 @@ def _parse_chat_message_content_part(
         return None
 
     if part_type in ("text", "refusal"):
-        return {'type': 'text', 'text': content} if wrap_dicts else content
+        str_content = cast(str, content)
+        if wrap_dicts:
+            return {'type': 'text', 'text': str_content}
+        else:
+            return str_content
 
     if part_type == "image_url":
-        mm_parser.parse_image(content)
+        str_content = cast(str, content)
+        mm_parser.parse_image(str_content)
         return {'type': 'image'} if wrap_dicts else None
 
     if part_type == "audio_url":
-        mm_parser.parse_audio(content)
+        str_content = cast(str, content)
+        mm_parser.parse_audio(str_content)
+        return {'type': 'audio'} if wrap_dicts else None
+
+    if part_type == "input_audio":
+        dict_content = cast(Dict[str, str], content)
+        mm_parser.parse_input_audio(dict_content)
         return {'type': 'audio'} if wrap_dicts else None
 
     if part_type == "video_url":
-        mm_parser.parse_video(content)
+        str_content = cast(str, content)
+        mm_parser.parse_video(str_content)
         return {'type': 'video'} if wrap_dicts else None
 
     raise NotImplementedError(f"Unknown part type: {part_type}")
@@ -840,7 +897,6 @@ def _parse_chat_message_content(
         content = [
             ChatCompletionContentPartTextParam(type="text", text=content)
         ]
-
     result = _parse_chat_message_content_parts(
         role,
         content,  # type: ignore

From fdf7abb7d2df206be22e74fc8a1751101441c570 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Mon, 9 Dec 2024 09:16:47 -0800
Subject: [PATCH 02/11] Use both audio_url and input_audio in the multiple audio test case

---
 tests/entrypoints/openai/test_audio.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index fa08e1f51196..f45333b5589f 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -343,12 +343,6 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                     "url": audio_url
                 }
             },
-            # {
-            #     "type": "audio_url",
-            #     "audio_url": {
-            #         "url": audio_url
-            #     }
-            # },
             {
                 "type": "input_audio",
                 "input_audio": {

From 6870507c8adca8ed865b184bbed390298569620f Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Tue, 10 Dec 2024 19:33:00 -0800
Subject: [PATCH 03/11] remove CustomChatCompletionContentInputAudioParam class

---
 vllm/entrypoints/chat_utils.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 98667671ab96..3966617a8a73 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -93,16 +93,6 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
     audio_url: Required[str]
 
 
-class CustomChatCompletionContentInputAudioParam(TypedDict, total=False):
-    # Same as InputAudio type from https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion_content_part_input_audio_param.py
-    data: Required[str]
-    """Base64 encoded audio data."""
-
-    format: Required[Literal["wav", "mp3"]]
-    """The format of the encoded audio data.
-    Currently supports "wav" and "mp3"."""
-
-
 class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain audio_url.
 
@@ -117,7 +107,6 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
 ChatCompletionContentPartParam: TypeAlias = Union[
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartInputAudioParam,
-    CustomChatCompletionContentInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
     CustomChatCompletionContentSimpleImageParam,
     CustomChatCompletionContentSimpleAudioParam,

From eb3e0c60779314f5b7ec46673f34462d3c796e67 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Thu, 12 Dec 2024 18:19:26 -0800
Subject: [PATCH 04/11] add examples and doc update

---
 .../serving/openai_compatible_server.md       |  6 ++---
 ...i_chat_completion_client_for_multimodal.py | 25 +++++++++++++++++++
 vllm/entrypoints/chat_utils.py                |  2 +-
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index d75e90807ca1..1f95916b2db2 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -34,9 +34,9 @@ We currently support the following OpenAI APIs:
 - [Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
   - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
     - *Note: `image_url.detail` parameter is not supported.*
-  - We also support `audio_url` content type for audio files.
-    - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema.
-    - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
+  - We support two audio content types.
+    - Support `audio_url` content type for audio files. Refer to [here](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py#L51) for the exact schema.
+    - Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
   - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API),
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index 0ec4f71dddf9..fb799ebf12d7 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -207,6 +207,31 @@ def run_audio() -> None:
     result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from base64 encoded audio:", result)
 
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        # Any format supported by librosa is supported
+                        "data": audio_base64,
+                        "format": "wav"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
 
 example_function_map = {
     "text-only": run_text_only,
     "single-image": run_single_image,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 3966617a8a73..524565e34c7a 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -747,7 +747,7 @@ def _parse_chat_message_content_mm_part(
         return part_type, content
 
     # Handle missing 'type' but provided direct URL fields.
-    # 'type' is required field by pydanic
+    # 'type' is required field by pydantic
     if part_type is None:
         if part.get("image_url") is not None:
             image_params = cast(CustomChatCompletionContentSimpleImageParam,

From d9cf0a54f9c20ecf5983c45111d6edfe54675e61 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Thu, 12 Dec 2024 20:31:15 -0800
Subject: [PATCH 05/11] format update

---
 examples/openai_chat_completion_client_for_multimodal.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index fb799ebf12d7..eea429d86d91 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -233,6 +233,7 @@ def run_audio() -> None:
     result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from input audio:", result)
 
+
 example_function_map = {
     "text-only": run_text_only,

From e5157ff357ae7d52c5354735a8ca6896b69fc9ff Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Thu, 12 Dec 2024 21:21:11 -0800
Subject: [PATCH 06/11] update input_audio ref to OAI doc

---
 docs/source/serving/openai_compatible_server.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 1f95916b2db2..8de7a746b182 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -35,8 +35,8 @@ We currently support the following OpenAI APIs:
   - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
     - *Note: `image_url.detail` parameter is not supported.*
   - We support two audio content types.
+    - Support `input_audio` content type as defined [here](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
     - Support `audio_url` content type for audio files. Refer to [here](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py#L51) for the exact schema.
-    - Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
   - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API),

From 43122fe9cf0a899a799d3a7bbc133178350b7956 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Fri, 13 Dec 2024 18:58:32 -0800
Subject: [PATCH 07/11] update multimedia input doc

---
 docs/source/usage/multimodal_inputs.rst | 84 ++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index c93f65327e31..a9f09ad2abd6 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -315,7 +315,89 @@ You can use `these tests `_.
+Here is a simple example using Ultravox-v0.3.
+
+First, launch the OpenAI-compatible server:
+
+.. code-block:: bash
+
+    vllm serve fixie-ai/ultravox-v0_3
+
+Then, you can use the OpenAI client as follows:
+
+.. code-block:: python
+
+    from openai import OpenAI
+    from vllm.assets.audio import AudioAsset
+    import requests, base64
+
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    # Any format supported by librosa is supported
+    audio_url = AudioAsset("winning_call").url
+    def encode_base64_content_from_url(content_url: str) -> str:
+        """Encode a content retrieved from a remote url to base64 format."""
+
+        with requests.get(content_url) as response:
+            response.raise_for_status()
+            result = base64.b64encode(response.content).decode('utf-8')
+
+        return result
+    audio_base64 = encode_base64_content_from_url(audio_url)
+
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": audio_base64,
+                        "format": "wav"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": audio_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from audio url:", result)
 
 A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_.

From 4331dd7119aba669dbeb2db7ff48f60f57301fc9 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Fri, 13 Dec 2024 19:02:19 -0800
Subject: [PATCH 08/11] update doc

---
 docs/source/usage/multimodal_inputs.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index a9f09ad2abd6..e7dd54bad258 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -352,7 +352,7 @@ Then, you can use the OpenAI client as follows:
         return result
     audio_base64 = encode_base64_content_from_url(audio_url)
 
-    chat_completion_from_url = client.chat.completions.create(
+    chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role": "user",
             "content": [
@@ -373,7 +373,7 @@ Then, you can use the OpenAI client as follows:
         max_completion_tokens=64,
     )
 
-    result = chat_completion_from_url.choices[0].message.content
+    result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from input audio:", result)
 
     chat_completion_from_url = client.chat.completions.create(

From b4e5eb944809abbda01606e00081b1f47dc946bb Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 16 Dec 2024 15:10:02 +0000
Subject: [PATCH 09/11] Update docs

Signed-off-by: DarkLight1337
---
 .../serving/openai_compatible_server.md       | 10 ++---
 docs/source/usage/multimodal_inputs.rst       |  4 ++
 ...i_chat_completion_client_for_multimodal.py | 38 ++++++++++---------
 3 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 41ab9ed14662..1bc8d32d2d16 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -34,11 +34,6 @@ We currently support the following OpenAI APIs:
   - *Note: `suffix` parameter is not supported.*
 - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
   - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template).
-  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
-    - *Note: `image_url.detail` parameter is not supported.*
-  - We support two audio content types.
-    - Support `input_audio` content type as defined [here](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
-    - Support `audio_url` content type for audio files. Refer to [here](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py#L51) for the exact schema.
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](#embeddings-api) (`/v1/embeddings`)
   - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`).
@@ -209,6 +204,11 @@ The following extra parameters are supported:
 
 Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details.
 
+We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
+[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
+see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information.
+- *Note: `image_url.detail` parameter is not supported.*
+
 #### Extra parameters
 
 The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index 08a053664675..417c86aad9dc 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -376,6 +376,10 @@ Then, you can use the OpenAI client as follows:
     result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from input audio:", result)
 
+Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input:
+
+.. code-block:: python
+
     chat_completion_from_url = client.chat.completions.create(
         messages=[{
             "role": "user",
             "content": [
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index eea429d86d91..6a160fd70423 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -153,11 +153,11 @@ def run_multi_image() -> None:
 
 # Audio input inference
 def run_audio() -> None:
-    # Any format supported by librosa is supported
     audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
 
-    # Use audio url in the payload
-    chat_completion_from_url = client.chat.completions.create(
+    # OpenAI-compatible schema (`input_audio`)
+    chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role":
             "user",
             "content": [
                 {
                     "type": "text",
                     "text": "What's in this audio?"
                 },
                 {
-                    "type": "audio_url",
-                    "audio_url": {
-                        "url": audio_url
+                    "type": "input_audio",
+                    "input_audio": {
+                        # Any format supported by librosa is supported
+                        "data": audio_base64,
+                        "format": "wav"
                     },
                 },
             ],
@@ -178,11 +180,11 @@ def run_audio() -> None:
         max_completion_tokens=64,
     )
 
-    result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from audio url:", result)
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
 
-    audio_base64 = encode_base64_content_from_url(audio_url)
-    chat_completion_from_base64 = client.chat.completions.create(
+    # HTTP URL
+    chat_completion_from_url = client.chat.completions.create(
         messages=[{
             "role":
             "user",
@@ -195,7 +197,7 @@ def run_audio() -> None:
                     "type": "audio_url",
                     "audio_url": {
                         # Any format supported by librosa is supported
-                        "url": f"data:audio/ogg;base64,{audio_base64}"
+                        "url": audio_url
                     },
                 },
             ],
@@ -204,9 +206,10 @@ def run_audio() -> None:
         max_completion_tokens=64,
     )
 
-    result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from base64 encoded audio:", result)
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from audio url:", result)
 
+    # base64 URL
     chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role":
             "user",
             "content": [
                 {
                     "type": "text",
                     "text": "What's in this audio?"
                 },
                 {
-                    "type": "input_audio",
-                    "input_audio": {
+                    "type": "audio_url",
+                    "audio_url": {
                         # Any format supported by librosa is supported
-                        "data": audio_base64,
-                        "format": "wav"
+                        "url": f"data:audio/ogg;base64,{audio_base64}"
                     },
                 },
             ],
@@ -233,7 +233,7 @@ def run_audio() -> None:
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from input audio:", result)
+    print("Chat completion output from base64 encoded audio:", result)
 
 
 example_function_map = {

From a695d75a8a2f36645f607cea9f6844f7c80bcb2e Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 16 Dec 2024 15:12:35 +0000
Subject: [PATCH 10/11] Fix typo

Signed-off-by: DarkLight1337
---
 docs/source/usage/multimodal_inputs.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index 417c86aad9dc..4cd259ce0709 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -315,7 +315,7 @@ You can use `these tests `_.
 
-Audio input is supported according to `OpenAI Audio API `_.
+Audio input is supported according to `OpenAI Audio API `_.
 Here is a simple example using Ultravox-v0.3.
 
 First, launch the OpenAI-compatible server:

From e8e6b6137c094bba6be3471122308e108fb08fac Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 16 Dec 2024 15:13:30 +0000
Subject: [PATCH 11/11] Format

Signed-off-by: DarkLight1337
---
 docs/source/usage/multimodal_inputs.rst | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index 4cd259ce0709..680382e457cc 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -328,9 +328,19 @@ Then, you can use the OpenAI client as follows:
 
 .. code-block:: python
 
+    import base64
+    import requests
     from openai import OpenAI
     from vllm.assets.audio import AudioAsset
-    import requests, base64
+
+    def encode_base64_content_from_url(content_url: str) -> str:
+        """Encode a content retrieved from a remote url to base64 format."""
+
+        with requests.get(content_url) as response:
+            response.raise_for_status()
+            result = base64.b64encode(response.content).decode('utf-8')
+
+        return result
 
     openai_api_key = "EMPTY"
     openai_api_base = "http://localhost:8000/v1"
@@ -342,14 +352,6 @@ Then, you can use the OpenAI client as follows:
 
     # Any format supported by librosa is supported
     audio_url = AudioAsset("winning_call").url
-    def encode_base64_content_from_url(content_url: str) -> str:
-        """Encode a content retrieved from a remote url to base64 format."""
-
-        with requests.get(content_url) as response:
-            response.raise_for_status()
-            result = base64.b64encode(response.content).decode('utf-8')
-
-        return result
     audio_base64 = encode_base64_content_from_url(audio_url)
 
     chat_completion_from_base64 = client.chat.completions.create(