From 7fb18c88206ddc9bb119ed199af4fee2c8378250 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Sun, 8 Dec 2024 17:26:41 -0800
Subject: [PATCH 01/11] add OpenAI API support for input_audio

---
 tests/entrypoints/openai/test_audio.py | 131 ++++++++++++++++++++++++-
 vllm/entrypoints/chat_utils.py         |  76 ++++++++++++--
 2 files changed, 193 insertions(+), 14 deletions(-)

diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index a74109e2f512..fa08e1f51196 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -154,6 +154,61 @@ async def test_single_chat_session_audio_base64encoded(
     assert message.content is not None and len(message.content) >= 0
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_single_chat_session_input_audio(
+        client: openai.AsyncOpenAI, model_name: str, audio_url: str,
+        base64_encoded_audio: Dict[str, str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        logprobs=True,
+        top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
@@ -211,11 +266,72 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
     assert "".join(chunks) == output
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
+async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
+                                          model_name: str, audio_url: str,
+                                          base64_encoded_audio: Dict[str,
+                                                                     str]):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's happening in this audio?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks: List[str] = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 @pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
 async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
-                                 audio_url: str):
+                                 audio_url: str,
+                                 base64_encoded_audio: Dict[str, str]):
 
     messages = [{
         "role":
@@ -227,10 +343,17 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                     "url": audio_url
                 }
             },
+            # {
+            #     "type": "audio_url",
+            #     "audio_url": {
+            #         "url": audio_url
+            #     }
+            # },
             {
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio_url
+                "type": "input_audio",
+                "input_audio": {
+                    "data": base64_encoded_audio[audio_url],
+                    "format": "wav"
                 }
             },
             {
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index c2054dcbfce0..98667671ab96 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -13,7 +13,8 @@
 # yapf conflicts with isort for this block
 # yapf: disable
 from openai.types.chat import (ChatCompletionAssistantMessageParam,
-                               ChatCompletionContentPartImageParam)
+                               ChatCompletionContentPartImageParam,
+                               ChatCompletionContentPartInputAudioParam)
 from openai.types.chat import (
     ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam)
 from openai.types.chat import (ChatCompletionContentPartRefusalParam,
@@ -92,6 +93,16 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
     audio_url: Required[str]
 
 
+class CustomChatCompletionContentInputAudioParam(TypedDict, total=False):
+    # Same as InputAudio type from https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion_content_part_input_audio_param.py
+    data: Required[str]
+    """Base64 encoded audio data."""
+
+    format: Required[Literal["wav", "mp3"]]
+    """The format of the encoded audio data.
+    Currently supports "wav" and "mp3"."""
+
+
 class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain audio_url.
@@ -105,6 +116,8 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
 ChatCompletionContentPartParam: TypeAlias = Union[
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
+    ChatCompletionContentPartInputAudioParam,
+    CustomChatCompletionContentInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
     CustomChatCompletionContentSimpleImageParam,
     CustomChatCompletionContentSimpleAudioParam,
@@ -519,6 +532,10 @@ def parse_image(self, image_url: str) -> None:
     def parse_audio(self, audio_url: str) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_video(self, video_url: str) -> None:
         raise NotImplementedError
@@ -545,6 +562,15 @@ def parse_audio(self, audio_url: str) -> None:
         placeholder = self._tracker.add("audio", audio)
         self._add_placeholder(placeholder)
 
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        input_audio_data = input_audio.get("data","")
+        input_audio_format = input_audio.get("format","")
+        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
+        audio = get_and_parse_audio(audio_url)
+
+        placeholder = self._tracker.add("audio", audio)
+        self._add_placeholder(placeholder)
+
     def parse_video(self, video_url: str) -> None:
         video = get_and_parse_video(video_url)
 
@@ -574,6 +600,15 @@ def parse_audio(self, audio_url: str) -> None:
         placeholder = self._tracker.add("audio", audio_coro)
         self._add_placeholder(placeholder)
 
+    def parse_input_audio(self, input_audio: Dict[str, str]) -> None:
+        input_audio_data = input_audio.get("data","")
+        input_audio_format = input_audio.get("format","")
+        audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}"
+        audio_coro = async_get_and_parse_audio(audio_url)
+
+        placeholder = self._tracker.add("audio", audio_coro)
+        self._add_placeholder(placeholder)
+
     def parse_video(self, video_url: str) -> None:
         video = async_get_and_parse_video(video_url)
 
@@ -667,17 +702,22 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 _TextParser = partial(cast, ChatCompletionContentPartTextParam)
 _ImageParser = partial(cast, ChatCompletionContentPartImageParam)
 _AudioParser = partial(cast, ChatCompletionContentPartAudioParam)
+_InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
 _VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
 
 # Define a mapping from part types to their corresponding parsing functions.
-MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = {
+MM_PARSER_MAP: Dict[str,
+                    Callable[[ChatCompletionContentPartParam],
+                             Union[str, Dict[str,str]]]] = {
     "text":
     lambda part: _TextParser(part).get("text", ""),
     "image_url":
     lambda part: _ImageParser(part).get("image_url", {}).get("url", ""),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""),
+    "input_audio":
+    lambda part: _InputAudioParser(part).get("input_audio", {}),
     "refusal":
     lambda part: _RefusalParser(part).get("refusal", ""),
     "video_url":
@@ -686,7 +726,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 
 
 def _parse_chat_message_content_mm_part(
-        part: ChatCompletionContentPartParam) -> Tuple[str, str]:
+        part: ChatCompletionContentPartParam) -> Tuple[str,
+                                                       Union[str, Dict[str, str]]]:
     """
     Parses a given multi-modal content part based on its type.
 
@@ -717,6 +758,7 @@ def _parse_chat_message_content_mm_part(
         return part_type, content
 
     # Handle missing 'type' but provided direct URL fields.
+    # 'type' is required field by pydanic
     if part_type is None:
         if part.get("image_url") is not None:
             image_params = cast(CustomChatCompletionContentSimpleImageParam,
                                 part)
             return "image_url", image_params.get("image_url", "")
         if part.get("audio_url") is not None:
             audio_params = cast(CustomChatCompletionContentSimpleAudioParam,
                                 part)
             return "audio_url", audio_params.get("audio_url", "")
+        if part.get("input_audio") is not None:
+            input_audio_params = cast(Dict[str, str], part)
+            return "input_audio", input_audio_params
         if part.get("video_url") is not None:
             video_params = cast(CustomChatCompletionContentSimpleVideoParam,
                                 part)
@@ -739,7 +784,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "audio_url", "video_url")
+                                       "audio_url", "input_audio", "video_url")
 
 
 def _parse_chat_message_content_parts(
@@ -795,7 +840,7 @@ def _parse_chat_message_content_part(
     # Handle structured dictionary parts
     part_type, content = _parse_chat_message_content_mm_part(part)
 
-    # if part_type is text/refusal/image_url/audio_url/video_url but
+    # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
     # content is empty, log a warning and skip
     if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
         logger.warning(
@@ -804,18 +849,30 @@ def _parse_chat_message_content_part(
         return None
 
     if part_type in ("text", "refusal"):
-        return {'type': 'text', 'text': content} if wrap_dicts else content
+        str_content = cast(str, content)
+        if wrap_dicts:
+            return {'type': 'text', 'text': str_content}
+        else:
+            return str_content
 
     if part_type == "image_url":
-        mm_parser.parse_image(content)
+        str_content = cast(str, content)
+        mm_parser.parse_image(str_content)
         return {'type': 'image'} if wrap_dicts else None
 
     if part_type == "audio_url":
-        mm_parser.parse_audio(content)
+        str_content = cast(str, content)
+        mm_parser.parse_audio(str_content)
+        return {'type': 'audio'} if wrap_dicts else None
+
+    if part_type == "input_audio":
+        dict_content = cast(Dict[str, str], content)
+        mm_parser.parse_input_audio(dict_content)
         return {'type': 'audio'} if wrap_dicts else None
 
     if part_type == "video_url":
-        mm_parser.parse_video(content)
+        str_content = cast(str, content)
+        mm_parser.parse_video(str_content)
         return {'type': 'video'} if wrap_dicts else None
 
     raise NotImplementedError(f"Unknown part type: {part_type}")
@@ -840,7 +897,6 @@ def _parse_chat_message_content(
         content = [
             ChatCompletionContentPartTextParam(type="text", text=content)
         ]
-
     result = _parse_chat_message_content_parts(
         role,
         content,  # type: ignore

From fdf7abb7d2df206be22e74fc8a1751101441c570 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Mon, 9 Dec 2024 09:16:47 -0800
Subject: [PATCH 02/11] Use both audio_url and input_audio in the multiple audio test case

---
 tests/entrypoints/openai/test_audio.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index fa08e1f51196..f45333b5589f 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -343,12 +343,6 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
                     "url": audio_url
                 }
             },
-            # {
-            #     "type": "audio_url",
-            #     "audio_url": {
-            #         "url": audio_url
-            #     }
-            # },
             {
                 "type": "input_audio",
                 "input_audio": {

From 6870507c8adca8ed865b184bbed390298569620f Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Tue, 10 Dec 2024 19:33:00 -0800
Subject: [PATCH 03/11] remove CustomChatCompletionContentInputAudioParam class

---
 vllm/entrypoints/chat_utils.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 98667671ab96..3966617a8a73 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -93,16 +93,6 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
     audio_url: Required[str]
 
 
-class CustomChatCompletionContentInputAudioParam(TypedDict, total=False):
-    # Same as InputAudio type from https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion_content_part_input_audio_param.py
-    data: Required[str]
-    """Base64 encoded audio data."""
-
-    format: Required[Literal["wav", "mp3"]]
-    """The format of the encoded audio data.
-    Currently supports "wav" and "mp3"."""
-
-
 class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain audio_url.
 
@@ -117,7 +107,6 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
 ChatCompletionContentPartParam: TypeAlias = Union[
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartInputAudioParam,
-    CustomChatCompletionContentInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
     CustomChatCompletionContentSimpleImageParam,
     CustomChatCompletionContentSimpleAudioParam,

From eb3e0c60779314f5b7ec46673f34462d3c796e67 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Thu, 12 Dec 2024 18:19:26 -0800
Subject: [PATCH 04/11] add examples and doc update

---
 .../serving/openai_compatible_server.md       |  6 ++---
 ...i_chat_completion_client_for_multimodal.py | 25 +++++++++++++++++++
 vllm/entrypoints/chat_utils.py                |  2 +-
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index d75e90807ca1..1f95916b2db2 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -34,9 +34,9 @@ We currently support the following OpenAI APIs:
 - [Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
   - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
     - *Note: `image_url.detail` parameter is not supported.*
-  - We also support `audio_url` content type for audio files.
-    - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema.
-    - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
+  - We support two audio content types.
+    - Support `audio_url` content type for audio files. Refer to [here](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py#L51) for the exact schema.
+    - Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
   - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API),
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index 0ec4f71dddf9..fb799ebf12d7 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -207,6 +207,31 @@ def run_audio() -> None:
     result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from base64 encoded audio:", result)
 
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        # Any format supported by librosa is supported
+                        "data": audio_base64,
+                        "format": "wav"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
 
 example_function_map = {
     "text-only": run_text_only,
     "single-image": run_single_image,
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 3966617a8a73..524565e34c7a 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -747,7 +747,7 @@ def _parse_chat_message_content_mm_part(
         return part_type, content
 
     # Handle missing 'type' but provided direct URL fields.
-    # 'type' is required field by pydanic
+    # 'type' is required field by pydantic
     if part_type is None:
         if part.get("image_url") is not None:
             image_params = cast(CustomChatCompletionContentSimpleImageParam,

From d9cf0a54f9c20ecf5983c45111d6edfe54675e61 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Thu, 12 Dec 2024 20:31:15 -0800
Subject: [PATCH 05/11] format update

---
 examples/openai_chat_completion_client_for_multimodal.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index fb799ebf12d7..eea429d86d91 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -233,6 +233,7 @@ def run_audio() -> None:
     result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from input audio:", result)
 
+
 example_function_map = {
     "text-only": run_text_only,

From e5157ff357ae7d52c5354735a8ca6896b69fc9ff Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Thu, 12 Dec 2024 21:21:11 -0800
Subject: [PATCH 06/11] update input_audio ref to OAI doc

---
 docs/source/serving/openai_compatible_server.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 1f95916b2db2..8de7a746b182 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -35,8 +35,8 @@ We currently support the following OpenAI APIs:
   - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
     - *Note: `image_url.detail` parameter is not supported.*
   - We support two audio content types.
+    - Support `input_audio` content type as defined [here](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
     - Support `audio_url` content type for audio files. Refer to [here](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py#L51) for the exact schema.
-    - Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
   - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API),

From 43122fe9cf0a899a799d3a7bbc133178350b7956 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Fri, 13 Dec 2024 18:58:32 -0800
Subject: [PATCH 07/11] update multimedia input doc

---
 docs/source/usage/multimodal_inputs.rst | 84 ++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index c93f65327e31..a9f09ad2abd6 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -315,7 +315,89 @@ You can use `these tests `_.
+Here is a simple example using Ultravox-v0.3.
+
+First, launch the OpenAI-compatible server:
+
+.. code-block:: bash
+
+    vllm serve fixie-ai/ultravox-v0_3
+
+Then, you can use the OpenAI client as follows:
+
+.. code-block:: python
+
+    from openai import OpenAI
+    from vllm.assets.audio import AudioAsset
+    import requests, base64
+
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    # Any format supported by librosa is supported
+    audio_url = AudioAsset("winning_call").url
+    def encode_base64_content_from_url(content_url: str) -> str:
+        """Encode a content retrieved from a remote url to base64 format."""
+
+        with requests.get(content_url) as response:
+            response.raise_for_status()
+            result = base64.b64encode(response.content).decode('utf-8')
+
+        return result
+    audio_base64 = encode_base64_content_from_url(audio_url)
+
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "input_audio",
+                    "input_audio": {
+                        "data": audio_base64,
+                        "format": "wav"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this audio?"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": audio_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from audio url:", result)
 
 A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_.

From 4331dd7119aba669dbeb2db7ff48f60f57301fc9 Mon Sep 17 00:00:00 2001
From: Kyle Huang
Date: Fri, 13 Dec 2024 19:02:19 -0800
Subject: [PATCH 08/11] update doc

---
 docs/source/usage/multimodal_inputs.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index a9f09ad2abd6..e7dd54bad258 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -352,7 +352,7 @@ Then, you can use the OpenAI client as follows:
         return result
     audio_base64 = encode_base64_content_from_url(audio_url)
 
-    chat_completion_from_url = client.chat.completions.create(
+    chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role": "user",
             "content": [
@@ -373,7 +373,7 @@ Then, you can use the OpenAI client as follows:
         max_completion_tokens=64,
     )
 
-    result = chat_completion_from_url.choices[0].message.content
+    result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from input audio:", result)
 
     chat_completion_from_url = client.chat.completions.create(

From b4e5eb944809abbda01606e00081b1f47dc946bb Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 16 Dec 2024 15:10:02 +0000
Subject: [PATCH 09/11] Update docs

Signed-off-by: DarkLight1337
---
 .../serving/openai_compatible_server.md       | 10 ++---
 docs/source/usage/multimodal_inputs.rst       |  4 ++
 ...i_chat_completion_client_for_multimodal.py | 38 ++++++++++---------
 3 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 41ab9ed14662..1bc8d32d2d16 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -34,11 +34,6 @@ We currently support the following OpenAI APIs:
   - *Note: `suffix` parameter is not supported.*
 - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
   - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template).
-  - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst).
-    - *Note: `image_url.detail` parameter is not supported.*
-  - We support two audio content types.
-    - Support `input_audio` content type as defined [here](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
-    - Support `audio_url` content type for audio files. Refer to [here](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py#L51) for the exact schema.
   - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
 - [Embeddings API](#embeddings-api) (`/v1/embeddings`)
   - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`).
@@ -209,6 +204,11 @@ The following extra parameters are supported:
 
 Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference/chat) for more details.
 
+We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
+[Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
+see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information.
+- *Note: `image_url.detail` parameter is not supported.*
+
 #### Extra parameters
 
 The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index 08a053664675..417c86aad9dc 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -376,6 +376,10 @@ Then, you can use the OpenAI client as follows:
     result = chat_completion_from_base64.choices[0].message.content
     print("Chat completion output from input audio:", result)
 
+Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input:
+
+.. code-block:: python
+
     chat_completion_from_url = client.chat.completions.create(
         messages=[{
             "role": "user",
             "content": [
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index eea429d86d91..6a160fd70423 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -153,11 +153,11 @@ def run_multi_image() -> None:
 
 # Audio input inference
 def run_audio() -> None:
-    # Any format supported by librosa is supported
     audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
 
-    # Use audio url in the payload
-    chat_completion_from_url = client.chat.completions.create(
+    # OpenAI-compatible schema (`input_audio`)
+    chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role":
             "user",
             "content": [
                 {
                     "type": "text",
                     "text": "What's in this audio?"
                 },
                 {
-                    "type": "audio_url",
-                    "audio_url": {
-                        "url": audio_url
+                    "type": "input_audio",
+                    "input_audio": {
+                        # Any format supported by librosa is supported
+                        "data": audio_base64,
+                        "format": "wav"
                     },
                 },
             ],
@@ -178,11 +180,11 @@ def run_audio() -> None:
         max_completion_tokens=64,
     )
 
-    result = chat_completion_from_url.choices[0].message.content
-    print("Chat completion output from audio url:", result)
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
 
-    audio_base64 = encode_base64_content_from_url(audio_url)
-    chat_completion_from_base64 = client.chat.completions.create(
+    # HTTP URL
+    chat_completion_from_url = client.chat.completions.create(
         messages=[{
             "role":
             "user",
@@ -195,7 +197,7 @@ def run_audio() -> None:
                     "type": "audio_url",
                     "audio_url": {
                         # Any format supported by librosa is supported
-                        "url": f"data:audio/ogg;base64,{audio_base64}"
+                        "url": audio_url
                     },
                 },
             ],
@@ -204,9 +206,10 @@ def run_audio() -> None:
         max_completion_tokens=64,
     )
 
-    result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from base64 encoded audio:", result)
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from audio url:", result)
 
+    # base64 URL
     chat_completion_from_base64 = client.chat.completions.create(
         messages=[{
             "role":
             "user",
             "content": [
                 {
                     "type": "text",
                     "text": "What's in this audio?"
                 },
                 {
-                    "type": "input_audio",
-                    "input_audio": {
+                    "type": "audio_url",
+                    "audio_url": {
                         # Any format supported by librosa is supported
-                        "data": audio_base64,
-                        "format": "wav"
+                        "url": f"data:audio/ogg;base64,{audio_base64}"
                     },
                 },
             ],
@@ -233,7 +233,7 @@ def run_audio() -> None:
     )
 
     result = chat_completion_from_base64.choices[0].message.content
-    print("Chat completion output from input audio:", result)
+    print("Chat completion output from base64 encoded audio:", result)
 
 
 example_function_map = {

From a695d75a8a2f36645f607cea9f6844f7c80bcb2e Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 16 Dec 2024 15:12:35 +0000
Subject: [PATCH 10/11] Fix typo

Signed-off-by: DarkLight1337
---
 docs/source/usage/multimodal_inputs.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index 417c86aad9dc..4cd259ce0709 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -315,7 +315,7 @@ You can use `these tests `_.
 
-Audio input is supported according to `OpenAI Audio API `_.
+Audio input is supported according to `OpenAI Audio API `_.
 Here is a simple example using Ultravox-v0.3.
 
 First, launch the OpenAI-compatible server:

From e8e6b6137c094bba6be3471122308e108fb08fac Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Mon, 16 Dec 2024 15:13:30 +0000
Subject: [PATCH 11/11] Format

Signed-off-by: DarkLight1337
---
 docs/source/usage/multimodal_inputs.rst | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst
index 4cd259ce0709..680382e457cc 100644
--- a/docs/source/usage/multimodal_inputs.rst
+++ b/docs/source/usage/multimodal_inputs.rst
@@ -328,9 +328,19 @@ Then, you can use the OpenAI client as follows:
 
 .. code-block:: python
 
+    import base64
+    import requests
     from openai import OpenAI
     from vllm.assets.audio import AudioAsset
-    import requests, base64
+
+    def encode_base64_content_from_url(content_url: str) -> str:
+        """Encode a content retrieved from a remote url to base64 format."""
+
+        with requests.get(content_url) as response:
+            response.raise_for_status()
+            result = base64.b64encode(response.content).decode('utf-8')
+
+        return result
 
     openai_api_key = "EMPTY"
     openai_api_base = "http://localhost:8000/v1"
@@ -342,14 +352,6 @@ Then, you can use the OpenAI client as follows:
 
     # Any format supported by librosa is supported
     audio_url = AudioAsset("winning_call").url
-    def encode_base64_content_from_url(content_url: str) -> str:
-        """Encode a content retrieved from a remote url to base64 format."""
-
-        with requests.get(content_url) as response:
-            response.raise_for_status()
-            result = base64.b64encode(response.content).decode('utf-8')
-
-        return result
     audio_base64 = encode_base64_content_from_url(audio_url)
 
     chat_completion_from_base64 = client.chat.completions.create(