
Commit 78af9fc

DarkLight1337 authored and LeiWang1999 committed
[Frontend] Multimodal support in offline chat (vllm-project#8098)
Signed-off-by: LeiWang1999 <[email protected]>
1 parent 243f1c9 commit 78af9fc


8 files changed, +356 -112 lines changed

tests/entrypoints/llm/test_generate.py
Lines changed: 34 additions & 0 deletions

@@ -6,6 +6,7 @@
 from vllm import LLM, RequestOutput, SamplingParams
 
 from ...conftest import cleanup
+from ..openai.test_vision import TEST_IMAGE_URLS
 
 MODEL_NAME = "facebook/opt-125m"
 
@@ -159,3 +160,36 @@ def test_chat():
     ]
     outputs = llm.chat(messages)
     assert len(outputs) == 1
+
+
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(image_urls: List[str]):
+    llm = LLM(
+        model="microsoft/Phi-3.5-vision-instruct",
+        dtype="bfloat16",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 2},
+    )
+
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            } for image_url in image_urls),
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    outputs = llm.chat(messages)
+    assert len(outputs) >= 0
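
For reference, the new test above doubles as a usage template for the feature this commit adds. Below is a minimal sketch of offline multimodal chat through LLM.chat, mirroring test_chat_multi_image; the image URLs are placeholders, and the engine arguments simply echo the test's configuration rather than recommended settings.

    from vllm import LLM

    # Mirrors the test configuration: Phi-3.5-vision with room for up to
    # two images per prompt. Adjust model and limits for your hardware.
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 2},
    )

    # OpenAI-style chat messages: image_url parts followed by a text part.
    # The URLs below are placeholders, not real assets.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/a.jpg"}},
            {"type": "image_url", "image_url": {"url": "https://example.com/b.jpg"}},
            {"type": "text", "text": "What's in these images?"},
        ],
    }]

    # llm.chat applies the model's chat template and resolves the image URLs
    # before generating, returning a list of RequestOutput objects.
    outputs = llm.chat(messages)
    print(outputs[0].outputs[0].text)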

tests/entrypoints/test_chat_utils.py
Lines changed: 124 additions & 40 deletions

@@ -1,11 +1,14 @@
 import warnings
+from typing import Optional
 
 import pytest
 from PIL import Image
 
 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import parse_chat_messages
+from vllm.entrypoints.chat_utils import (parse_chat_messages,
+                                         parse_chat_messages_futures)
+from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
 
@@ -42,10 +45,28 @@ def image_url():
     return f"data:image/jpeg;base64,{base64}"
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_with_image_url(phi3v_model_config,
-                                                  phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def _assert_mm_data_is_image_input(
+    mm_data: Optional[MultiModalDataDict],
+    image_count: int,
+) -> None:
+    assert mm_data is not None
+    assert set(mm_data.keys()) == {"image"}
+
+    image_data = mm_data.get("image")
+    assert image_data is not None
+
+    if image_count == 1:
+        assert isinstance(image_data, Image.Image)
+    else:
+        assert isinstance(image_data, list) and len(image_data) == image_count
+
+
+def test_parse_chat_messages_single_image(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -63,15 +84,42 @@ async def test_parse_chat_messages_with_image_url(phi3v_model_config,
         "role": "user",
         "content": "<|image_1|>\nWhat's in the image?"
     }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert isinstance(mm_data["image"], Image.Image)
+    _assert_mm_data_is_image_input(mm_data, 1)
 
 
 @pytest.mark.asyncio
-async def test_parse_chat_messages_multiple_images(phi3v_model_config,
-                                                   phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+async def test_parse_chat_messages_single_image_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_future = parse_chat_messages_futures([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What's in the image?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future, 1)
+
+
+def test_parse_chat_messages_multiple_images(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -96,15 +144,49 @@ async def test_parse_chat_messages_multiple_images(phi3v_model_config,
         "content":
         "<|image_1|>\n<|image_2|>\nWhat's in these images?"
     }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)
 
 
 @pytest.mark.asyncio
-async def test_parse_chat_messages_placeholder_already_in_prompt(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+async def test_parse_chat_messages_multiple_images_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_future = parse_chat_messages_futures([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What's in these images?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future, 2)
+
+
+def test_parse_chat_messages_placeholder_already_in_prompt(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -131,15 +213,15 @@ async def test_parse_chat_messages_placeholder_already_in_prompt(
         "content":
         "What's in <|image_1|> and how does it compare to <|image_2|>?"
     }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_placeholder_one_already_in_prompt(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def test_parse_chat_messages_placeholder_one_already_in_prompt(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -167,15 +249,15 @@ async def test_parse_chat_messages_placeholder_one_already_in_prompt(
         "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
         "other one?"
     }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_multiple_images_across_messages(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def test_parse_chat_messages_multiple_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
         "role":
         "user",
         "content": [{
@@ -218,14 +300,14 @@ async def test_parse_chat_messages_multiple_images_across_messages(
             "content": "<|image_2|>\nWhat about this one?"
         },
     ]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_rejects_too_many_images_in_one_message(
-        phi3v_model_config, phi3v_tokenizer, image_url):
+def test_parse_chat_messages_rejects_too_many_images_in_one_message(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
@@ -259,9 +341,11 @@ async def test_parse_chat_messages_rejects_too_many_images_in_one_message(
             }], phi3v_model_config, phi3v_tokenizer)
 
 
-@pytest.mark.asyncio
-async def test_parse_chat_messages_rejects_too_many_images_across_messages(
-        phi3v_model_config, phi3v_tokenizer, image_url):
+def test_parse_chat_messages_rejects_too_many_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
     with warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
