1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
@@ -431,6 +431,7 @@ steps:
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py

- label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true
41 changes: 19 additions & 22 deletions tests/conftest.py
@@ -747,30 +747,27 @@ def get_inputs(
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> list[TextPrompt]:
if images is not None:
assert len(prompts) == len(images)

if videos is not None:
assert len(prompts) == len(videos)

if audios is not None:
assert len(prompts) == len(audios)
if any(x is not None and len(x) != len(prompts)
for x in [images, videos, audios]):
raise ValueError(
"All non-None multimodal inputs must have the same length as "
"prompts")

inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
if image is not None:
inputs[i]["multi_modal_data"] = {"image": image}

if videos is not None:
for i, video in enumerate(videos):
if video is not None:
inputs[i]["multi_modal_data"] = {"video": video}

if audios is not None:
for i, audio in enumerate(audios):
if audio is not None:
inputs[i]["multi_modal_data"] = {"audio": audio}
inputs = []
for i, prompt in enumerate(prompts):
multi_modal_data = {}
if images is not None and (image := images[i]) is not None:
multi_modal_data["image"] = image
if videos is not None and (video := videos[i]) is not None:
multi_modal_data["video"] = video
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio

inputs.append(
TextPrompt(prompt=prompt,
multi_modal_data=multi_modal_data
if multi_modal_data else None))

return inputs

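For reference, a minimal standalone sketch of the per-prompt merge performed by the refactored get_inputs above. Plain dicts stand in for vLLM's TextPrompt, and build_inputs is a made-up name used only for illustration:

from typing import Any, Optional

def build_inputs(prompts: list[str],
                 images: Optional[list[Any]] = None,
                 videos: Optional[list[Any]] = None,
                 audios: Optional[list[Any]] = None) -> list[dict]:
    # Every non-None modality list must be parallel to `prompts`.
    if any(x is not None and len(x) != len(prompts)
           for x in [images, videos, audios]):
        raise ValueError("All non-None multimodal inputs must have the same "
                         "length as prompts")

    inputs = []
    for i, prompt in enumerate(prompts):
        # Attach only the modalities that are present at this prompt's index.
        multi_modal_data = {}
        for name, items in (("image", images), ("video", videos),
                            ("audio", audios)):
            if items is not None and items[i] is not None:
                multi_modal_data[name] = items[i]
        inputs.append({"prompt": prompt,
                       "multi_modal_data": multi_modal_data or None})
    return inputs

# One prompt carrying an image and a video, followed by a text-only prompt.
print(build_inputs(["<image><video>\nDescribe.", "Hello"],
                   images=[["img0"], None],
                   videos=[["vid0"], None]))
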
77 changes: 77 additions & 0 deletions tests/models/decoder_only/vision_language/test_interleaved.py
@@ -0,0 +1,77 @@
# SPDX-License-Identifier: Apache-2.0

import pytest

from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset

models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]


def base_prompt(modalities_str: str) -> str:
return f"<|im_start|>user {modalities_str}\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n" # noqa: E501


INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
"""
This is a simple test to check that interleaved and non-interleaved prompts
produce different outputs for the same multimodal inputs.
"""

image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB")
image_stop = ImageAsset("stop_sign").pil_image.convert("RGB")
images = [image_cherry, image_stop]
video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays

inputs = [
(
[INTERLEAVED_PROMPT],
[images],
[video],
),
(
[NONINTERLEAVED_PROMPT],
[images],
[video],
),
]

with vllm_runner(model,
task="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True) as vllm_model:
vllm_outputs_per_case = [
vllm_model.generate_greedy(prompts,
max_tokens,
images=images,
videos=videos)
for prompts, images, videos in inputs
]

all_results = [output[0][1] for output in vllm_outputs_per_case]
outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n"))
for total_str in all_results]
prompt_lengths = [prompt_len for _, prompt_len in outputs]
generated_strs = [
total_str[prompt_len:] for total_str, prompt_len in outputs
]
interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
interleaved_output_str, noninterleaved_output_str = generated_strs

# The two prompts are identical except for the order of modality tokens.
assert interleaved_prompt_len == noninterleaved_prompt_len

# The two generated strings should be different because of the
# interleaved modality tokens.
assert interleaved_output_str != noninterleaved_output_str
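
The CI entry added to .buildkite/test-pipeline.yaml above runs this file; locally it can be invoked the same way, e.g. from the tests/ directory:

pytest -v -s models/decoder_only/vision_language/test_interleaved.py
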
80 changes: 64 additions & 16 deletions tests/multimodal/test_utils.py
@@ -155,7 +155,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes={"image": ["hash1", "hash2"]},
expected_modalities=["image"],
expected_modalities=["image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=2),
@@ -172,7 +172,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes=None,
expected_modalities=["image"],
expected_modalities=["image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=2),
@@ -197,7 +197,7 @@ def test_merge_and_sort_multimodal_metadata():
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"],
},
expected_modalities=["audio", "image"],
expected_modalities=["audio", "audio", "image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
@@ -223,7 +223,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes=None,
expected_modalities=["audio", "image"],
expected_modalities=["audio", "audio", "image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
@@ -254,7 +254,9 @@ def test_merge_and_sort_multimodal_metadata():
"audio": ["audio_hash1"],
"video": ["video_hash1", "video_hash2", "video_hash3"]
},
expected_modalities=["audio", "video", "image"],
expected_modalities=[
"audio", "video", "video", "video", "image", "image"
],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=4),
@@ -300,12 +302,19 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"],
},
expected_modalities=[],
expected_ranges=[],
expected_hashes=None,
expected_modalities=["image", "audio", "image", "audio"],
expected_ranges=[
PlaceholderRange(offset=0, length=4),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=8, length=2),
PlaceholderRange(offset=11, length=4),
],
expected_hashes=[
"image_hash1", "audio_hash1", "image_hash2", "audio_hash2"
],
),

# <image> <image> <video> <audio> <image>
# <image> <image> <audio> <video> <image>
TestCase(
mm_positions={
"image": [
Expand All @@ -321,15 +330,54 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
]
},
mm_hashes=None,
expected_modalities=[],
expected_ranges=[],
expected_modalities=["image", "image", "audio", "video", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=8, length=5),
PlaceholderRange(offset=20, length=4),
],
expected_hashes=None,
),

# <image> <audio> <video> <image> with hashes
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=18, length=4),
],
"audio": [
PlaceholderRange(offset=6, length=2),
],
"video": [
PlaceholderRange(offset=10, length=5),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1"],
"video": ["video_hash1"],
},
expected_modalities=["image", "audio", "video", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=6, length=2),
PlaceholderRange(offset=10, length=5),
PlaceholderRange(offset=18, length=4),
],
expected_hashes=[
"image_hash1", "audio_hash1", "video_hash1", "image_hash2"
],
),
]

for case in test_cases:
with pytest.raises(ValueError) as ex_info:
merge_and_sort_multimodal_metadata(case.mm_positions,
case.mm_hashes)
for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
expected_hashes) in test_cases:
modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
mm_positions, mm_hashes)

assert "Interleaved mixed-modality" in str(ex_info.value)
assert modalities == expected_modalities
assert ranges == expected_ranges
assert hashes == expected_hashes
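
As an illustration of the behavior these interleaving cases now expect, here is a standalone sketch of the flatten-then-sort-by-offset approach. Plain dicts stand in for PlaceholderRange and merge_by_offset is a made-up name; the real function is merge_and_sort_multimodal_metadata in vllm/multimodal/utils.py, shown in the next file:

from typing import Optional

def merge_by_offset(mm_positions: dict, mm_hashes: Optional[dict] = None):
    # Flatten (modality, placeholder, hash) triples across all modalities,
    # then order them by their offset in the prompt.
    items = []
    for modality, placeholders in mm_positions.items():
        per_modality_hashes = (mm_hashes or {}).get(
            modality, [None] * len(placeholders))
        items.extend(
            zip([modality] * len(placeholders), placeholders,
                per_modality_hashes))
    items.sort(key=lambda item: item[1]["offset"])

    modalities = [m for m, _, _ in items]
    ranges = [p for _, p, _ in items]
    hashes = [h for _, _, h in items] if mm_hashes is not None else None
    return modalities, ranges, hashes

# Mirrors the first interleaving case above: image/audio placeholders at
# offsets 0, 5, 8 and 11 come back ordered image, audio, image, audio.
positions = {
    "image": [{"offset": 0, "length": 4}, {"offset": 8, "length": 2}],
    "audio": [{"offset": 5, "length": 2}, {"offset": 11, "length": 4}],
}
hashes = {
    "image": ["image_hash1", "image_hash2"],
    "audio": ["audio_hash1", "audio_hash2"],
}
print(merge_by_offset(positions, hashes))
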
82 changes: 30 additions & 52 deletions vllm/multimodal/utils.py
@@ -303,14 +303,10 @@ def merge_and_sort_multimodal_metadata(

Optionally, if a MultiModalHashDict is given, the same operation is
applied to it and the sorted list of hashes is returned.

Raises:
ValueError: If the input prompt has interleaved placeholders from
different modalities (e.g, "<image><audio><image> Describe the
content.")

Returns:
list[str]: Sorted list of involved modalities.
list[str]: List of item modalities in order of their positions in
the input sequence.
list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if
@@ -324,50 +320,33 @@
# For single modality, placeholder ranges and hashes are already sorted
# so we can return the list directly.
if len(modalities) == 1:
if mm_hashes is None:
return modalities, list(mm_positions[modalities[0]]), None
else:
return modalities, list(mm_positions[modalities[0]]), list(
mm_hashes[modalities[0]])

placeholder_lists_with_modality = [(modality, mm_positions[modality])
for modality in modalities]

if mm_hashes is None:
sorted_placeholder_lists = sorted(placeholder_lists_with_modality,
key=lambda x: x[1][0]['offset'])
sorted_hash_lists = None
else:
hashes_lists = [
mm_hashes[modality] for modality in modalities
if modality in mm_hashes
]
sorted_pairs = sorted(zip(placeholder_lists_with_modality,
hashes_lists),
key=lambda x: x[0][1][0]['offset'])
sorted_placeholder_tuple, sorted_hash_tuple = zip(*sorted_pairs)
sorted_placeholder_lists = list(sorted_placeholder_tuple)
sorted_hash_lists = list(sorted_hash_tuple)

sorted_modalities = [modality for modality, _ in sorted_placeholder_lists]

# Flatten sorted list of lists to a single list and verify there is no
# interleaving of placeholders from different modalities.
merged_placeholders: list[PlaceholderRange] = []
for modality, placeholder_list in sorted_placeholder_lists:
if merged_placeholders and placeholder_list[0][
'offset'] < merged_placeholders[-1]['offset']:
raise ValueError(
"Interleaved mixed-modality inference is currently not "
"supported.")
merged_placeholders.extend(placeholder_list)

if sorted_hash_lists is not None:
merged_hashes = []
for hash_list in sorted_hash_lists:
merged_hashes.extend(hash_list)
else:
merged_hashes = None
modality = modalities[0]
placeholder_list = list(mm_positions[modality])

return [modality] * len(
placeholder_list
), placeholder_list, None if not mm_hashes else mm_hashes[modality]

# Create a list of (modality, placeholder, hash) tuples for all placeholders
all_items = []
for modality in modalities:
placeholder_list = list(mm_positions[modality])
hash_list: list[Optional[str]] = list(
mm_hashes[modality]) if mm_hashes and modality in mm_hashes else [
None
] * len(placeholder_list)

for placeholder, hash_value in zip(placeholder_list, hash_list):
all_items.append((modality, placeholder, hash_value))

# Sort all items by offset
all_items.sort(key=lambda x: x[1]['offset'])

# Split into separate lists
sorted_modalities = [item[0] for item in all_items]
merged_placeholders = [item[1] for item in all_items]
merged_hashes = [str(item[2])
for item in all_items] if mm_hashes is not None else None

return sorted_modalities, merged_placeholders, merged_hashes

@@ -383,8 +362,7 @@ def group_mm_inputs_by_modality(

Returns:
list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each
inner list contains consecutive MultiModalKwargs with same modality, or
one with multimodal modalities.
inner list contains consecutive MultiModalKwargs with the same modality.
"""
if not mm_inputs:
return []
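
A minimal sketch of the consecutive grouping described in the docstring above, assuming itertools.groupby over a made-up modality key; the real implementation operates on MultiModalKwargs objects:

from itertools import groupby

def group_by_modality(items: list[dict]) -> list[list[dict]]:
    # groupby only merges *consecutive* entries with the same key, so the
    # original ordering of the inputs is preserved across groups.
    return [
        list(group)
        for _, group in groupby(items, key=lambda x: x["modality"])
    ]

# ["image", "image", "video", "image"] -> groups of sizes [2, 1, 1]
example = [{"modality": "image"}, {"modality": "image"},
           {"modality": "video"}, {"modality": "image"}]
print([len(g) for g in group_by_modality(example)])  # [2, 1, 1]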