diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 428b4c593c38..6a82a2285b2d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -431,6 +431,7 @@ steps:
     - pytest -v -s models/encoder_decoder/audio_language -m core_model
     - pytest -v -s models/encoder_decoder/language -m core_model
     - pytest -v -s models/encoder_decoder/vision_language -m core_model
+    - pytest -v -s models/decoder_only/vision_language/test_interleaved.py

 - label: Multi-Modal Models Test (Extended) 1  # 48m
   optional: true
diff --git a/tests/conftest.py b/tests/conftest.py
index cc48fceb8eff..6627ab638bf5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -747,30 +747,27 @@ def get_inputs(
         videos: Optional[PromptVideoInput] = None,
         audios: Optional[PromptAudioInput] = None,
     ) -> list[TextPrompt]:
-        if images is not None:
-            assert len(prompts) == len(images)
-
-        if videos is not None:
-            assert len(prompts) == len(videos)
-        if audios is not None:
-            assert len(prompts) == len(audios)
+        if any(x is not None and len(x) != len(prompts)
+               for x in [images, videos, audios]):
+            raise ValueError(
+                "All non-None multimodal inputs must have the same length as "
+                "prompts")

-        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
-        if images is not None:
-            for i, image in enumerate(images):
-                if image is not None:
-                    inputs[i]["multi_modal_data"] = {"image": image}
-
-        if videos is not None:
-            for i, video in enumerate(videos):
-                if video is not None:
-                    inputs[i]["multi_modal_data"] = {"video": video}
-
-        if audios is not None:
-            for i, audio in enumerate(audios):
-                if audio is not None:
-                    inputs[i]["multi_modal_data"] = {"audio": audio}
+        inputs = []
+        for i, prompt in enumerate(prompts):
+            multi_modal_data = {}
+            if images is not None and (image := images[i]) is not None:
+                multi_modal_data["image"] = image
+            if videos is not None and (video := videos[i]) is not None:
+                multi_modal_data["video"] = video
+            if audios is not None and (audio := audios[i]) is not None:
+                multi_modal_data["audio"] = audio
+
+            inputs.append(
+                TextPrompt(prompt=prompt,
+                           multi_modal_data=multi_modal_data
+                           if multi_modal_data else None))

         return inputs
diff --git a/tests/models/decoder_only/vision_language/test_interleaved.py b/tests/models/decoder_only/vision_language/test_interleaved.py
new file mode 100644
index 000000000000..8804497ae616
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_interleaved.py
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+
+models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
+
+
+def base_prompt(modalities_str: str) -> str:
+    return f"<|im_start|>user {modalities_str}\nDescribe what you see from these items.<|im_end|><|im_start|>assistant\n"  # noqa: E501
+
+
+INTERLEAVED_PROMPT = base_prompt("