
Commit c68eea1

DarkLight1337 and GWS0428 authored and committed

[CI/Build] Move model-specific multi-modal processing tests (vllm-project#11934)

Signed-off-by: DarkLight1337 <[email protected]>

1 parent 671f64c · commit c68eea1

File tree: 13 files changed, +251 / -240 lines

.buildkite/test-pipeline.yaml (1 addition, 0 deletions)

```diff
@@ -368,6 +368,7 @@ steps:
   - tests/models/encoder_decoder/vision_language
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/multimodal
     - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
     - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
     - pytest -v -s models/embedding/vision_language -m core_model
```
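The one-line addition wires the relocated suite into the existing multi-modal CI step: the new `pytest -v -s models/multimodal` command runs the consolidated processing tests ahead of the per-model suites, using the same working directory as the surrounding commands.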

tests/models/multimodal/processing/__init__.py

Whitespace-only changes (an empty __init__.py that marks the new test directory as a package, so the relative imports in the moved tests resolve).
Lines changed: 201 additions, 0 deletions (new test file)

```python
from functools import partial

import numpy as np
import pytest
from PIL import Image

from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import ProcessingCache
from vllm.multimodal.utils import cached_get_tokenizer

from ....multimodal.utils import random_audio, random_image, random_video


def _test_processing_correctness(
    model_id: str,
    modalities: dict[str, bool],
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
    else:
        hf_overrides = {}

    limit_mm_per_prompt = {
        modality: 3 if supports_multi else 1
        for modality, supports_multi in modalities.items()
    }

    model_config = ModelConfig(
        model_id,
        task="auto",
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=True,
        seed=0,
        dtype="float16",
        revision=None,
        hf_overrides=hf_overrides,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )

    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
    ctx = InputProcessingContext(
        model_config,
        tokenizer=cached_get_tokenizer(model_config.tokenizer),
    )
    # Ensure that it can fit all of the data
    cache = ProcessingCache(capacity=1 << 30)

    baseline_processor = factories.build_processor(ctx, cache=None)
    cached_processor = factories.build_processor(ctx, cache=cache)
    dummy_inputs = baseline_processor.dummy_inputs
    tokenizer = baseline_processor.info.get_tokenizer()

    rng = np.random.RandomState(0)

    input_to_hit = {
        "image": Image.new("RGB", size=(128, 128)),
        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
        "audio": (np.zeros((512, )), 16000),
    }
    input_factory = {
        "image":
        partial(random_image, rng, min_wh=128, max_wh=256),
        "video":
        partial(random_video,
                rng,
                min_frames=2,
                max_frames=8,
                min_wh=128,
                max_wh=256),
        "audio":
        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
    }

    for batch_idx in range(num_batches):
        mm_data = {
            k:
            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
             for _ in range(rng.randint(limit_mm_per_prompt[k]))]
            for k in modalities
        }

        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
        prompt = dummy_inputs.get_dummy_processor_inputs(
            model_config.max_model_len,
            mm_counts,
        ).prompt_text

        # Drop unnecessary keys and test single -> multi conversion
        if rng.rand() < simplify_rate:
            for k in list(mm_data.keys()):
                if not mm_data[k]:
                    del mm_data[k]
                elif len(mm_data[k]) == 1:
                    mm_data[k] = mm_data[k][0]

        baseline_result = baseline_processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )
        cached_result = cached_processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert baseline_result == cached_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

        baseline_tokenized_result = baseline_processor.apply(
            tokenizer.encode(prompt),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert baseline_result == baseline_tokenized_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

        cached_tokenized_result = cached_processor.apply(
            tokenizer.encode(prompt),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert cached_result == cached_tokenized_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")


# yapf: disable
# True if the model supports multiple data items of the modality per request
@pytest.mark.parametrize(("model_id", "modalities"), [
    ("rhymes-ai/Aria", {"image": True}),
    ("Salesforce/blip2-opt-2.7b", {"image": False}),
    ("facebook/chameleon-7b", {"image": False}),
    ("adept/fuyu-8b", {"image": False}),
    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
    ("mistral-community/pixtral-12b", {"image": True}),
    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
    ("fixie-ai/ultravox-v0_3", {"audio": True}),
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness(
    model_id: str,
    modalities: dict[str, bool],
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    _test_processing_correctness(
        model_id,
        modalities,
        hit_rate=hit_rate,
        num_batches=num_batches,
        simplify_rate=simplify_rate,
    )


# yapf: disable
@pytest.mark.parametrize(("model_id", "modalities"), [
    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness_phi3v(
    model_id: str,
    modalities: dict[str, bool],
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    # HACK - this is an attempted workaround for the following bug
    # https://github.com/huggingface/transformers/issues/34307
    from transformers import AutoImageProcessor  # noqa: F401
    from transformers import AutoProcessor  # noqa: F401

    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)

    _test_processing_correctness(
        model_id,
        modalities,
        hit_rate=hit_rate,
        num_batches=num_batches,
        simplify_rate=simplify_rate,
    )
```
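The `hit_rate` knob is what exercises the `ProcessingCache`: with probability `hit_rate` a slot in the batch reuses the fixed `input_to_hit` item (a guaranteed cache hit for the cached processor), otherwise `input_factory` draws a fresh random input. A minimal standalone sketch of that sampling scheme, with strings standing in for the real image/video/audio data:

```python
import numpy as np

rng = np.random.RandomState(0)
hit_rate = 0.5
input_to_hit = "fixed-item"  # always identical, so the cached processor hits


def input_factory():
    return f"fresh-{rng.randint(1 << 16)}"  # (almost always) a cache miss


batch = [
    input_to_hit if rng.rand() < hit_rate else input_factory()
    for _ in range(8)
]
print(batch)  # a mix of repeated "fixed-item" entries and unique fresh ones
```

Each batch is then checked three ways: the cached and uncached processors must agree, and each must produce the same result whether the prompt arrives as text or as pre-tokenized ids.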

tests/models/decoder_only/vision_language/processing/test_idefics3.py renamed to tests/models/multimodal/processing/test_idefics3.py (2 additions, 2 deletions)

The move shortens each relative import by one dot, since the tests now sit one directory level closer to tests/ (four package levels deep instead of five).

```diff
@@ -8,8 +8,8 @@
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
```
tests/models/decoder_only/vision_language/processing/test_internvl.py renamed to tests/models/multimodal/processing/test_internvl.py (2 additions, 2 deletions)

```diff
@@ -7,8 +7,8 @@
 from vllm.inputs import InputContext, token_inputs
 from vllm.multimodal import MultiModalRegistry
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 models = ["OpenGVLab/InternVL2-2B"]
```

Lines changed: 1 addition, 1 deletion

```diff
@@ -10,7 +10,7 @@
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from ....utils import build_model_context
+from ...utils import build_model_context
 
 
 def _validate_image_prompt_replacements_one(
```
Lines changed: 1 addition, 1 deletion

```diff
@@ -10,7 +10,7 @@
 from vllm.multimodal.processing import BaseMultiModalProcessor
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from ....utils import build_model_context
+from ...utils import build_model_context
 
 
 def _validate_image_prompt_replacements_one(
```

tests/models/decoder_only/vision_language/processing/test_phi3v.py renamed to tests/models/multimodal/processing/test_phi3v.py (2 additions, 2 deletions)

```diff
@@ -4,8 +4,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from .....conftest import _ImageAssets
-from ....utils import build_model_context
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
 
 
 @pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
```

tests/models/decoder_only/vision_language/processing/test_qwen.py renamed to tests/models/multimodal/processing/test_qwen.py (2 additions, 2 deletions)

```diff
@@ -9,8 +9,8 @@
 from vllm.multimodal import MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 
-from .....conftest import IMAGE_ASSETS
-from ....utils import build_model_context
+from ....conftest import IMAGE_ASSETS
+from ...utils import build_model_context
 
 ### Multimodal preprocessing tests
 SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
```
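For readers double-checking the dot arithmetic across these renames: a relative import needs one leading dot for the module's own package plus one per level climbed. A small standalone sketch (the `dots_to` helper is hypothetical, written only for this illustration):

```python
from pathlib import PurePosixPath


def dots_to(target_pkg: str, module_dir: str) -> int:
    """Leading dots needed to import from target_pkg's package in a module
    living in module_dir (illustrative helper, not part of the repo)."""
    rel = PurePosixPath(module_dir).relative_to(target_pkg)
    # one dot per level up, plus one for the module's own package
    return len(rel.parts) + 1


# tests/conftest.py: five dots from the old location, four from the new one
assert dots_to("tests", "tests/models/decoder_only/vision_language/processing") == 5
assert dots_to("tests", "tests/models/multimodal/processing") == 4
# tests/models/utils.py likewise drops from four dots to three
assert dots_to("tests/models", "tests/models/decoder_only/vision_language/processing") == 4
assert dots_to("tests/models", "tests/models/multimodal/processing") == 3
```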
