from functools import partial

import numpy as np
import pytest
from PIL import Image

from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import ProcessingCache
from vllm.multimodal.utils import cached_get_tokenizer

from ....multimodal.utils import random_audio, random_image, random_video


def _test_processing_correctness(
    model_id: str,
    modalities: dict[str, bool],
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
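    """Check that a processor backed by a ProcessingCache produces the same
    outputs as an uncached baseline across randomized multimodal batches."""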
    if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
        hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
    else:
        hf_overrides = {}

    limit_mm_per_prompt = {
        modality: 3 if supports_multi else 1
        for modality, supports_multi in modalities.items()
    }

    model_config = ModelConfig(
        model_id,
        task="auto",
        tokenizer=model_id,
        tokenizer_mode="auto",
        trust_remote_code=True,
        seed=0,
        dtype="float16",
        revision=None,
        hf_overrides=hf_overrides,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )

    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
    ctx = InputProcessingContext(
        model_config,
        tokenizer=cached_get_tokenizer(model_config.tokenizer),
    )
    # Make the cache large enough to hold all inputs generated by this test
    cache = ProcessingCache(capacity=1 << 30)

    baseline_processor = factories.build_processor(ctx, cache=None)
    cached_processor = factories.build_processor(ctx, cache=cache)
    dummy_inputs = baseline_processor.dummy_inputs
    tokenizer = baseline_processor.info.get_tokenizer()

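    # Fixed seed so that the random batches are reproducible across runs.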
    rng = np.random.RandomState(0)

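    # Constant inputs: drawing one of these again should hit the cache.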
    input_to_hit = {
        "image": Image.new("RGB", size=(128, 128)),
        "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
        "audio": (np.zeros((512, )), 16000),
    }
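    # Randomized inputs: these are unlikely to repeat, so they mostly miss.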
    input_factory = {
        "image":
        partial(random_image, rng, min_wh=128, max_wh=256),
        "video":
        partial(random_video,
                rng,
                min_frames=2,
                max_frames=8,
                min_wh=128,
                max_wh=256),
        "audio":
        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
    }

    for batch_idx in range(num_batches):
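        # With probability `hit_rate`, reuse the constant input so that the
        # cached processor sees repeated (cacheable) data.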
        mm_data = {
            k:
            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
             # randint's upper bound is exclusive, so add 1 to allow the
             # per-prompt limit to be reached (and to give single-item
             # modalities a chance to receive any data at all)
             for _ in range(rng.randint(limit_mm_per_prompt[k] + 1))]
            for k in modalities
        }

        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
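        # Build a dummy prompt containing the matching number of
        # multimodal placeholders for each modality.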
        prompt = dummy_inputs.get_dummy_processor_inputs(
            model_config.max_model_len,
            mm_counts,
        ).prompt_text

        # Drop empty modalities and unwrap single-item lists to test
        # single -> multi conversion
        if rng.rand() < simplify_rate:
            for k in list(mm_data.keys()):
                if not mm_data[k]:
                    del mm_data[k]
                elif len(mm_data[k]) == 1:
                    mm_data[k] = mm_data[k][0]

        baseline_result = baseline_processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )
        cached_result = cached_processor.apply(
            prompt,
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert baseline_result == cached_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

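        # The processor must give the same result whether it receives the
        # raw text prompt or its pre-tokenized token IDs.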
        baseline_tokenized_result = baseline_processor.apply(
            tokenizer.encode(prompt),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert baseline_result == baseline_tokenized_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")

        cached_tokenized_result = cached_processor.apply(
            tokenizer.encode(prompt),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )

        assert cached_result == cached_tokenized_result, (
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")


# yapf: disable
# True if the model supports multiple data items of the modality per request
@pytest.mark.parametrize(("model_id", "modalities"), [
    ("rhymes-ai/Aria", {"image": True}),
    ("Salesforce/blip2-opt-2.7b", {"image": False}),
    ("facebook/chameleon-7b", {"image": False}),
    ("adept/fuyu-8b", {"image": False}),
    ("llava-hf/llava-1.5-7b-hf", {"image": True}),
    ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
    ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
    ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}),  # noqa: E501
    ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
    ("mistral-community/pixtral-12b", {"image": True}),
    ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
    ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
    ("fixie-ai/ultravox-v0_3", {"audio": True}),
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness(
    model_id: str,
    modalities: dict[str, bool],
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    _test_processing_correctness(
        model_id,
        modalities,
        hit_rate=hit_rate,
        num_batches=num_batches,
        simplify_rate=simplify_rate,
    )


# yapf: disable
@pytest.mark.parametrize(("model_id", "modalities"), [
    ("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
# yapf: enable
def test_processing_correctness_phi3v(
    model_id: str,
    modalities: dict[str, bool],
    hit_rate: float,
    num_batches: int,
    simplify_rate: float,
):
    # HACK - this is an attempted workaround for the following bug
    # https://github.com/huggingface/transformers/issues/34307
    from transformers import AutoImageProcessor  # noqa: F401
    from transformers import AutoProcessor  # noqa: F401

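    # Eagerly instantiating the image processor up front appears to avoid
    # the failure; this is a workaround, not a guaranteed fix.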
    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)

    _test_processing_correctness(
        model_id,
        modalities,
        hit_rate=hit_rate,
        num_batches=num_batches,
        simplify_rate=simplify_rate,
    )