Merged

48 commits
203fbb4 init (Isotr0py, Apr 24, 2025)
3a41833 init (Isotr0py, Apr 24, 2025)
4486373 init example (Isotr0py, Apr 24, 2025)
d1a0a69 init example (Isotr0py, Apr 24, 2025)
352b747 fix projection (Isotr0py, Apr 24, 2025)
22b2611 fix model registry (Isotr0py, Apr 24, 2025)
c3fe338 fix model loading (Isotr0py, Apr 24, 2025)
33d13ca clean up vit initialization (Isotr0py, Apr 25, 2025)
7eac283 fix embed names and processing info (Isotr0py, Apr 25, 2025)
4b65be4 fix audio processing info (Isotr0py, Apr 25, 2025)
4414a21 fix vision inference (Isotr0py, Apr 25, 2025)
940e04f fix vision inference (Isotr0py, Apr 25, 2025)
d18f0b4 fix lora (Isotr0py, Apr 25, 2025)
89f7545 add multi-image example (Isotr0py, Apr 26, 2025)
dbf6324 fix multi-image example (Isotr0py, Apr 26, 2025)
f77803f fix audio (Isotr0py, Apr 26, 2025)
a8c4547 fix audio inference (Isotr0py, Apr 26, 2025)
2b1fb2a fix audio mlp (Isotr0py, Apr 26, 2025)
9d15ac4 clean up (Isotr0py, Apr 26, 2025)
e246fea fix audio tp (Isotr0py, Apr 26, 2025)
1dd185c Merge branch 'vllm-project:main' into phi4-mm-hf (Isotr0py, Apr 27, 2025)
6416e26 fix v1 inference (Isotr0py, Apr 27, 2025)
709e257 Merge branch 'vllm-project:main' into phi4-mm-hf (Isotr0py, May 1, 2025)
d0e6a08 Merge branch 'vllm-project:main' into phi4-mm-hf (Isotr0py, May 7, 2025)
2ee04be update example and fix tp (Isotr0py, May 7, 2025)
87bc8c7 Merge branch 'vllm-project:main' into phi4-mm-hf (Isotr0py, May 10, 2025)
09d3990 add test (Isotr0py, May 10, 2025)
eb224c0 update (Isotr0py, May 11, 2025)
e5ef054 fix lora loading (Isotr0py, May 12, 2025)
cb7a63a update docs (Isotr0py, May 12, 2025)
d4fb0eb update test (Isotr0py, May 12, 2025)
c45611a add processing tests (Isotr0py, May 13, 2025)
108947e fix processing tests (Isotr0py, May 13, 2025)
810cd06 Merge branch 'vllm-project:main' into phi4-mm-hf (Isotr0py, May 13, 2025)
808a9be code format (Isotr0py, May 13, 2025)
c3d2e3f fix processor test (Isotr0py, May 13, 2025)
96cc575 address comments (Isotr0py, May 14, 2025)
f398b8e place mistral3 to correct place (Isotr0py, May 14, 2025)
0dc82e3 correct docs (Isotr0py, May 14, 2025)
712378a Merge remote-tracking branch 'upstream/main' into phi4-mm-hf (Isotr0py, Jul 14, 2025)
8a632e5 fix processing (Isotr0py, Jul 14, 2025)
e0291e3 make pre-commit happy (Isotr0py, Jul 14, 2025)
b2540f3 update doc (Isotr0py, Jul 14, 2025)
04e238f make pre-commit happy (Isotr0py, Jul 14, 2025)
70b15f9 remove patch (Isotr0py, Jul 16, 2025)
2cda47d Merge branch 'main' into phi4-mm-hf (Isotr0py, Jul 16, 2025)
dce8cb1 address comments (Isotr0py, Jul 23, 2025)
7ff5a2f Merge branch 'vllm-project:main' into phi4-mm-hf (Isotr0py, Jul 23, 2025)
1 change: 1 addition & 0 deletions docs/models/supported_models.md
@@ -596,6 +596,7 @@ Specified using `--task generate`.
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | ✅︎ |
| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | ✅︎ |
| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL | T + I<sup>E+</sup> | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A<sup>+</sup> | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ |
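The new `Phi4MultimodalForCausalLM` row covers the HF-Transformers-format checkpoint published on revision `refs/pr/70` of the official repository. A minimal sketch of pointing vLLM at that revision directly; the argument values simply mirror the examples added later in this PR and are illustrative, not prescriptive:

```python
# Minimal sketch, assuming the refs/pr/70 revision of the official repo;
# engine arguments mirror the examples below and are not prescriptive.
from vllm import LLM

llm = LLM(
    model="microsoft/Phi-4-multimodal-instruct",
    revision="refs/pr/70",  # HF-format weights -> Phi4MultimodalForCausalLM
    max_model_len=4096,
    enable_lora=True,       # vision-lora / speech-lora ship alongside the base weights
    max_lora_rank=320,
    limit_mm_per_prompt={"image": 1},
)
```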
32 changes: 32 additions & 0 deletions examples/offline_inference/audio_language.py
@@ -190,6 +190,37 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
)


def run_phi4_multimodal(question: str, audio_count: int) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs.
"""
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "<|audio|>" * audio_count

prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

engine_args = EngineArgs(
model=model_path,
max_model_len=12800,
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
limit_mm_per_prompt={"audio": audio_count},
)

return ModelRequestData(
engine_args=engine_args,
prompt=prompts,
lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
)


# Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -303,6 +334,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
"granite_speech": run_granite_speech,
"minicpmo": run_minicpmo,
"phi4_mm": run_phi4mm,
"phi4_multimodal": run_phi4_multimodal,
"qwen2_audio": run_qwen2_audio,
"qwen2_5_omni": run_qwen2_5_omni,
"ultravox": run_ultravox,
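The `phi4_multimodal` entry registered above is consumed by the example script roughly as sketched below. The audio asset name and the exact wiring are illustrative assumptions, not part of this diff:

```python
# Hedged sketch of consuming run_phi4_multimodal's ModelRequestData;
# the "winning_call" asset and the wiring are illustrative assumptions.
from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

req = run_phi4_multimodal("What is happening in this audio clip?", audio_count=1)
llm = LLM(**asdict(req.engine_args))

outputs = llm.generate(
    {
        "prompt": req.prompt,
        "multi_modal_data": {"audio": [AudioAsset("winning_call").audio_and_sample_rate]},
    },
    sampling_params=SamplingParams(max_tokens=64),
    lora_request=req.lora_requests[0],  # attach the speech-lora adapter
)
print(outputs[0].outputs[0].text)
```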
36 changes: 36 additions & 0 deletions examples/offline_inference/vision_language.py
@@ -949,6 +949,41 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
)


# HF format Phi-4-multimodal-instruct
def run_phi4_multimodal(questions: list[str], modality: str) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process image inputs.
"""
assert modality == "image"
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
prompts = [
f"<|user|><|image|>{question}<|end|><|assistant|>" for question in questions
]
engine_args = EngineArgs(
model=model_path,
max_model_len=5120,
max_num_seqs=2,
max_num_batched_tokens=12800,
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 16},
limit_mm_per_prompt={"image": 1},
)

return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)


# Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
@@ -1205,6 +1240,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
"phi4_mm": run_phi4mm,
"phi4_multimodal": run_phi4_multimodal,
"pixtral_hf": run_pixtral_hf,
"qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl,
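As the comment in `run_phi4_multimodal` notes, `mm_processor_kwargs` can also be passed per request. A hedged sketch of what that override might look like; the image asset and the prompt wiring are illustrative assumptions:

```python
# Hedged sketch of a per-request mm_processor_kwargs override;
# the image asset and wiring are illustrative assumptions.
from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

req = run_phi4_multimodal(["What is in this image?"], modality="image")
llm = LLM(**asdict(req.engine_args))

outputs = llm.generate(
    {
        "prompt": req.prompts[0],
        "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
        # Override the processor setting for this request only.
        "mm_processor_kwargs": {"dynamic_hd": 4},
    },
    sampling_params=SamplingParams(max_tokens=64),
    lora_request=req.lora_requests[0],  # attach the vision-lora adapter
)
print(outputs[0].outputs[0].text)
```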
35 changes: 35 additions & 0 deletions examples/offline_inference/vision_language_multi_image.py
@@ -686,6 +686,40 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
)


def load_phi4_multimodal(question: str, image_urls: list[str]) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process multi-image inputs.
"""

model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
engine_args = EngineArgs(
model=model_path,
max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"dynamic_hd": 4},
)

placeholders = "<|image|>" * len(image_urls)
prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"

return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
)


def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat"
engine_args = EngineArgs(
@@ -912,6 +946,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
"ovis": load_ovis,
"phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"phi4_multimodal": load_phi4_multimodal,
"pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,
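A hedged sketch of driving the multi-image loader above. The placeholder URLs must be replaced with reachable images, and the wiring is illustrative, not part of this diff:

```python
# Hedged sketch of consuming load_phi4_multimodal's ModelRequestData;
# replace the placeholder URLs with reachable images before running.
from dataclasses import asdict

from vllm import LLM, SamplingParams

IMAGE_URLS = [
    "https://example.com/first.jpg",   # placeholder, illustrative only
    "https://example.com/second.jpg",  # placeholder, illustrative only
]

req = load_phi4_multimodal("Compare the two images.", IMAGE_URLS)
llm = LLM(**asdict(req.engine_args))

outputs = llm.generate(
    {
        "prompt": req.prompt,
        # image_data already holds one PIL image per <|image|> placeholder.
        "multi_modal_data": {"image": req.image_data},
    },
    sampling_params=SamplingParams(max_tokens=128),
    lora_request=req.lora_requests[0],
)
print(outputs[0].outputs[0].text)
```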