Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,10 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
seq_group_metadata,
range(positions[0], positions[0] + len(positions)))
if not mm_kwargs:

# M-RoPE requires mrope_positions even for plain text; return early
# when mm_kwargs is empty only if inter_data.is_prompt is False.
if not mm_kwargs and (not inter_data.is_prompt):
return

inter_data.multi_modal_kwargs = mm_kwargs
Expand All @@ -741,12 +744,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
video_grid_thw = mm_kwargs.get("video_grid_thw", None)
audio_feature_lengths = mm_kwargs.get("audio_feature_lengths",
None)
assert (
image_grid_thw is not None or video_grid_thw is not None
or audio_feature_lengths is not None), (
"mrope embedding type requires multi-modal input mapper "
"returns 'image_grid_thw' or 'video_grid_thw' or "
"'audio_feature_lengths'.")

second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None)
use_audio_in_video = mm_kwargs.get("use_audio_in_video", False)
Expand Down