From b9f62d7e4187e15870e58cbbffac0ae7be129d55 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Dec 2024 15:01:48 +0000 Subject: [PATCH 1/3] Move some utility functions to modality-specific modules Signed-off-by: DarkLight1337 --- .../decoder_only/vision_language/test_awq.py | 2 +- .../vision_language/test_h2ovl.py | 2 +- .../vision_language/test_phi3v.py | 2 +- .../vision_language/test_qwen2_vl.py | 4 +- .../vision_language/vlm_utils/builders.py | 5 +- .../vlm_utils/custom_inputs.py | 5 +- .../vision_language/test_mllama.py | 2 +- tests/multimodal/test_mapper.py | 2 +- vllm/assets/video.py | 2 +- vllm/multimodal/image.py | 12 ++++ vllm/multimodal/utils.py | 55 +------------------ vllm/multimodal/video.py | 43 +++++++++++++++ 12 files changed, 70 insertions(+), 66 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py index 6e6e5b40d6a3..18ceb34a4e04 100644 --- a/tests/models/decoder_only/vision_language/test_awq.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -3,7 +3,7 @@ import pytest import torch -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ...utils import check_logprobs_close diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py index 45a736520440..7406df253e7f 100644 --- a/tests/models/decoder_only/vision_language/test_h2ovl.py +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -8,7 +8,7 @@ # Import the functions to test from vllm.model_executor.models.h2ovl import (calculate_num_blocks, image_to_pixel_values_wrapper) -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size models = [ "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index 82eae0705c9b..3a8934adfb07 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -5,7 +5,7 @@ import pytest from transformers import AutoTokenizer -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 71b6ba4dca43..51fe7d2ad32a 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -6,8 +6,8 @@ from PIL import Image from vllm.entrypoints.llm import LLM -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import rescale_video_size, sample_frames_from_video from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py index 66668296139f..59773be709fa 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py @@ -5,8 +5,9 @@ import torch -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - resize_video, sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import (rescale_video_size, resize_video, + sample_frames_from_video) from .....conftest import _ImageAssets, _VideoAssets from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py index e698d8d3f6f5..2291f4fa0d0a 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py @@ -1,8 +1,9 @@ """Custom input builders for edge-cases in different models.""" from typing import Callable -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - resize_video, sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import (rescale_video_size, resize_video, + sample_frames_from_video) from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS from .builders import build_multi_image_inputs, build_single_image_inputs diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 77dd1d81f84d..636a3eedff31 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -6,7 +6,7 @@ from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 71832acbd17b..81f2a06182bc 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -6,7 +6,7 @@ from vllm.config import ModelConfig from vllm.multimodal import MultiModalRegistry -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size @pytest.fixture diff --git a/vllm/assets/video.py b/vllm/assets/video.py index e4dcab10466d..e6779935bad1 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -7,7 +7,7 @@ from huggingface_hub import hf_hub_download from PIL import Image -from vllm.multimodal.utils import (sample_frames_from_video, +from vllm.multimodal.video import (sample_frames_from_video, try_import_video_packages) from .base import get_cache_dir diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 97bbce1ce157..c705e1a3d155 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -84,3 +84,15 @@ def _default_input_mapper( def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: return 3000 + + +def rescale_image_size(image: Image.Image, + size_factor: float, + transpose: int = -1) -> Image.Image: + """Rescale the dimensions of an image by a constant factor.""" + new_width = int(image.width * size_factor) + new_height = int(image.height * size_factor) + image = image.resize((new_width, new_height)) + if transpose >= 0: + image = image.transpose(Image.Transpose(transpose)) + return image diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c898ca4e6573..da1110dce5b4 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -15,6 +15,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer from .inputs import MultiModalDataDict, PlaceholderRange +from .video import try_import_video_packages logger = init_logger(__name__) @@ -330,60 +331,6 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: return _load_image_from_bytes(base64.b64decode(image)) -def rescale_image_size(image: Image.Image, - size_factor: float, - transpose: int = -1) -> Image.Image: - """Rescale the dimensions of an image by a constant factor.""" - new_width = int(image.width * size_factor) - new_height = int(image.height * size_factor) - image = image.resize((new_width, new_height)) - if transpose >= 0: - image = image.transpose(Image.Transpose(transpose)) - return image - - -def try_import_video_packages() -> Any: - try: - import cv2 - import decord - except ImportError as exc: - raise ImportError( - "Please install vllm[video] for video support.") from exc - return cv2, decord - - -def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray: - cv2, _ = try_import_video_packages() - - num_frames, _, _, channels = frames.shape - new_height, new_width = size - resized_frames = np.empty((num_frames, new_height, new_width, channels), - dtype=frames.dtype) - for i, frame in enumerate(frames): - resized_frame = cv2.resize(frame, (new_width, new_height)) - resized_frames[i] = resized_frame - return resized_frames - - -def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray: - _, height, width, _ = frames.shape - new_height = int(height * size_factor) - new_width = int(width * size_factor) - - return resize_video(frames, (new_height, new_width)) - - -def sample_frames_from_video(frames: npt.NDArray, - num_frames: int) -> npt.NDArray: - total_frames = frames.shape[0] - if num_frames == -1: - return frames - else: - frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) - sampled_frames = frames[frame_indices, ...] - return sampled_frames - - def encode_video_base64(frames: npt.NDArray): base64_frames = [] frames_list = [frames[i] for i in range(frames.shape[0])] diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index ba9bf58a4a20..0856886d31dc 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional import numpy as np +import numpy.typing as npt from vllm.inputs.registry import InputContext from vllm.logger import init_logger @@ -75,3 +76,45 @@ def _default_input_mapper( def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: return 4096 + + +def try_import_video_packages() -> Any: + try: + import cv2 + import decord + except ImportError as exc: + raise ImportError( + "Please install vllm[video] for video support.") from exc + return cv2, decord + + +def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray: + cv2, _ = try_import_video_packages() + + num_frames, _, _, channels = frames.shape + new_height, new_width = size + resized_frames = np.empty((num_frames, new_height, new_width, channels), + dtype=frames.dtype) + for i, frame in enumerate(frames): + resized_frame = cv2.resize(frame, (new_width, new_height)) + resized_frames[i] = resized_frame + return resized_frames + + +def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray: + _, height, width, _ = frames.shape + new_height = int(height * size_factor) + new_width = int(width * size_factor) + + return resize_video(frames, (new_height, new_width)) + + +def sample_frames_from_video(frames: npt.NDArray, + num_frames: int) -> npt.NDArray: + total_frames = frames.shape[0] + if num_frames == -1: + return frames + else: + frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) + sampled_frames = frames[frame_indices, ...] + return sampled_frames \ No newline at end of file From b384a4cb58c379b0d521ad35b942ae6f04c78c27 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Dec 2024 15:04:10 +0000 Subject: [PATCH 2/3] Cleanup Signed-off-by: DarkLight1337 --- vllm/multimodal/video.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 0856886d31dc..7f967d0afa52 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -114,7 +114,7 @@ def sample_frames_from_video(frames: npt.NDArray, total_frames = frames.shape[0] if num_frames == -1: return frames - else: - frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) - sampled_frames = frames[frame_indices, ...] - return sampled_frames \ No newline at end of file + + frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) + sampled_frames = frames[frame_indices, ...] + return sampled_frames From fdf13b0b136680e64cff4797e6307e90d320224f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Dec 2024 15:15:12 +0000 Subject: [PATCH 3/3] Move audio util Signed-off-by: DarkLight1337 --- vllm/multimodal/audio.py | 12 ++++++++++++ vllm/multimodal/utils.py | 13 ++----------- vllm/multimodal/video.py | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index c92deddbcb25..314d21b74623 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,3 +1,5 @@ +from typing import Any + import numpy as np import numpy.typing as npt @@ -26,6 +28,16 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: "There is no default maximum multimodal tokens") +def try_import_audio_packages() -> tuple[Any, Any]: + try: + import librosa + import soundfile + except ImportError as exc: + raise ImportError( + "Please install vllm[audio] for audio support.") from exc + return librosa, soundfile + + def resample_audio( audio: npt.NDArray[np.floating], *, diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index da1110dce5b4..12d3823eb976 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -2,7 +2,7 @@ import os from functools import lru_cache from io import BytesIO -from typing import Any, List, Optional, Tuple, TypeVar, Union +from typing import List, Optional, Tuple, TypeVar, Union import numpy as np import numpy.typing as npt @@ -14,6 +14,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from .audio import try_import_audio_packages from .inputs import MultiModalDataDict, PlaceholderRange from .video import try_import_video_packages @@ -205,16 +206,6 @@ async def async_fetch_video(video_url: str, return video -def try_import_audio_packages() -> Tuple[Any, Any]: - try: - import librosa - import soundfile - except ImportError as exc: - raise ImportError( - "Please install vllm[audio] for audio support.") from exc - return librosa, soundfile - - def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: """ Load audio from a URL. diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 7f967d0afa52..bfcdef70718b 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -78,7 +78,7 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: return 4096 -def try_import_video_packages() -> Any: +def try_import_video_packages() -> tuple[Any, Any]: try: import cv2 import decord