diff --git a/docs/source/en/auto_docstring.md b/docs/source/en/auto_docstring.md index e6c753419978..97a1c7d0ac92 100644 --- a/docs/source/en/auto_docstring.md +++ b/docs/source/en/auto_docstring.md @@ -292,7 +292,7 @@ The `@auto_docstring` decorator automatically generates docstrings by: 8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentations from the `TypedDict` and adds each parameter to the function's docstring. - Currently only supported for [`FastImageProcessorKwargs`]. + Currently only supported for [`ImagesKwargs`]. ## Best practices diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 52b798c09f84..3227b08cf031 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -20,7 +20,8 @@ from .image_processing_base import BatchFeature, ImageProcessingMixin from .image_transforms import center_crop, normalize, rescale -from .image_utils import ChannelDimension, get_image_size +from .image_utils import ChannelDimension, ImageInput, get_image_size +from .processing_utils import ImagesKwargs, Unpack from .utils import logging from .utils.import_utils import requires @@ -36,6 +37,8 @@ @requires(backends=("vision",)) class BaseImageProcessor(ImageProcessingMixin): + valid_kwargs = ImagesKwargs + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -46,9 +49,9 @@ def is_fast(self) -> bool: """ return False - def __call__(self, images, **kwargs) -> BatchFeature: + def __call__(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature: """Preprocess an image or a batch of images.""" - return self.preprocess(images, **kwargs) + return self.preprocess(images, *args, **kwargs) def preprocess(self, images, **kwargs) -> BatchFeature: raise NotImplementedError("Each image processor must implement its own preprocess method") diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 4dfa7f08b0db..a9f6900a1046 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,7 +15,7 @@ from collections.abc import Iterable from copy import deepcopy from functools import lru_cache, partial -from typing import Any, Optional, TypedDict, Union +from typing import Any, Optional, Union import numpy as np @@ -40,7 +40,7 @@ validate_kwargs, validate_preprocess_arguments, ) -from .processing_utils import Unpack +from .processing_utils import ImagesKwargs, Unpack from .utils import ( TensorType, auto_docstring, @@ -163,28 +163,6 @@ def divide_to_patches( return patches -class DefaultFastImageProcessorKwargs(TypedDict, total=False): - do_resize: Optional[bool] - size: Optional[dict[str, int]] - default_to_square: Optional[bool] - resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] - do_center_crop: Optional[bool] - crop_size: Optional[dict[str, int]] - do_rescale: Optional[bool] - rescale_factor: Optional[Union[int, float]] - do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] - do_pad: Optional[bool] - pad_size: Optional[dict[str, int]] - do_convert_rgb: Optional[bool] - return_tensors: Optional[Union[str, TensorType]] - data_format: 
Optional[ChannelDimension] - input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional["torch.device"] - disable_grouping: Optional[bool] - - @auto_docstring class BaseImageProcessorFast(BaseImageProcessor): resample = None @@ -206,10 +184,10 @@ class BaseImageProcessorFast(BaseImageProcessor): input_data_format = None device = None model_input_names = ["pixel_values"] - valid_kwargs = DefaultFastImageProcessorKwargs + valid_kwargs = ImagesKwargs unused_kwargs = None - def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[ImagesKwargs]): super().__init__(**kwargs) kwargs = self.filter_out_unused_kwargs(kwargs) size = kwargs.pop("size", self.size) @@ -728,11 +706,8 @@ def _validate_preprocess_kwargs( data_format=data_format, ) - def __call__(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: - return self.preprocess(images, *args, **kwargs) - @auto_docstring - def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature: # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names) # Set default kwargs from self. This ensures that if a kwarg is not provided @@ -765,7 +740,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[DefaultFastImageProcessorKwargs], + **kwargs: Unpack[ImagesKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. 
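# --- Illustrative sketch (not part of the patch) of the pattern this refactor standardizes on ---
# With `DefaultFastImageProcessorKwargs` removed, a fast image processor that needs extra
# options subclasses `ImagesKwargs` from `processing_utils`, points `valid_kwargs` at it, and
# types `__init__`/`preprocess` with `Unpack[...]` so `@auto_docstring` can unroll the documented
# kwargs. The names `ExampleImageProcessorKwargs`/`ExampleImageProcessorFast` and the `sharpness`
# option are hypothetical and do not correspond to any model touched in this diff.
from typing import Optional

from transformers.image_processing_utils import BatchFeature
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from transformers.image_utils import ImageInput
from transformers.processing_utils import ImagesKwargs, Unpack
from transformers.utils import auto_docstring


class ExampleImageProcessorKwargs(ImagesKwargs):
    r"""
    sharpness (`float`, *optional*):
        Hypothetical model-specific option; documenting it here lets `@auto_docstring`
        surface it in the generated `preprocess` docstring.
    """

    sharpness: Optional[float]


@auto_docstring
class ExampleImageProcessorFast(BaseImageProcessorFast):
    do_resize = True
    size = {"height": 224, "width": 224}
    do_rescale = True
    do_normalize = False
    # Sharing one kwargs class between the slow and fast processors is what the diff below
    # does for BEiT, BridgeTower, ConvNext, the DETR family, etc.
    valid_kwargs = ExampleImageProcessorKwargs

    def __init__(self, **kwargs: Unpack[ExampleImageProcessorKwargs]):
        super().__init__(**kwargs)

    @auto_docstring
    def preprocess(self, images: ImageInput, **kwargs: Unpack[ExampleImageProcessorKwargs]) -> BatchFeature:
        # A real implementation would also override `_preprocess` to consume `sharpness`.
        return super().preprocess(images, **kwargs)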
diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 749a4c036ed1..e0c2b67fcc90 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -959,8 +959,6 @@ def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], images: Optional[ImageInput] = None, - audio=None, - videos=None, **kwargs: Unpack[AriaProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index 9264776e80fd..976d2b983ee9 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -85,8 +85,6 @@ def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], images: Optional[ImageInput] = None, - audio=None, - videos=None, **kwargs: Unpack[AriaProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index aaede4e8e80e..882a85d40946 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -19,18 +19,11 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput, make_flat_list_of_images -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -class AyaVisionImagesKwargs(ImagesKwargs, total=False): - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - - class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: AyaVisionImagesKwargs _defaults = { "text_kwargs": { "padding_side": "left", @@ -140,8 +133,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[AyaVisionProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 984eac3bf67e..f65709168379 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -54,6 +55,17 @@ logger = logging.get_logger(__name__) +class BeitImageProcessorKwargs(ImagesKwargs): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
+ """ + + do_reduce_labels: Optional[bool] + + @requires(backends=("vision",)) class BeitImageProcessor(BaseImageProcessor): r""" @@ -99,6 +111,7 @@ class BeitImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = BeitImageProcessorKwargs @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS) def __init__( diff --git a/src/transformers/models/beit/image_processing_beit_fast.py b/src/transformers/models/beit/image_processing_beit_fast.py index 7ff894127ecd..5d89120283a5 100644 --- a/src/transformers/models/beit/image_processing_beit_fast.py +++ b/src/transformers/models/beit/image_processing_beit_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -40,17 +39,7 @@ TensorType, auto_docstring, ) - - -class BeitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: Optional[bool] +from .image_processing_beit import BeitImageProcessorKwargs @auto_docstring @@ -66,9 +55,9 @@ class BeitImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_reduce_labels = False - valid_kwargs = BeitFastImageProcessorKwargs + valid_kwargs = BeitImageProcessorKwargs - def __init__(self, **kwargs: Unpack[BeitFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[BeitImageProcessorKwargs]): super().__init__(**kwargs) def reduce_label(self, labels: list["torch.Tensor"]): @@ -86,7 +75,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[BeitFastImageProcessorKwargs], + **kwargs: Unpack[BeitImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -101,7 +90,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[BeitFastImageProcessorKwargs], + **kwargs: Unpack[BeitImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. 
diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 4ac741f84f46..7fc154f3483e 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -36,7 +36,6 @@ class BlipProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": {}, } @@ -67,8 +66,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[BlipProcessorKwargs], ) -> BatchEncoding: """ diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 71f79583c77e..abbbeb6ae0a4 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -41,7 +41,6 @@ class Blip2ProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": {}, } @@ -81,8 +80,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Blip2ProcessorKwargs], ) -> BatchEncoding: """ diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 75b4e2b4238c..cad23d02893f 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -35,6 +35,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -122,6 +123,10 @@ def get_resize_output_image_size( return new_height, new_width +class BridgeTowerImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + class BridgeTowerImageProcessor(BaseImageProcessor): r""" Constructs a BridgeTower image processor. @@ -169,6 +174,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = BridgeTowerImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py b/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py index 5be6f9f6c54b..76a76b4b0a47 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py @@ -23,7 +23,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, ImageInput, SizeDict, TensorType, @@ -33,6 +32,7 @@ ) from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling from ...utils import auto_docstring +from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs def make_pixel_mask( @@ -85,17 +85,6 @@ def get_resize_output_image_size( return new_height, new_width -class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - size_divisor (`int`, *optional*, defaults to 32): - The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` - is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. 
- """ - - size_divisor: Optional[int] - - @auto_docstring class BridgeTowerImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -110,14 +99,14 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast): do_normalize = True do_pad = True size_divisor = 32 - valid_kwargs = BridgeTowerFastImageProcessorKwargs + valid_kwargs = BridgeTowerImageProcessorKwargs model_input_names = ["pixel_values", "pixel_mask"] - def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 6d7059c4c5a5..030c578c49cd 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -16,17 +16,10 @@ Processor class for BridgeTower. """ -from typing import Optional - -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin - - -class BridgeTowerImagesKwargs(ImagesKwargs): - size_divisor: Optional[int] +from ...processing_utils import ProcessingKwargs, ProcessorMixin class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: BridgeTowerImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index bf4441c00a2e..247f72322a2d 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -92,8 +92,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[ChameleonProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py index 322e98dbd0f5..358d84ac6d7c 100644 --- a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -27,18 +27,13 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - group_images_by_shape, - reorder_images, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. 
Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py index 7ef20305b99e..2b7867d0eae3 100644 --- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -30,8 +30,10 @@ from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast from ...cache_utils import Cache +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TransformersKwargs, auto_docstring, logging from ...utils.generic import check_model_inputs from .configuration_cohere2_vision import Cohere2VisionConfig @@ -301,6 +303,24 @@ def get_optimal_tiled_canvas( return best_grid +class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
+ """ + + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + @auto_docstring class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast): size = {"height": 512, "width": 512} @@ -308,6 +328,14 @@ class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast): max_patches = 12 crop_to_patches = True patch_size = 16 + valid_kwargs = Cohere2VisionFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) __all__ = [ diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py index cde77af658bc..d4fcec4da875 100644 --- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -19,16 +19,11 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -class Cohere2VisionImagesKwargs(ImagesKwargs, total=False): - max_patches: Optional[int] - - class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Cohere2VisionImagesKwargs _defaults = { "text_kwargs": { "padding_side": "left", diff --git a/src/transformers/models/colpali/modular_colpali.py b/src/transformers/models/colpali/modular_colpali.py index 8136f560f18e..176b3e6a15ee 100644 --- a/src/transformers/models/colpali/modular_colpali.py +++ b/src/transformers/models/colpali/modular_colpali.py @@ -90,8 +90,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[ColPaliProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index 1d76a74e1ab8..032cc70d4482 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -131,8 +131,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[ColPaliProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py index a9a1f8ce3e1e..adea1617e459 100644 --- a/src/transformers/models/colqwen2/modular_colqwen2.py +++ b/src/transformers/models/colqwen2/modular_colqwen2.py @@ -93,8 +93,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[ColQwen2ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index e8f7e057247c..2eb9fed873a8 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ 
b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -94,8 +94,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[ColQwen2ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index cf506b834918..163224edb34f 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -53,6 +53,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, is_scipy_available, @@ -774,6 +775,29 @@ def compute_segments( return segmentation, segments +class ConditionalDetrImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. 
+ """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + @requires(backends=("vision",)) class ConditionalDetrImageProcessor(BaseImageProcessor): r""" @@ -829,6 +853,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = ConditionalDetrImageProcessorKwargs # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ def __init__( diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py index 351d4fa1470f..4c5b8602c0cc 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py @@ -15,7 +15,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -37,6 +36,7 @@ from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires from .image_processing_conditional_detr import ( + ConditionalDetrImageProcessorKwargs, compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, @@ -46,24 +46,6 @@ logger = logging.get_logger(__name__) - -class ConditionalDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. 
- """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -278,9 +260,9 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = ConditionalDetrFastImageProcessorKwargs + valid_kwargs = ConditionalDetrImageProcessorKwargs - def __init__(self, **kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -542,25 +524,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs], + **kwargs: Unpack[ConditionalDetrImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -575,7 +540,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index ae0be69a5621..d2e180de2464 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -38,6 +38,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging from ...utils.import_utils import requires @@ -49,6 +50,16 @@ logger = logging.get_logger(__name__) +class ConvNextImageProcessorKwargs(ImagesKwargs): + """ + crop_pct (`float`, *optional*): + Percentage of the image to crop. Only has an effect if size < 384. Can be + overridden by `crop_pct` in the`preprocess` method. 
+ """ + + crop_pct: Optional[float] + + @requires(backends=("vision",)) class ConvNextImageProcessor(BaseImageProcessor): r""" @@ -87,6 +98,7 @@ class ConvNextImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = ConvNextImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/convnext/image_processing_convnext_fast.py b/src/transformers/models/convnext/image_processing_convnext_fast.py index 3ab00c0fd091..035b92f8b7d2 100644 --- a/src/transformers/models/convnext/image_processing_convnext_fast.py +++ b/src/transformers/models/convnext/image_processing_convnext_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -39,16 +38,7 @@ TensorType, auto_docstring, ) - - -class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - crop_pct (`float`, *optional*): - Percentage of the image to crop. Only has an effect if size < 384. Can be - overridden by `crop_pct` in the`preprocess` method. - """ - - crop_pct: Optional[float] +from .image_processing_convnext import ConvNextImageProcessorKwargs @auto_docstring @@ -62,13 +52,13 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True crop_pct = 224 / 256 - valid_kwargs = ConvNextFastImageProcessorKwargs + valid_kwargs = ConvNextImageProcessorKwargs - def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[ConvNextImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index cbf7e44aa8d3..172016f6431d 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -246,9 +246,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = text_kwargs.get("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index 45f4fd2bdb93..c41ac586753e 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -38,6 +38,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -48,6 +49,16 @@ logger = logging.get_logger(__name__) +class DeepseekVLImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. 
+ """ + + min_size: int + + class DeepseekVLImageProcessor(BaseImageProcessor): r""" Constructs a DEEPSEEK_VL image processor. @@ -90,6 +101,8 @@ class DeepseekVLImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] + valid_kwargs = DeepseekVLImageProcessorKwargs + def __init__( self, do_resize: bool = True, diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py index 896e91f0692c..6eaa15d827d9 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py @@ -24,25 +24,11 @@ import torch.nn.functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - group_images_by_shape, - reorder_images, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - """ - - min_size: int +from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs @auto_docstring @@ -56,9 +42,9 @@ class DeepseekVLImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_pad = True - valid_kwargs = DeepseekVLFastImageProcessorKwargs + valid_kwargs = DeepseekVLImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DeepseekVLImageProcessorKwargs]): super().__init__(**kwargs) if kwargs.get("image_mean") is None: background_color = (127, 127, 127) diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 241c12923bdb..8b93f7fa6c94 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -39,6 +39,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -49,6 +50,32 @@ logger = logging.get_logger(__name__) +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. 
+ high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + """ + + min_size: int + high_res_size: dict + high_res_resample: "PILImageResampling" + high_res_image_mean: list[float] + high_res_image_std: list[float] + + class DeepseekVLHybridImageProcessor(BaseImageProcessor): r""" Constructs a DEEPSEEK_VL_HYBRID image processor. @@ -102,6 +129,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "high_res_pixel_values"] + valid_kwargs = DeepseekVLHybridImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py index c04e006e358d..ff5e7f2e3c73 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py @@ -26,7 +26,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, get_size_dict, group_images_by_shape, reorder_images, @@ -41,32 +40,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): - Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` - method. - high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be - overridden by the `high_res_resample` parameter in the `preprocess` method. - high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): - Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. - high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): - Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
- """ - - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] +from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs @auto_docstring @@ -80,14 +54,14 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_pad = True - valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs + valid_kwargs = DeepseekVLHybridImageProcessorKwargs high_res_image_mean = OPENAI_CLIP_MEAN high_res_image_std = OPENAI_CLIP_STD high_res_size = {"height": 1024, "width": 1024} high_res_resample = PILImageResampling.BICUBIC model_input_names = ["pixel_values", "high_res_pixel_values"] - def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]): if kwargs.get("image_mean") is None: background_color = (127, 127, 127) else: diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 18b416a57df2..1507c9f3d028 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -22,7 +22,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, get_size_dict, group_images_by_shape, reorder_images, @@ -43,7 +42,7 @@ valid_images, validate_preprocess_arguments, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...tokenization_utils_base import ( PreTokenizedInput, TextInput, @@ -430,6 +429,32 @@ def prepare_inputs_for_generation( return model_inputs +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + """ + + min_size: int + high_res_size: dict + high_res_resample: "PILImageResampling" + high_res_image_mean: list[float] + high_res_image_std: list[float] + + class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): r""" Constructs a DEEPSEEK_VL_HYBRID image processor. 
@@ -483,6 +508,7 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): """ model_input_names = ["pixel_values", "high_res_pixel_values"] + valid_kwargs = DeepseekVLHybridImageProcessorKwargs def __init__( self, @@ -727,32 +753,6 @@ def preprocess( return BatchFeature(data=data, tensor_type=return_tensors) -class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): - Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` - method. - high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be - overridden by the `high_res_resample` parameter in the `preprocess` method. - high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): - Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. - high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): - Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. - """ - - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] - - class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast): high_res_image_mean = OPENAI_CLIP_MEAN high_res_image_std = OPENAI_CLIP_STD @@ -760,7 +760,7 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast): high_res_resample = PILImageResampling.BICUBIC model_input_names = ["pixel_values", "high_res_pixel_values"] - def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]): if kwargs.get("image_mean") is None: background_color = (127, 127, 127) else: diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index ef028eda1ed1..8249c079f5fa 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -53,6 +53,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, is_scipy_available, @@ -79,6 +80,30 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +class DeformableDetrImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. 
Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -827,6 +852,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = DeformableDetrImageProcessorKwargs # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ def __init__( diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py index 8458d02d58a5..916ad3dee0e6 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py @@ -14,7 +14,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -35,29 +34,11 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires -from .image_processing_deformable_detr import get_size_with_aspect_ratio +from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs, get_size_with_aspect_ratio logger = logging.get_logger(__name__) - -class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. 
- """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -272,9 +253,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = DeformableDetrFastImageProcessorKwargs + valid_kwargs = DeformableDetrImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[DeformableDetrImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -536,25 +517,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs], + **kwargs: Unpack[DeformableDetrImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -569,7 +533,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 7a2e67f83de6..5e0622601ac9 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -52,6 +52,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, is_scipy_available, @@ -82,6 +83,29 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class DetrImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. 
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + # From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76 def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -811,6 +835,7 @@ class DetrImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = DetrImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index a2ac8d03eed3..190d01ab5590 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -28,7 +28,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -54,6 +53,7 @@ ) from ...utils.import_utils import requires from .image_processing_detr import ( + DetrImageProcessorKwargs, compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, @@ -263,23 +263,6 @@ def prepare_coco_panoptic_annotation( return new_target -class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. 
- """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - @auto_docstring @requires(backends=("torchvision", "torch")) class DetrImageProcessorFast(BaseImageProcessorFast): @@ -294,9 +277,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = DetrFastImageProcessorKwargs + valid_kwargs = DetrImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -558,25 +541,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[DetrFastImageProcessorKwargs], + **kwargs: Unpack[DetrImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. 
- """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -591,7 +557,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 402f5152a64b..812a4149cb3f 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -111,9 +111,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = text_kwargs.get("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index f49cc964080d..5af365099724 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging from ...utils.import_utils import is_vision_available, requires @@ -51,6 +52,18 @@ import PIL +class DonutImageProcessorKwargs(ImagesKwargs): + """ + do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`): + Whether to resize the image using thumbnail method. + do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`): + Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. + """ + + do_thumbnail: Optional[bool] + do_align_long_axis: Optional[bool] + + @requires(backends=("vision",)) class DonutImageProcessor(BaseImageProcessor): r""" @@ -90,6 +103,7 @@ class DonutImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = DonutImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/donut/image_processing_donut_fast.py b/src/transformers/models/donut/image_processing_donut_fast.py index 29e06831b1b4..9a150f4df75f 100644 --- a/src/transformers/models/donut/image_processing_donut_fast.py +++ b/src/transformers/models/donut/image_processing_donut_fast.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack @@ -28,24 +28,12 @@ auto_docstring, logging, ) +from .image_processing_donut import DonutImageProcessorKwargs logger = logging.get_logger(__name__) -class DonutFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`): - Whether to resize the image using thumbnail method. 
- do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`): - Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. - """ - - do_thumbnail: Optional[bool] - do_align_long_axis: Optional[bool] - - @auto_docstring class DonutImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -58,9 +46,9 @@ class DonutImageProcessorFast(BaseImageProcessorFast): do_thumbnail = True do_align_long_axis = False do_pad = True - valid_kwargs = DonutFastImageProcessorKwargs + valid_kwargs = DonutImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DonutImageProcessorKwargs]): size = kwargs.pop("size", None) if isinstance(size, (tuple, list)): size = size[::-1] @@ -68,7 +56,7 @@ def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutImageProcessorKwargs]) -> BatchFeature: if "size" in kwargs: size = kwargs.pop("size") if isinstance(size, (tuple, list)): diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index c75e2fcaa542..a545c90539b9 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -74,8 +74,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[DonutProcessorKwargs], ): """ diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 0ec3eaed1c43..3ba5a6e30c21 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -44,6 +44,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -63,6 +64,26 @@ logger = logging.get_logger(__name__) +class DPTImageProcessorKwargs(ImagesKwargs): + """ + ensure_multiple_of (`int`, *optional*, defaults to 1): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden + by `ensure_multiple_of` in `preprocess`. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can + be overridden by `keep_aspect_ratio` in `preprocess`. + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
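+    size_divisor (`int`, *optional*):
+        If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
+        DINOv2 paper, which uses the model in combination with DPT.
+
+    Example (illustrative sketch; `Intel/dpt-large` is a placeholder for any DPT checkpoint):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import AutoImageProcessor
+
+    >>> image = np.zeros((480, 640, 3), dtype=np.uint8)
+    >>> processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
+    >>> inputs = processor(image, keep_aspect_ratio=True, ensure_multiple_of=32, return_tensors="pt")
+    >>> list(inputs.keys())
+    ['pixel_values']
+    ```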
+ """ + + ensure_multiple_of: Optional[int] + size_divisor: Optional[int] + keep_aspect_ratio: Optional[bool] + do_reduce_labels: Optional[bool] + + def get_resize_output_image_size( input_image: np.ndarray, output_size: Union[int, Iterable[int]], @@ -151,6 +172,7 @@ class DPTImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = DPTImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/dpt/image_processing_dpt_fast.py b/src/transformers/models/dpt/image_processing_dpt_fast.py index 892ddd7c3d6f..ba0a6d28c56c 100644 --- a/src/transformers/models/dpt/image_processing_dpt_fast.py +++ b/src/transformers/models/dpt/image_processing_dpt_fast.py @@ -28,7 +28,7 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_base import BatchFeature -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import ( IMAGENET_STANDARD_MEAN, @@ -41,35 +41,13 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, requires_backends +from .image_processing_dpt import DPTImageProcessorKwargs if TYPE_CHECKING: from ...modeling_outputs import DepthEstimatorOutput -class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - ensure_multiple_of (`int`, *optional*, defaults to 1): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden - by `ensure_multiple_of` in `preprocess`. - size_divisor (`int`, *optional*): - If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the - DINOv2 paper, which uses the model in combination with DPT. - keep_aspect_ratio (`bool`, *optional*, defaults to `False`): - If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can - be overridden by `keep_aspect_ratio` in `preprocess`. - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. 
- """ - - ensure_multiple_of: Optional[int] - size_divisor: Optional[int] - keep_aspect_ratio: Optional[bool] - do_reduce_labels: Optional[bool] - - def get_resize_output_image_size( input_image: "torch.Tensor", output_size: Union[int, Iterable[int]], @@ -123,13 +101,13 @@ class DPTImageProcessorFast(BaseImageProcessorFast): do_normalize = True do_reduce_labels = None - valid_kwargs = DPTFastImageProcessorKwargs + valid_kwargs = DPTImageProcessorKwargs do_pad = False rescale_factor = 1 / 255 ensure_multiple_of = 1 keep_aspect_ratio = False - def __init__(self, **kwargs: Unpack[DPTFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DPTImageProcessorKwargs]): super().__init__(**kwargs) def reduce_label(self, labels: list["torch.Tensor"]): @@ -147,7 +125,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[DPTFastImageProcessorKwargs], + **kwargs: Unpack[DPTImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -162,7 +140,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[DPTFastImageProcessorKwargs], + **kwargs: Unpack[DPTImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/dpt/modular_dpt.py b/src/transformers/models/dpt/modular_dpt.py index 34eb08f39b68..241d8de122b2 100644 --- a/src/transformers/models/dpt/modular_dpt.py +++ b/src/transformers/models/dpt/modular_dpt.py @@ -21,7 +21,7 @@ import torch from ...image_processing_base import BatchFeature -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import ( IMAGENET_STANDARD_MEAN, @@ -35,6 +35,7 @@ requires_backends, ) from ..beit.image_processing_beit_fast import BeitImageProcessorFast +from .image_processing_dpt import DPTImageProcessorKwargs if TYPE_CHECKING: @@ -82,29 +83,6 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): return SizeDict(height=new_height, width=new_width) -class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - ensure_multiple_of (`int`, *optional*, defaults to 1): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden - by `ensure_multiple_of` in `preprocess`. - size_divisor (`int`, *optional*): - If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the - DINOv2 paper, which uses the model in combination with DPT. - keep_aspect_ratio (`bool`, *optional*, defaults to `False`): - If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can - be overridden by `keep_aspect_ratio` in `preprocess`. - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. 
- """ - - ensure_multiple_of: Optional[int] - size_divisor: Optional[int] - keep_aspect_ratio: Optional[bool] - do_reduce_labels: Optional[bool] - - @auto_docstring class DPTImageProcessorFast(BeitImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -123,7 +101,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast): do_center_crop = None do_reduce_labels = None - valid_kwargs = DPTFastImageProcessorKwargs + valid_kwargs = DPTImageProcessorKwargs def resize( self, diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py index 5b87278683ac..d1beabb6c2b9 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py @@ -34,6 +34,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging, requires_backends @@ -49,6 +50,15 @@ logger = logging.get_logger(__name__) +class EfficientLoFTRImageProcessorKwargs(ImagesKwargs): + r""" + do_grayscale (`bool`, *optional*, defaults to `True`): + Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. + """ + + do_grayscale: Optional[bool] = True + + # Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale def is_grayscale( image: np.ndarray, @@ -155,6 +165,7 @@ class EfficientLoFTRImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = EfficientLoFTRImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py index 1463ef405f37..994a10f04ee1 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -40,6 +39,7 @@ TensorType, auto_docstring, ) +from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs if TYPE_CHECKING: @@ -108,15 +108,6 @@ def convert_to_grayscale( return F.rgb_to_grayscale(image, num_output_channels=3) -class EfficientLoFTRFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_grayscale (`bool`, *optional*, defaults to `True`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
- """ - - do_grayscale: Optional[bool] = True - - @auto_docstring class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -126,13 +117,13 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast): do_rescale = True rescale_factor = 1 / 255 do_normalize = None - valid_kwargs = EfficientLoFTRFastImageProcessorKwargs + valid_kwargs = EfficientLoFTRImageProcessorKwargs - def __init__(self, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _prepare_images_structure( diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index eaad420b31f8..f5a69eff70e4 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -43,6 +44,18 @@ logger = logging.get_logger(__name__) +class EfficientNetImageProcessorKwargs(ImagesKwargs): + """ + rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`): + Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range]. + include_top (`bool`, *optional*, defaults to `self.include_top`): + Normalize the image again with the standard deviation only for image classification if set to True. + """ + + rescale_offset: bool + include_top: bool + + class EfficientNetImageProcessor(BaseImageProcessor): r""" Constructs a EfficientNet image processor. @@ -83,6 +96,7 @@ class EfficientNetImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = EfficientNetImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py index 77e787614a10..5f3439aaa273 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py @@ -20,7 +20,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack @@ -28,19 +28,7 @@ TensorType, auto_docstring, ) - - -class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`): - Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range]. 
- include_top (`bool`, *optional*, defaults to `self.include_top`): - Normalize the image again with the standard deviation only for image classification if set to True. - """ - - rescale_offset: bool - include_top: bool +from .image_processing_efficientnet import EfficientNetImageProcessorKwargs @auto_docstring @@ -57,9 +45,9 @@ class EfficientNetImageProcessorFast(BaseImageProcessorFast): rescale_offset = False do_normalize = True include_top = True - valid_kwargs = EfficientNetFastImageProcessorKwargs + valid_kwargs = EfficientNetImageProcessorKwargs - def __init__(self, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[EfficientNetImageProcessorKwargs]): super().__init__(**kwargs) def rescale( @@ -195,7 +183,7 @@ def _preprocess( return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index b876a9de96bf..fca5316a3fca 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -46,6 +47,11 @@ logger = logging.get_logger(__name__) +class Emu3ImageProcessorKwargs(ImagesKwargs): + ratio: Optional[str] + image_area: Optional[int] + + def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -108,6 +114,7 @@ class Emu3ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_sizes"] + valid_kwargs = Emu3ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index ef2681d2385b..b7ed8e9074f0 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -20,7 +20,7 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_vision_available @@ -33,14 +33,8 @@ class Emu3TextKwargs(TextKwargs, total=False): return_for_image_generation: bool -class Emu3ImagesKwargs(ImagesKwargs, total=False): - ratio: str - image_area: int - - class Emu3ProcessorKwargs(ProcessingKwargs, total=False): text_kwargs: Emu3TextKwargs - images_kwargs: Emu3ImagesKwargs _defaults = { "text_kwargs": { "return_for_image_generation": False, @@ -95,8 +89,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[Emu3ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py 
index b8e0058eee8d..189aaaf41d4d 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -36,6 +36,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -53,6 +54,21 @@ import torch.nn.functional as F +class EomtImageProcessorKwargs(ImagesKwargs): + """ + do_split_image (`bool`, *optional*, defaults to `False`): + Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the + input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. + Otherwise, the input images will be padded to the target size. + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + """ + + do_split_image: bool + ignore_index: Optional[int] = None + + # Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks def convert_segmentation_map_to_binary_masks( segmentation_map: np.ndarray, diff --git a/src/transformers/models/eomt/image_processing_eomt_fast.py b/src/transformers/models/eomt/image_processing_eomt_fast.py index ca80231d3a76..68fd7bb00744 100644 --- a/src/transformers/models/eomt/image_processing_eomt_fast.py +++ b/src/transformers/models/eomt/image_processing_eomt_fast.py @@ -24,7 +24,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -43,6 +42,7 @@ filter_out_non_signature_kwargs, ) from .image_processing_eomt import ( + EomtImageProcessorKwargs, compute_segments, convert_segmentation_map_to_binary_masks, get_size_with_aspect_ratio, @@ -50,25 +50,6 @@ ) -class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs): - """ - do_split_image (`bool`, *optional*, defaults to `False`): - Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the - input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. - Otherwise, the input images will be padded to the target size. - do_pad (`bool`, *optional*, defaults to `False`): - Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest - number of patches in the batch. Padding will be applied to the bottom and right with zeros. - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. 
- """ - - do_split_image: bool - do_pad: bool - ignore_index: Optional[int] = None - - def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]: """Returns the height and width from a size dict.""" target_height = size_dict["shortest_edge"] @@ -102,9 +83,9 @@ class EomtImageProcessorFast(BaseImageProcessorFast): do_split_image = False do_pad = False ignore_index = None - valid_kwargs = EomtImageProcessorFastKwargs + valid_kwargs = EomtImageProcessorKwargs - def __init__(self, **kwargs: Unpack[EomtImageProcessorFastKwargs]): + def __init__(self, **kwargs: Unpack[EomtImageProcessorKwargs]): super().__init__(**kwargs) def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) -> tuple[list, list]: @@ -153,7 +134,7 @@ def preprocess( images: ImageInput, segmentation_maps: Optional[list[torch.Tensor]] = None, instance_id_to_semantic_id: Optional[dict[int, int]] = None, - **kwargs: Unpack[EomtImageProcessorFastKwargs], + **kwargs: Unpack[EomtImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -171,7 +152,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[EomtImageProcessorFastKwargs], + **kwargs: Unpack[EomtImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index 9d67ac841124..3c19a2405169 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging from ...utils.import_utils import requires @@ -56,6 +57,89 @@ LOGIT_LAPLACE_EPS: float = 0.1 +class FlavaImageProcessorKwargs(ImagesKwargs): + """ + return_image_mask (`bool`, *optional*, defaults to `False`): + Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`. + input_size_patches (`int`, *optional*, defaults to 14): + Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden + by the `input_size_patches` parameter in `preprocess`. + total_mask_patches (`int`, *optional*, defaults to 75): + Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in + `preprocess`. + mask_group_min_patches (`int`, *optional*, defaults to 16): + Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches` + parameter in `preprocess`. + mask_group_max_patches (`int`, *optional*): + Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches` + parameter in `preprocess`. + mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3): + Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter + in `preprocess`. + mask_group_max_aspect_ratio (`float`, *optional*): + Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter + in `preprocess`. + return_codebook_pixels (`bool`, *optional*, defaults to `False`): + Whether to return the codebook pixel values. 
+    codebook_do_resize (`bool`, *optional*, defaults to `True`):
+        Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
+        `codebook_do_resize` parameter in `preprocess`.
+    codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+        Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
+        `preprocess`.
+    codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+        Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
+        parameter in `preprocess`.
+    codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
+        Whether to crop the input for codebook at the center. If the input size is smaller than
+        `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
+        overridden by the `codebook_do_center_crop` parameter in `preprocess`.
+    codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+        Desired output size for codebook input when applying center-cropping. Can be overridden by the
+        `codebook_crop_size` parameter in `preprocess`.
+    codebook_do_rescale (`bool`, *optional*, defaults to `True`):
+        Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
+        overridden by the `codebook_do_rescale` parameter in `preprocess`.
+    codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+        Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
+        `codebook_rescale_factor` parameter in `preprocess`.
+    codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
+        Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
+        `codebook_do_map_pixels` parameter in `preprocess`.
+    codebook_do_normalize (`bool`, *optional*, defaults to `True`):
+        Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
+        be overridden by the `codebook_do_normalize` parameter in `preprocess`.
+    codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
+        The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
+        by the `codebook_image_mean` parameter in `preprocess`.
+    codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+        The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
+        be overridden by the `codebook_image_std` parameter in `preprocess`.
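+
+    Example (illustrative sketch; `facebook/flava-full` is a placeholder checkpoint):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import AutoImageProcessor
+
+    >>> image = np.zeros((224, 224, 3), dtype=np.uint8)
+    >>> processor = AutoImageProcessor.from_pretrained("facebook/flava-full")
+    >>> outputs = processor(image, return_image_mask=True, return_codebook_pixels=True, return_tensors="pt")
+    >>> sorted(outputs.keys())
+    ['bool_masked_pos', 'codebook_pixel_values', 'pixel_values']
+    ```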
+ """ + + # Mask related params + return_image_mask: Optional[bool] + input_size_patches: Optional[int] + total_mask_patches: Optional[int] + mask_group_min_patches: Optional[int] + mask_group_max_patches: Optional[int] + mask_group_min_aspect_ratio: Optional[float] + mask_group_max_aspect_ratio: Optional[float] + # Codebook related params + return_codebook_pixels: Optional[bool] + codebook_do_resize: Optional[bool] + codebook_size: Optional[bool] + codebook_resample: Optional[int] + codebook_do_center_crop: Optional[bool] + codebook_crop_size: Optional[int] + codebook_do_rescale: Optional[bool] + codebook_rescale_factor: Optional[Union[int, float]] + codebook_do_map_pixels: Optional[bool] + codebook_do_normalize: Optional[bool] + codebook_image_mean: Optional[Union[float, Iterable[float]]] + codebook_image_std: Optional[Union[float, Iterable[float]]] + + # Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py class FlavaMaskingGenerator: def __init__( @@ -225,6 +309,7 @@ class FlavaImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = FlavaImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/flava/image_processing_flava_fast.py b/src/transformers/models/flava/image_processing_flava_fast.py index 732d25e71f69..0dfbd07f17a7 100644 --- a/src/transformers/models/flava/image_processing_flava_fast.py +++ b/src/transformers/models/flava/image_processing_flava_fast.py @@ -16,7 +16,6 @@ import math import random -from collections.abc import Iterable from functools import lru_cache from typing import Any, Optional, Union @@ -26,7 +25,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, get_size_dict, ) from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images @@ -42,6 +40,7 @@ FLAVA_IMAGE_MEAN, FLAVA_IMAGE_STD, LOGIT_LAPLACE_EPS, + FlavaImageProcessorKwargs, ) @@ -121,90 +120,6 @@ def __call__(self): return mask -class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - return_image_mask (`bool`, *optional*, defaults to `False`): - Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`. - input_size_patches (`int`, *optional*, defaults to 14): - Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden - by the `input_size_patches` parameter in `preprocess`. - total_mask_patches (`int`, *optional*, defaults to 75): - Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in - `preprocess`. - mask_group_min_patches (`int`, *optional*, defaults to 16): - Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches` - parameter in `preprocess`. - mask_group_max_patches (`int`, *optional*): - Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches` - parameter in `preprocess`. - mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3): - Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter - in `preprocess`. - mask_group_max_aspect_ratio (`float`, *optional*): - Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter - in `preprocess`. 
- return_codebook_pixels (`bool`, *optional*, defaults to `False`): - Whether to return the codebook pixel values. - codebook_do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize` - parameter in `preprocess`. `codebook_size`. - codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): - Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in - `preprocess`. - codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`): - Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample` - parameter in `preprocess`. - codebook_do_center_crop (`bool`, *optional*, defaults to `True`): - Whether to crop the input for codebook at the center. If the input size is smaller than - `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be - overridden by the `codebook_do_center_crop` parameter in `preprocess`. - codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): - Desired output size for codebook input when applying center-cropping. Can be overridden by the - `codebook_crop_size` parameter in `preprocess`. - codebook_do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be - overridden by the `codebook_do_rescale` parameter in `preprocess`. - codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Defines the scale factor to use if rescaling the codebook image. Can be overridden by the - `codebook_rescale_factor` parameter in `preprocess`. - codebook_do_map_pixels (`bool`, *optional*, defaults to `True`): - Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the - `codebook_do_map_pixels` parameter in `preprocess`. - codebook_do_normalize (`bool`, *optional*, defaults to `True`): - Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can - be overridden by the `codebook_do_normalize` parameter in `preprocess`. - codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`): - The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden - by the `codebook_image_mean` parameter in `preprocess`. - codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`): - The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can - be overridden by the `codebook_image_std` parameter in `preprocess`. 
- """ - - # Mask related params - return_image_mask: Optional[bool] - input_size_patches: Optional[int] - total_mask_patches: Optional[int] - mask_group_min_patches: Optional[int] - mask_group_max_patches: Optional[int] - mask_group_min_aspect_ratio: Optional[float] - mask_group_max_aspect_ratio: Optional[float] - # Codebook related params - return_codebook_pixels: Optional[bool] - codebook_do_resize: Optional[bool] - codebook_size: Optional[bool] - codebook_resample: Optional[int] - codebook_do_center_crop: Optional[bool] - codebook_crop_size: Optional[int] - codebook_do_rescale: Optional[bool] - codebook_rescale_factor: Optional[Union[int, float]] - codebook_do_map_pixels: Optional[bool] - codebook_do_normalize: Optional[bool] - codebook_image_mean: Optional[Union[float, Iterable[float]]] - codebook_image_std: Optional[Union[float, Iterable[float]]] - - @auto_docstring class FlavaImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -239,13 +154,13 @@ class FlavaImageProcessorFast(BaseImageProcessorFast): codebook_do_normalize = True codebook_image_mean = FLAVA_CODEBOOK_MEAN codebook_image_std = FLAVA_CODEBOOK_STD - valid_kwargs = FlavaFastImageProcessorKwargs + valid_kwargs = FlavaImageProcessorKwargs - def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[FlavaImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[FlavaImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) @classmethod diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index ceebdb6efa49..8e8a806e8615 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -17,39 +17,8 @@ """ import warnings -from collections.abc import Iterable -from typing import Optional, Union -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin - - -class FlavaImagesKwargs(ImagesKwargs): - # Mask related params - return_image_mask: Optional[bool] - input_size_patches: Optional[int] - total_mask_patches: Optional[int] - mask_group_min_patches: Optional[int] - mask_group_max_patches: Optional[int] - mask_group_min_aspect_ratio: Optional[float] - mask_group_max_aspect_ratio: Optional[float] - # Codebook related params - return_codebook_pixels: Optional[bool] - codebook_do_resize: Optional[bool] - codebook_size: Optional[bool] - codebook_resample: Optional[int] - codebook_do_center_crop: Optional[bool] - codebook_crop_size: Optional[int] - codebook_do_rescale: Optional[bool] - codebook_rescale_factor: Optional[Union[int, float]] - codebook_do_map_pixels: Optional[bool] - codebook_do_normalize: Optional[bool] - codebook_image_mean: Optional[Union[float, Iterable[float]]] - codebook_image_std: Optional[Union[float, Iterable[float]]] - - -class FlavaProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: FlavaImagesKwargs - _defaults = {} +from ...processing_utils import ProcessorMixin class FlavaProcessor(ProcessorMixin): @@ -67,7 +36,6 @@ class FlavaProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "FlavaImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - valid_processor_kwargs = FlavaProcessorKwargs def __init__(self, 
image_processor=None, tokenizer=None, **kwargs): feature_extractor = None diff --git a/src/transformers/models/florence2/processing_florence2.py b/src/transformers/models/florence2/processing_florence2.py index 5ae0f4828bc1..1c25ddceeafc 100644 --- a/src/transformers/models/florence2/processing_florence2.py +++ b/src/transformers/models/florence2/processing_florence2.py @@ -39,7 +39,6 @@ class Florence2ProcessorKwargs(ProcessingKwargs, total=False): _defaults = { "text_kwargs": {"padding": False, "return_mm_token_type_ids": False}, - "images_kwargs": {}, } diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index debbcb23aac1..75b2bbad926e 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -71,7 +71,6 @@ class FuyuProcessorKwargs(ProcessingKwargs, total=False): "verbose": True, "return_mm_token_type_ids": False, }, - "images_kwargs": {}, } @@ -487,8 +486,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[FuyuProcessorKwargs], ) -> "FuyuBatchFeature": """ diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py index efa65a6d2bf2..5206a13a04a3 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3.py +++ b/src/transformers/models/gemma3/image_processing_gemma3.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -50,6 +51,24 @@ import PIL +class Gemma3ImageProcessorKwargs(ImagesKwargs): + """ + do_pan_and_scan (`bool`, *optional*): + Whether to apply `pan_and_scan` to images. + pan_and_scan_min_crop_size (`int`, *optional*): + Minimum size of each crop in pan and scan. + pan_and_scan_max_num_crops (`int`, *optional*): + Maximum number of crops per image in pan and scan. + pan_and_scan_min_ratio_to_activate (`float`, *optional*): + Minimum aspect ratio to activate pan and scan. + """ + + do_pan_and_scan: Optional[bool] + pan_and_scan_min_crop_size: Optional[int] + pan_and_scan_max_num_crops: Optional[int] + pan_and_scan_min_ratio_to_activate: Optional[float] + + class Gemma3ImageProcessor(BaseImageProcessor): r""" Constructs a SigLIP image processor. @@ -91,6 +110,7 @@ class Gemma3ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "num_crops"] + valid_kwargs = Gemma3ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/gemma3/image_processing_gemma3_fast.py b/src/transformers/models/gemma3/image_processing_gemma3_fast.py index c61152bc6b22..bfb58be2a8e1 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3_fast.py +++ b/src/transformers/models/gemma3/image_processing_gemma3_fast.py @@ -24,7 +24,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -35,29 +34,12 @@ auto_docstring, logging, ) +from .image_processing_gemma3 import Gemma3ImageProcessorKwargs logger = logging.get_logger(__name__) -class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_pan_and_scan (`bool`, *optional*): - Whether to apply `pan_and_scan` to images. 
- pan_and_scan_min_crop_size (`int`, *optional*): - Minimum size of each crop in pan and scan. - pan_and_scan_max_num_crops (`int`, *optional*): - Maximum number of crops per image in pan and scan. - pan_and_scan_min_ratio_to_activate (`float`, *optional*): - Minimum aspect ratio to activate pan and scan. - """ - - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] - - @auto_docstring class Gemma3ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -73,9 +55,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast): pan_and_scan_min_crop_size = None pan_and_scan_max_num_crops = None pan_and_scan_min_ratio_to_activate = None - valid_kwargs = Gemma3FastImageProcessorKwargs + valid_kwargs = Gemma3ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Gemma3ImageProcessorKwargs]): super().__init__(**kwargs) def pan_and_scan_batched( @@ -167,7 +149,7 @@ def _process_images_for_pan_and_scan( def preprocess( self, images: ImageInput, - **kwargs: Unpack[Gemma3FastImageProcessorKwargs], + **kwargs: Unpack[Gemma3ImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index 791c47833a4e..a9bac5b69e47 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -20,21 +20,12 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import to_py_obj -class Gemma3ImagesKwargs(ImagesKwargs): - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] - do_convert_rgb: Optional[bool] - - class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Gemma3ImagesKwargs _defaults = { "text_kwargs": { "padding": False, @@ -81,8 +72,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - videos=None, - audio=None, **kwargs: Unpack[Gemma3ProcessorKwargs], ) -> BatchFeature: if text is None and images is None: diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py index e2c2c3ae10f8..105b1983b7c7 100644 --- a/src/transformers/models/gemma3n/processing_gemma3n.py +++ b/src/transformers/models/gemma3n/processing_gemma3n.py @@ -19,21 +19,13 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -class Gemma3nImagesKwargs(ImagesKwargs): - do_convert_rgb: Optional[bool] - - class Gemma3nProcessorKwargs(ProcessingKwargs, total=False): - 
audio_kwargs: AudioKwargs - images_kwargs: Gemma3nImagesKwargs _defaults = { - "text_kwargs": { - "padding": False, - }, + "text_kwargs": {"padding": False}, } @@ -101,7 +93,6 @@ def __call__( images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, audio: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]] = None, - videos=None, **kwargs: Unpack[Gemma3nProcessorKwargs], ) -> BatchFeature: if text is None and images is None and audio is None: diff --git a/src/transformers/models/glm4v/image_processing_glm4v.py b/src/transformers/models/glm4v/image_processing_glm4v.py index e35699005116..13f4472e61f3 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v.py +++ b/src/transformers/models/glm4v/image_processing_glm4v.py @@ -39,6 +39,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging from ...video_utils import VideoInput @@ -46,6 +47,21 @@ logger = logging.get_logger(__name__) +class Glm4vImageProcessorKwargs(ImagesKwargs): + """ + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + def smart_resize( num_frames: int, height: int, @@ -120,6 +136,7 @@ class Glm4vImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_grid_thw"] + valid_kwargs = Glm4vImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/glm4v/image_processing_glm4v_fast.py b/src/transformers/models/glm4v/image_processing_glm4v_fast.py index 8cdf31a437ae..92e8a5df9137 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v_fast.py +++ b/src/transformers/models/glm4v/image_processing_glm4v_fast.py @@ -24,7 +24,6 @@ ) from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -41,27 +40,12 @@ auto_docstring, logging, ) -from .image_processing_glm4v import smart_resize +from .image_processing_glm4v import Glm4vImageProcessorKwargs, smart_resize logger = logging.get_logger(__name__) -class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - @auto_docstring class Glm4vImageProcessorFast(BaseImageProcessorFast): do_resize = True @@ -75,10 +59,10 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast): patch_size = 14 temporal_patch_size = 2 merge_size = 2 - valid_kwargs = Glm4vFastImageProcessorKwargs + valid_kwargs = Glm4vImageProcessorKwargs model_input_names = ["pixel_values", "image_grid_thw"] - def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Glm4vImageProcessorKwargs]): super().__init__(**kwargs) if self.size is not None and ( self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None @@ -205,7 +189,7 @@ def _preprocess( def preprocess( self, images: ImageInput, - **kwargs: Unpack[Glm4vFastImageProcessorKwargs], + **kwargs: Unpack[Glm4vImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index b8600713a5ed..ac2885a4a9f8 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -32,7 +32,7 @@ from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import rope_config_validation from ...modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging from ...utils.generic import check_model_inputs @@ -52,7 +52,6 @@ Qwen2_5_VLVisionAttention, Qwen2_5_VLVisionBlock, ) -from ..qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLVideosProcessorKwargs from ..qwen2_vl.processing_qwen2_vl import ( Qwen2_VLProcessor, Qwen2_VLProcessorKwargs, @@ -1508,19 +1507,7 @@ def _get_image_nums_and_video_nums( return image_counts, video_counts -class Glm4vVideosProcessorKwargs(Qwen2_5_VLVideosProcessorKwargs): - pass - - -class Glm4vImagesKwargs(ImagesKwargs): - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Glm4vProcessorKwargs(Qwen2_VLProcessorKwargs): - images_kwargs: Glm4vImagesKwargs - videos_kwargs: Glm4vVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index bd7d24e800f7..ad97a10efd73 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -24,7 +24,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging from ...video_utils import VideoInput @@ -33,18 +33,7 @@ logger = logging.get_logger(__name__) -class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] - - -class Glm4vImagesKwargs(ImagesKwargs): - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Glm4vProcessorKwargs(ProcessingKwargs, 
total=False): - images_kwargs: Glm4vImagesKwargs _defaults = { "text_kwargs": { "padding": False, @@ -53,7 +42,6 @@ class Glm4vProcessorKwargs(ProcessingKwargs, total=False): }, "videos_kwargs": {"return_metadata": True}, } - videos_kwargs: Glm4vVideosProcessorKwargs class Glm4vProcessor(ProcessorMixin): diff --git a/src/transformers/models/glm4v/video_processing_glm4v.py b/src/transformers/models/glm4v/video_processing_glm4v.py index 0986c414f1d3..8324ad482baa 100644 --- a/src/transformers/models/glm4v/video_processing_glm4v.py +++ b/src/transformers/models/glm4v/video_processing_glm4v.py @@ -37,12 +37,11 @@ class Glm4vVideoProcessorInitKwargs(VideosKwargs): - max_image_size: dict[str, int] = None - patch_size: Optional[int] = None - temporal_patch_size: Optional[int] = None - merge_size: Optional[int] = None - image_mean: Optional[list[float]] = None - image_std: Optional[list[float]] = None + max_image_size: Optional[dict[str, int]] + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + max_duration: Optional[int] @add_start_docstrings( diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py index 43bf8b520ffa..3424020c65b3 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py @@ -38,6 +38,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -48,6 +49,24 @@ logger = logging.get_logger(__name__) +class GotOcr2ImageProcessorKwargs(ImagesKwargs): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
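+
+    Example (illustrative sketch; `stepfun-ai/GOT-OCR-2.0-hf` is a placeholder checkpoint):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import AutoImageProcessor
+
+    >>> image = np.zeros((768, 1024, 3), dtype=np.uint8)
+    >>> processor = AutoImageProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+    >>> inputs = processor(image, crop_to_patches=True, max_patches=6, return_tensors="pt")
+    >>> inputs["pixel_values"].shape  # one row per extracted patch  # doctest: +SKIP
+    ```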
+ """ + + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + # Similar to image_processing_mllama.get_all_supported_aspect_ratios @lru_cache(maxsize=10) def get_all_supported_aspect_ratios(min_image_tiles: int, max_image_tiles: int) -> list[tuple[int, int]]: @@ -168,6 +187,7 @@ class GotOcr2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = GotOcr2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py index a47a1422a5dc..210a18a406be 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -32,25 +31,7 @@ TensorType, auto_docstring, ) -from .image_processing_got_ocr2 import get_optimal_tiled_canvas - - -class GotOcr2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - crop_to_patches (`bool`, *optional*, defaults to `False`): - Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the - `preprocess` method. - min_patches (`int`, *optional*, defaults to 1): - The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. - max_patches (`int`, *optional*, defaults to 12): - The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
- """ - - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] +from .image_processing_got_ocr2 import GotOcr2ImageProcessorKwargs, get_optimal_tiled_canvas @auto_docstring @@ -66,13 +47,13 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast): crop_to_patches = False min_patches = 1 max_patches = 12 - valid_kwargs = GotOcr2FastImageProcessorKwargs + valid_kwargs = GotOcr2ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def crop_image_to_patches( diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 35df3b5a3f05..447122e18c22 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -18,11 +18,10 @@ import numpy as np -from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack -from transformers.tokenization_utils_base import PreTokenizedInput, TextInput - from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_vision_available, logging @@ -37,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False): class GotOcr2ImagesKwargs(ImagesKwargs, total=False): + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]] color: Optional[str] num_image_tokens: Optional[int] multi_page: Optional[bool] - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False): @@ -136,8 +135,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[GotOcr2ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 84515d173c47..0b76ccfe75db 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -49,8 +49,6 @@ def __call__( text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], audio: Union["torch.Tensor", list["torch.Tensor"]] = None, device: str = "cpu", - images=None, - videos=None, **kwargs, ) -> BatchFeature: requires_backends(self, ["torch"]) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 737cf2e670ee..e4e17d4d8ddf 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -51,6 +51,7 
@@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( ExplicitEnum, TensorType, @@ -91,6 +92,29 @@ class AnnotationFormat(ExplicitEnum): SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class GroundingDinoImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -865,6 +889,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = GroundingDinoImageProcessorKwargs # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ def __init__( diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py index 744cb5f92923..ee303ec47fc4 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py @@ -4,6 +4,26 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_grounding_dino.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import pathlib from typing import TYPE_CHECKING, Any, Optional, Union @@ -14,7 +34,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -35,7 +54,7 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires -from .image_processing_grounding_dino import get_size_with_aspect_ratio +from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs, get_size_with_aspect_ratio if TYPE_CHECKING: @@ -44,24 +63,6 @@ logger = logging.get_logger(__name__) - -class GroundingDinoFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. - """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -304,9 +305,9 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = GroundingDinoFastImageProcessorKwargs + valid_kwargs = GroundingDinoImageProcessorKwargs - def __init__(self, **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[GroundingDinoImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -568,25 +569,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs], + **kwargs: Unpack[GroundingDinoImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. 
- masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -601,7 +585,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/grounding_dino/modular_grounding_dino.py b/src/transformers/models/grounding_dino/modular_grounding_dino.py index a7b9c570e7b0..ded6435508a5 100644 --- a/src/transformers/models/grounding_dino/modular_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modular_grounding_dino.py @@ -1,3 +1,23 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Optional, Union import torch diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index ea0e288f3eec..5f2f900451b2 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,12 @@ Processor class for Grounding DINO. 
""" -import pathlib import warnings from typing import TYPE_CHECKING, Optional, Union from ...image_transforms import center_to_corners_format -from ...image_utils import AnnotationFormat, ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import TensorType, is_torch_available @@ -99,16 +98,7 @@ def get(self, key, *args, **kwargs): return super().get(key, *args, **kwargs) -class GroundingDinoImagesKwargs(ImagesKwargs, total=False): - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] - return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] - do_convert_annotations: Optional[bool] - format: Optional[Union[str, AnnotationFormat]] - - class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/models/idefics/image_processing_idefics.py b/src/transformers/models/idefics/image_processing_idefics.py index 6ef5b39afeeb..7fda46e3a990 100644 --- a/src/transformers/models/idefics/image_processing_idefics.py +++ b/src/transformers/models/idefics/image_processing_idefics.py @@ -28,6 +28,7 @@ to_numpy_array, valid_images, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_torch_available @@ -35,6 +36,20 @@ IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711] +class IdeficsImageProcessorKwargs(ImagesKwargs): + """ + transform (`Callable`, *optional*): + A custom transform function that accepts a single image can be passed for training. For example, + `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is + assumed - and then a preset of inference-specific transforms will be applied to the images + image_size (`dict[str, int]`, *optional*): + Resize to image size + """ + + transform: Optional[Callable] + image_size: Optional[dict[str, int]] + + def convert_to_rgb(image): # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background # for transparent images. The call to `alpha_composite` handles this case @@ -74,6 +89,7 @@ class IdeficsImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = IdeficsImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 5ab7e480c8ea..4b5ccaffe5c8 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -16,13 +16,12 @@ Processor class for IDEFICS. 
""" -from typing import Callable, Optional, Union +from typing import Optional, Union from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ( - ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, @@ -40,13 +39,6 @@ IMAGE_TOKEN = "" -class IdeficsImagesKwargs(ImagesKwargs, total=False): - transform: Optional[Callable] - image_size: Optional[dict[str, int]] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] - - class IdeficsTextKwargs(TextKwargs, total=False): add_eos_token: Optional[bool] add_end_of_utterance_token: Optional[bool] @@ -54,14 +46,12 @@ class IdeficsTextKwargs(TextKwargs, total=False): class IdeficsProcessorKwargs(ProcessingKwargs, total=False): text_kwargs: IdeficsTextKwargs - images_kwargs: IdeficsImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": False, "padding": "longest", "add_eos_token": False, }, - "images_kwargs": {}, "common_kwargs": {"return_tensors": "pt"}, } @@ -198,8 +188,6 @@ def __call__( list[list[TextInput]], list[list[PreTokenizedInput]], ] = None, - audio=None, - videos=None, **kwargs: Unpack[IdeficsProcessorKwargs], ) -> BatchFeature: """This method takes batched or non-batched prompts made of text and images and converts them into prompts that diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index 15a04a887e87..b9b741a9704b 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -35,6 +35,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -46,6 +47,15 @@ from PIL import Image +class Idefics2ImageProcessorKwargs(ImagesKwargs): + """ + do_image_splitting (`bool`, *optional*, defaults to `False`): + Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. + """ + + do_image_splitting: Optional[bool] + + def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]: """ Get the output size of the image after resizing given a dictionary specifying the max and min sizes. 
@@ -186,6 +196,7 @@ class Idefics2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_attention_mask"] + valid_kwargs = Idefics2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/idefics2/image_processing_idefics2_fast.py b/src/transformers/models/idefics2/image_processing_idefics2_fast.py index 5348bda389ed..36ae6ea5fbc7 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2_fast.py +++ b/src/transformers/models/idefics2/image_processing_idefics2_fast.py @@ -21,7 +21,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, SizeDict, group_images_by_shape, reorder_images, @@ -35,7 +34,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torchvision_available, logging -from .image_processing_idefics2 import convert_to_rgb +from .image_processing_idefics2 import Idefics2ImageProcessorKwargs, convert_to_rgb if is_torchvision_available(): @@ -105,15 +104,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor return mask -class Idefics2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_image_splitting (`bool`, *optional*, defaults to `False`): - Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. - """ - - do_image_splitting: Optional[bool] - - @auto_docstring class Idefics2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -127,7 +117,7 @@ class Idefics2ImageProcessorFast(BaseImageProcessorFast): do_image_splitting = False size = {"shortest_edge": 378, "longest_edge": 980} model_input_names = ["pixel_values", "pixel_attention_mask"] - valid_kwargs = Idefics2FastImageProcessorKwargs + valid_kwargs = Idefics2ImageProcessorKwargs def convert_to_rgb(self, image: ImageInput) -> ImageInput: """ @@ -214,7 +204,7 @@ def pad( return image, pixel_mask @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 550ca8774095..c419a3641254 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -22,7 +22,6 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image from ...processing_utils import ( - ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, @@ -46,20 +45,13 @@ def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) -class Idefics2ImagesKwargs(ImagesKwargs, total=False): - image_seq_len: Optional[int] - - class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Idefics2ImagesKwargs - _defaults = { "text_kwargs": { "add_special_tokens": True, "padding": False, "is_split_into_words": False, }, - "images_kwargs": {}, } @@ -123,8 +115,6 @@ def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None, - audio=None, - videos=None, **kwargs: Unpack[Idefics2ProcessorKwargs], ) -> BatchFeature: """ @@ -181,8 +171,6 @@ 
def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None) - image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) n_images_in_text = [] @@ -197,12 +185,11 @@ def __call__( # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` fake_image_token = self.fake_image_token image_token = self.image_token - image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}" + image_str = f"{fake_image_token}{image_token * self.image_seq_len}{fake_image_token}" if self.image_processor.do_image_splitting: # A single image token is split into 4 patches + 1 original image image_str = image_str * 5 - image_seq_len *= 5 prompt_strings = [] for sample in text: diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index c7526f30993a..f098a9f54dc1 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -35,6 +35,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -47,6 +48,22 @@ from PIL import Image +class Idefics3ImageProcessorKwargs(ImagesKwargs): + """ + do_image_splitting (`bool`, *optional*, defaults to `True`): + Whether to split the image into sub-images concatenated with the original image. They are split into patches + such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. + max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): + Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". + return_row_col_info (`bool`, *optional*, defaults to `False`): + Whether to return the row and column information of the images. 
+ """ + + do_image_splitting: Optional[bool] + max_image_size: Optional[dict[str, int]] + return_row_col_info: Optional[bool] + + def _resize_output_size_rescale_to_max_len( height: int, width: int, min_len: Optional[int] = 1, max_len: Optional[int] = None ) -> tuple[int, int]: @@ -291,6 +308,7 @@ class Idefics3ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_attention_mask"] + valid_kwargs = Idefics3ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/idefics3/image_processing_idefics3_fast.py b/src/transformers/models/idefics3/image_processing_idefics3_fast.py index 5b0c0e6180f9..2f325f77931a 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3_fast.py +++ b/src/transformers/models/idefics3/image_processing_idefics3_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, SizeDict, group_images_by_shape, reorder_images, @@ -36,6 +35,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torchvision_available, logging +from .image_processing_idefics3 import Idefics3ImageProcessorKwargs if is_torchvision_available(): @@ -169,22 +169,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor return mask -class Idefics3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_image_splitting (`bool`, *optional*, defaults to `True`): - Whether to split the image into sub-images concatenated with the original image. They are split into patches - such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. - max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): - Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". - return_row_col_info (`bool`, *optional*, defaults to `False`): - Whether to return the row and column information of the images. 
- """ - - do_image_splitting: Optional[bool] - max_image_size: Optional[dict[str, int]] - return_row_col_info: Optional[bool] - - @auto_docstring class Idefics3ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.LANCZOS @@ -199,7 +183,7 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast): do_image_splitting = True do_pad = True return_row_col_info = False - valid_kwargs = Idefics3FastImageProcessorKwargs + valid_kwargs = Idefics3ImageProcessorKwargs def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput: """ @@ -367,7 +351,7 @@ def pad( return image, pixel_mask @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 00ee8df6d414..451af1d8a38f 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -24,7 +24,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput from ...utils import logging @@ -87,14 +87,7 @@ def get_image_prompt_string( ) -class Idefics3ImagesKwargs(ImagesKwargs, total=False): - return_row_col_info: Optional[bool] - max_image_size: Optional[dict[str, int]] - - class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Idefics3ImagesKwargs - _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -179,8 +172,6 @@ def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None, - audio=None, - videos=None, image_seq_len: Optional[int] = None, **kwargs: Unpack[Idefics3ProcessorKwargs], ) -> BatchEncoding: diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index ee8fe04771b7..8f79cd58ec5f 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -31,17 +31,34 @@ valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging +from ...processing_utils import ImagesKwargs +from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, is_vision_available, logging from ...utils.import_utils import requires if is_vision_available(): import PIL +if is_torch_available(): + import torch logger = logging.get_logger(__name__) +class ImageGPTImageProcessorKwargs(ImagesKwargs): + """ + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. 
+ do_color_quantize (`bool`, *optional*, defaults to `True`): + Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices. + When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling. + """ + + clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]] + do_color_quantize: Optional[bool] + + def squared_euclidean_distance(a, b): b = b.T a2 = np.sum(np.square(a), axis=1) @@ -83,6 +100,7 @@ class ImageGPTImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = ImageGPTImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 7a6bcc53ae1a..1be050b5ecf9 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -23,7 +23,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, ) from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import PILImageResampling @@ -32,6 +31,7 @@ TensorType, auto_docstring, ) +from .image_processing_imagegpt import ImageGPTImageProcessorKwargs def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: @@ -68,20 +68,6 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso return torch.argmin(d, dim=1) -class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): - The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` - in `preprocess`. - do_color_quantize (`bool`, *optional*, defaults to `True`): - Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices. - When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling. 
- """ - - clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]] - do_color_quantize: Optional[bool] - - @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): model_input_names = ["input_ids"] @@ -92,12 +78,12 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): image_std = [0.5, 0.5, 0.5] do_rescale = True do_normalize = True - valid_kwargs = ImageGPTFastImageProcessorKwargs + valid_kwargs = ImageGPTImageProcessorKwargs def __init__( self, clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility - **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], + **kwargs: Unpack[ImageGPTImageProcessorKwargs], ): r""" clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index 122fc11622ff..afe43c1fc7a7 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -43,7 +43,6 @@ class InstructBlipProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": {}, } @@ -85,8 +84,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[InstructBlipProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/internvl/processing_internvl.py b/src/transformers/models/internvl/processing_internvl.py index f68d48502b6c..12e0d395b05c 100644 --- a/src/transformers/models/internvl/processing_internvl.py +++ b/src/transformers/models/internvl/processing_internvl.py @@ -19,19 +19,12 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...video_utils import VideoInput -class InternVLImagesKwargs(ImagesKwargs, total=False): - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - - class InternVLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: InternVLImagesKwargs _defaults = { "text_kwargs": { "padding_side": "left", @@ -159,7 +152,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[InternVLProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/models/internvl/video_processing_internvl.py b/src/transformers/models/internvl/video_processing_internvl.py index 96d7d3067f73..d16c57522d1c 100644 --- a/src/transformers/models/internvl/video_processing_internvl.py +++ b/src/transformers/models/internvl/video_processing_internvl.py @@ -43,7 +43,6 @@ class InternVLVideoProcessor(BaseVideoProcessor): initial_shift = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models valid_kwargs = InternVLVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] def __init__(self, **kwargs: Unpack[InternVLVideoProcessorInitKwargs]): super().__init__(**kwargs) 
diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index a56cae7ba975..06ea0fe0e4d1 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -50,6 +51,16 @@ logger = logging.get_logger(__name__) +class JanusImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + class JanusImageProcessor(BaseImageProcessor): r""" Constructs a JANUS image processor. @@ -92,6 +103,8 @@ class JanusImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] + valid_kwargs = JanusImageProcessorKwargs + def __init__( self, do_resize: bool = True, diff --git a/src/transformers/models/janus/image_processing_janus_fast.py b/src/transformers/models/janus/image_processing_janus_fast.py index 6cbca591626e..4de23e80e63a 100644 --- a/src/transformers/models/janus/image_processing_janus_fast.py +++ b/src/transformers/models/janus/image_processing_janus_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -38,16 +37,7 @@ TensorType, auto_docstring, ) - - -class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - """ - - min_size: int +from .image_processing_janus import JanusImageProcessorKwargs @auto_docstring @@ -61,9 +51,9 @@ class JanusImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_pad = True - valid_kwargs = JanusFastImageProcessorKwargs + valid_kwargs = JanusImageProcessorKwargs - def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[JanusImageProcessorKwargs]): if kwargs.get("image_mean") is None: background_color = (127, 127, 127) else: diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index a8e24a86b0d3..332dc689dc62 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -47,7 +47,7 @@ ) from ...modeling_outputs import ModelOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import ( TensorType, TransformersKwargs, @@ -1289,6 +1289,16 @@ def generate( return generated_tokens +class JanusImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + class JanusImageProcessor(BlipImageProcessor): r""" Constructs a JANUS image processor. @@ -1329,6 +1339,8 @@ class JanusImageProcessor(BlipImageProcessor): Whether to pad the image to square or not. 
""" + valid_kwargs = JanusImageProcessorKwargs + def __init__( self, do_resize: bool = True, diff --git a/src/transformers/models/janus/processing_janus.py b/src/transformers/models/janus/processing_janus.py index c2413e705756..15c237c4ced4 100644 --- a/src/transformers/models/janus/processing_janus.py +++ b/src/transformers/models/janus/processing_janus.py @@ -81,8 +81,6 @@ def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, images: Optional[ImageInput] = None, - videos=None, - audio=None, **kwargs: Unpack[JanusProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 58b3dff1e07a..98f8925e8a69 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -136,8 +136,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, list[TextInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Kosmos2ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py index b768205da2a4..5f337e4b04c9 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py @@ -34,6 +34,7 @@ to_numpy_array, valid_images, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_torch_available, logging from ...utils.import_utils import requires_backends @@ -45,6 +46,19 @@ DEFAULT_FONT_PATH = "ybelkada/fonts" +class Kosmos2_5ImageProcessorKwargs(ImagesKwargs): + r""" + patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`): + The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16. + max_patches (`int`, *optional*, defaults to 4096): + The maximum number of patches to extract from the image as per the + [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419). + """ + + patch_size: Optional[dict[str, int]] + max_patches: Optional[int] + + # Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches def torch_extract_patches(image_tensor, patch_height, patch_width): """ @@ -92,6 +106,7 @@ class Kosmos2_5ImageProcessor(BaseImageProcessor): """ model_input_names = ["flattened_patches"] + valid_kwargs = Kosmos2_5ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py index b7adeb2c86c2..d892436ea652 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py @@ -22,13 +22,13 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) from ...image_utils import ChannelDimension, ImageInput, get_image_size from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring +from .image_processing_kosmos2_5 import Kosmos2_5ImageProcessorKwargs # Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly. 
@@ -56,19 +56,6 @@ def torch_extract_patches(image_tensor, patch_height, patch_width): return patches -class Kosmos2_5FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`): - The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16. - max_patches (`int`, *optional*, defaults to 4096): - The maximum number of patches to extract from the image as per the - [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419). - """ - - patch_size: Optional[dict[str, int]] - max_patches: Optional[int] - - @auto_docstring class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast): # To be checked against the slow image processor @@ -78,13 +65,13 @@ class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast): patch_size = {"height": 16, "width": 16} max_patches = 4096 rescale_factor = None - valid_kwargs = Kosmos2_5FastImageProcessorKwargs + valid_kwargs = Kosmos2_5ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]) -> BatchFeature: r""" patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`): The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16. diff --git a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py index 0e3c70c80234..cb6f27777a0f 100644 --- a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py @@ -20,7 +20,7 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import TextInput from ...utils import is_torch_available @@ -29,14 +29,7 @@ import torch -class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False): - max_patches: Optional[int] - num_image_tokens: Optional[int] - - class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False): - text_kwargs: TextKwargs - images_kwargs: Kosmos2_5ImagesKwargs _defaults = { "text_kwargs": { "padding": True, @@ -46,7 +39,6 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False): }, "images_kwargs": { "max_patches": 4096, - "num_image_tokens": 2048, }, "common_kwargs": {"return_tensors": "pt"}, } @@ -65,24 +57,25 @@ class Kosmos2_5Processor(ProcessorMixin): An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input. tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]): An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input. + num_image_tokens (`int`, *optional*, defaults to 2048): + Number of image tokens used as a placeholder. 
""" attributes = ["image_processor", "tokenizer"] image_processor_class = "AutoImageProcessor" tokenizer_class = "PreTrainedTokenizerFast" - def __init__(self, image_processor, tokenizer): + def __init__(self, image_processor, tokenizer, num_image_tokens: int = 2048): self.image_start_token = tokenizer.boi_token # "" : fixed token for the start of image self.image_end_token = tokenizer.eoi_token # "" : fixed token for the end of image self.image_token = tokenizer.image_token # "" : within a ... pair, these tokens indicate they are positions reserved for an image + self.num_image_tokens = num_image_tokens super().__init__(image_processor, tokenizer) def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, list[TextInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Kosmos2_5ProcessorKwargs], ) -> BatchFeature: """ @@ -104,8 +97,6 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None) - encoding = BatchFeature() if images is not None: @@ -114,7 +105,7 @@ def __call__( image_encoding.pop("cols") encoding.update(image_encoding) - prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * num_image_tokens}{self.image_end_token}" + prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * self.num_image_tokens}{self.image_end_token}" if text is not None: if isinstance(text, str): @@ -124,7 +115,7 @@ def __call__( input = self.tokenizer(text, **output_kwargs["text_kwargs"]) batch_size, seq_len = input.input_ids.shape - image_embeds_position_mask = [0, -1] + [1] * num_image_tokens + [-1] + image_embeds_position_mask = [0, -1] + [1] * self.num_image_tokens + [-1] image_embeds_position_mask += [0] * (seq_len - len(image_embeds_position_mask)) image_embeds_position_mask = ( torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1) diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index 76fc752bbeea..d5a7e95537c5 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -30,6 +30,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -51,6 +52,25 @@ logger = logging.get_logger(__name__) +class LayoutLMv2ImageProcessorKwargs(ImagesKwargs): + r""" + apply_ocr (`bool`, *optional*, defaults to `True`): + Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by + the `apply_ocr` parameter in the `preprocess` method. + ocr_lang (`str`, *optional*): + The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is + used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. + tesseract_config (`str`, *optional*): + Any additional custom configuration flags that are forwarded to the `config` parameter when calling + Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the + `preprocess` method. 
+ """ + + apply_ocr: Optional[bool] + ocr_lang: Optional[str] + tesseract_config: Optional[str] + + def normalize_box(box, width, height): return [ int(1000 * (box[0] / width)), @@ -125,6 +145,7 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = LayoutLMv2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py index 354bbe21c4db..2d6e6bc21cb3 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images from ...image_utils import ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack @@ -29,32 +29,12 @@ logging, requires_backends, ) -from .image_processing_layoutlmv2 import apply_tesseract +from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessorKwargs, apply_tesseract logger = logging.get_logger(__name__) -class LayoutLMv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - apply_ocr (`bool`, *optional*, defaults to `True`): - Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by - the `apply_ocr` parameter in the `preprocess` method. - ocr_lang (`str`, *optional*): - The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is - used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. - tesseract_config (`str`, *optional*): - Any additional custom configuration flags that are forwarded to the `config` parameter when calling - Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the - `preprocess` method. 
- """ - - apply_ocr: Optional[bool] - ocr_lang: Optional[str] - tesseract_config: Optional[str] - - @auto_docstring class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -64,13 +44,13 @@ class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast): apply_ocr = True ocr_lang = None tesseract_config = "" - valid_kwargs = LayoutLMv2FastImageProcessorKwargs + valid_kwargs = LayoutLMv2ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py index 0ce7f5ce6968..b9273dc75cad 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -34,6 +34,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -55,6 +56,25 @@ logger = logging.get_logger(__name__) +class LayoutLMv3ImageProcessorKwargs(ImagesKwargs): + r""" + apply_ocr (`bool`, *optional*, defaults to `True`): + Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by + the `apply_ocr` parameter in the `preprocess` method. + ocr_lang (`str`, *optional*): + The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is + used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. + tesseract_config (`str`, *optional*): + Any additional custom configuration flags that are forwarded to the `config` parameter when calling + Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the + `preprocess` method. 
+ """ + + apply_ocr: Optional[bool] + ocr_lang: Optional[str] + tesseract_config: Optional[str] + + def normalize_box(box, width, height): return [ int(1000 * (box[0] / width)), @@ -143,6 +163,7 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = LayoutLMv3ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py index caefa9b89660..b614c5ec9449 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack @@ -29,32 +29,12 @@ logging, requires_backends, ) -from .image_processing_layoutlmv3 import apply_tesseract +from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessorKwargs, apply_tesseract logger = logging.get_logger(__name__) -class LayoutLMv3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - apply_ocr (`bool`, *optional*, defaults to `True`): - Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by - the `apply_ocr` parameter in the `preprocess` method. - ocr_lang (`str`, *optional*): - The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is - used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. - tesseract_config (`str`, *optional*): - Any additional custom configuration flags that are forwarded to the `config` parameter when calling - Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the - `preprocess` method. 
- """ - - apply_ocr: Optional[bool] - ocr_lang: Optional[str] - tesseract_config: Optional[str] - - @auto_docstring class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -67,13 +47,13 @@ class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast): apply_ocr = True ocr_lang = None tesseract_config = "" - valid_kwargs = LayoutLMv3FastImageProcessorKwargs + valid_kwargs = LayoutLMv3ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py index 4081c86e108a..ad99504fcad6 100755 --- a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +++ b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -33,9 +32,7 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ( - Unpack, -) +from ...processing_utils import ImagesKwargs, Unpack from ...utils import ( TensorType, auto_docstring, @@ -172,7 +169,7 @@ def pad_along_first_dim( return images, pixel_mask -class Lfm2VlFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Lfm2VlImageProcessorKwargs(ImagesKwargs): """ downsample_factor (`int`, *optional*, defaults to `2`): The downsampling factor for images used when resizing the image. 
@@ -214,10 +211,10 @@ class Lfm2VlImageProcessorFast(BaseImageProcessorFast): return_row_col_info = False image_mean = IMAGENET_STANDARD_STD image_std = IMAGENET_STANDARD_MEAN - valid_kwargs = Lfm2VlFastImageProcessorKwargs + valid_kwargs = Lfm2VlImageProcessorKwargs model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"] - def __init__(self, **kwargs: Unpack[Lfm2VlFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Lfm2VlImageProcessorKwargs]): super().__init__(**kwargs) max_thumbnail_image_patches = self.max_image_tokens * self.downsample_factor**2 diff --git a/src/transformers/models/llama4/image_processing_llama4_fast.py b/src/transformers/models/llama4/image_processing_llama4_fast.py index 6506d5749d94..e2678f556d02 100644 --- a/src/transformers/models/llama4/image_processing_llama4_fast.py +++ b/src/transformers/models/llama4/image_processing_llama4_fast.py @@ -25,12 +25,11 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) from ...image_utils import ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import ( TensorType, auto_docstring, @@ -309,8 +308,8 @@ def get_best_fit( return optimal_canvas -class Llama4ImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ +class Llama4ImageProcessorKwargs(ImagesKwargs): + r""" max_patches (`int`, *optional*, defaults to 16): The maximum number of patches to be extracted from the image. Can be overridden by the `max_patches` parameter in the `preprocess` method. diff --git a/src/transformers/models/llama4/processing_llama4.py b/src/transformers/models/llama4/processing_llama4.py index 47a0b4cd99fb..df371bdfd710 100644 --- a/src/transformers/models/llama4/processing_llama4.py +++ b/src/transformers/models/llama4/processing_llama4.py @@ -16,20 +16,14 @@ from typing import Optional, Union -from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput, make_flat_list_of_images -class Llama4ImagesKwargs(ImagesKwargs, total=False): - max_patches: Optional[int] - resize_to_max_canvas: Optional[bool] - - class Llama4ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Llama4ImagesKwargs _defaults = { "text_kwargs": { "padding_side": "left", @@ -139,8 +133,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[Llama4ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/llava/image_processing_llava_fast.py b/src/transformers/models/llava/image_processing_llava_fast.py index 596070040549..66ccb49c3671 100644 --- a/src/transformers/models/llava/image_processing_llava_fast.py +++ b/src/transformers/models/llava/image_processing_llava_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -30,21 +29,16 @@ OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, 
ChannelDimension, - ImageInput, PILImageResampling, SizeDict, get_image_size, ) -from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, ) -class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... - - @auto_docstring class LlavaImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -59,14 +53,6 @@ class LlavaImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True - valid_kwargs = LlavaFastImageProcessorKwargs - - def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None: - super().__init__(**kwargs) - - @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature: - return super().preprocess(images, **kwargs) def pad_to_square( self, diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 398bd9d8d065..6f8d9e3a14cc 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -38,7 +38,6 @@ class LlavaProcessorKwargs(ProcessingKwargs, total=False): _defaults = { "text_kwargs": {"padding": False, "return_mm_token_type_ids": False}, - "images_kwargs": {}, } @@ -94,8 +93,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[LlavaProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 60d36abb748b..56ebc10f391d 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -48,6 +48,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -58,6 +59,17 @@ from PIL import Image +class LlavaNextImageProcessorKwargs(ImagesKwargs): + r""" + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. + """ + + image_grid_pinpoints: Optional[list[list[int]]] + + def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: """ Divides an image into patches of a specified size. 
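The `Unpack[...]` annotations used throughout these hunks are typing metadata only; at runtime the kwargs still arrive as an ordinary dict. A self-contained sketch of the mechanism, using stand-in TypedDicts rather than the real `ImagesKwargs` (names ending in `Sketch` are invented for illustration):

# Standalone sketch of the Unpack/TypedDict mechanism; runs on Python 3.9+ with typing_extensions.
from typing import Optional

from typing_extensions import TypedDict, Unpack


class ImagesKwargsSketch(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


class LlavaNextKwargsSketch(ImagesKwargsSketch, total=False):
    image_grid_pinpoints: Optional[list[list[int]]]


def preprocess(**kwargs: Unpack[LlavaNextKwargsSketch]) -> None:
    # A type checker now treats `image_grid_pinpoints` as a known keyword parameter here.
    print(sorted(kwargs))


preprocess(do_resize=True, image_grid_pinpoints=[[336, 672], [672, 336]])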
@@ -152,6 +164,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_sizes"] + valid_kwargs = LlavaNextImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/llava_next/image_processing_llava_next_fast.py b/src/transformers/models/llava_next/image_processing_llava_next_fast.py index df20e2b90e83..19d6fb941e7b 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next_fast.py +++ b/src/transformers/models/llava_next/image_processing_llava_next_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, divide_to_patches, group_images_by_shape, reorder_images, @@ -41,17 +40,7 @@ TensorType, auto_docstring, ) - - -class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: Optional[list[list[int]]] +from .image_processing_llava_next import LlavaNextImageProcessorKwargs @auto_docstring @@ -71,13 +60,13 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = True do_pad = True image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] - valid_kwargs = LlavaNextFastImageProcessorKwargs + valid_kwargs = LlavaNextImageProcessorKwargs - def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[LlavaNextImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _resize_for_patching( diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 2574fc443519..04493518a020 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -104,8 +104,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[LlavaNextProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index e858a1784254..015e4cdea6df 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -116,7 +116,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[LlavaNextVideoProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/models/llava_next_video/video_processing_llava_next_video.py b/src/transformers/models/llava_next_video/video_processing_llava_next_video.py 
index 80ed1e5b81b9..c353100cee79 100644 --- a/src/transformers/models/llava_next_video/video_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/video_processing_llava_next_video.py @@ -15,13 +15,9 @@ """Video processor class for LLaVa-NeXT-Video.""" from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling -from ...processing_utils import Unpack, VideosKwargs from ...video_processing_utils import BaseVideoProcessor -class LlavaNextVideoFastVideoProcessorInitKwargs(VideosKwargs): ... - - class LlavaNextVideoVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -35,11 +31,6 @@ class LlavaNextVideoVideoProcessor(BaseVideoProcessor): do_normalize = True do_convert_rgb = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models - valid_kwargs = LlavaNextVideoFastVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - - def __init__(self, **kwargs: Unpack[LlavaNextVideoFastVideoProcessorInitKwargs]): - super().__init__(**kwargs) __all__ = ["LlavaNextVideoVideoProcessor"] diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index b679ac269747..119df9550a2a 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -47,6 +47,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -57,6 +58,17 @@ from PIL import Image +class LlavaOnevisionImageProcessorKwargs(ImagesKwargs): + r""" + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. + """ + + image_grid_pinpoints: Optional[list[list[int]]] + + # Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: """ @@ -146,6 +158,7 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_sizes", "batch_num_images"] + valid_kwargs = LlavaOnevisionImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py index 11872cb67bf3..b309583461ce 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py @@ -27,7 +27,6 @@ from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, divide_to_patches, group_images_by_shape, reorder_images, @@ -43,17 +42,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. 
The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: Optional[list[list[int]]] +from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs @auto_docstring @@ -71,14 +60,14 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = True do_pad = True image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]] # fmt: skip - valid_kwargs = LlavaOnevisionFastImageProcessorKwargs + valid_kwargs = LlavaOnevisionImageProcessorKwargs model_input_names = ["pixel_values", "image_sizes", "batch_num_images"] - def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]) -> BatchFeature: if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)): # if the first element is a list, we assume that all elements are lists batch_num_images = [len(x) for x in images] diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index b4f64dee8e04..890fcdd7ecaa 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -35,7 +35,7 @@ from ...cache_utils import Cache from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images +from ...image_processing_utils_fast import group_images_by_shape, reorder_images from ...image_utils import ( OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, @@ -53,22 +53,12 @@ can_return_tuple, logging, ) +from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs logger = logging.get_logger(__name__) -class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. 
- """ - - image_grid_pinpoints: Optional[list[list[int]]] - - class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -129,7 +119,7 @@ def pad_to_square( return padded_images @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]) -> BatchFeature: if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)): # if the first element is a list, we assume that all elements are lists batch_num_images = [len(x) for x in images] diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index 9fadc6af3067..ff8eae5dd87a 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -42,7 +42,6 @@ class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False): "return_mm_token_type_ids": False, }, "image_kwargs": {}, - "videos_kwargs": {}, } @@ -114,7 +113,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[LlavaOnevisionProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/models/llava_onevision/video_processing_llava_onevision.py b/src/transformers/models/llava_onevision/video_processing_llava_onevision.py index ddae0fcd3b6f..b201085f5ee6 100644 --- a/src/transformers/models/llava_onevision/video_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/video_processing_llava_onevision.py @@ -15,13 +15,9 @@ """Video processor class for LLaVa-Onevision.""" from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling -from ...processing_utils import Unpack, VideosKwargs from ...video_processing_utils import BaseVideoProcessor -class LlavaOnevisionFastVideoProcessorInitKwargs(VideosKwargs): ... - - class LlavaOnevisionVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -36,11 +32,6 @@ class LlavaOnevisionVideoProcessor(BaseVideoProcessor): do_normalize = True do_convert_rgb = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models - valid_kwargs = LlavaOnevisionFastVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - - def __init__(self, **kwargs: Unpack[LlavaOnevisionFastVideoProcessorInitKwargs]): - super().__init__(**kwargs) __all__ = ["LlavaOnevisionVideoProcessor"] diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 752a3221d17f..d9d580955fbd 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -41,6 +41,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -60,6 +61,25 @@ from torch import nn +class Mask2FormerImageProcessorKwargs(ImagesKwargs): + r""" + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. 
If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). + The background label will be replaced by `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. + """ + + size_divisor: Optional[int] + ignore_index: Optional[int] + do_reduce_labels: Optional[bool] + num_labels: Optional[int] + + # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -440,6 +460,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = Mask2FormerImageProcessorKwargs @filter_out_non_signature_kwargs(extra=["max_size", *INIT_SERVICE_KWARGS]) def __init__( diff --git a/src/transformers/models/mask2former/image_processing_mask2former_fast.py b/src/transformers/models/mask2former/image_processing_mask2former_fast.py index 58dbb09d6319..6b8cd184581b 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former_fast.py +++ b/src/transformers/models/mask2former/image_processing_mask2former_fast.py @@ -28,7 +28,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -45,6 +44,7 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from .image_processing_mask2former import ( + Mask2FormerImageProcessorKwargs, compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, @@ -55,28 +55,6 @@ logger = logging.get_logger(__name__) -class Mask2FormerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - size_divisor (`int`, *optional*, defaults to 32): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. 
- """ - - size_divisor: Optional[int] - ignore_index: Optional[int] - do_reduce_labels: Optional[bool] - num_labels: Optional[int] - - def convert_segmentation_map_to_binary_masks_fast( segmentation_map: "torch.Tensor", instance_id_to_semantic_id: Optional[dict[int, int]] = None, @@ -127,9 +105,9 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast): model_input_names = ["pixel_values", "pixel_mask"] size_divisor = 32 do_reduce_labels = False - valid_kwargs = Mask2FormerFastImageProcessorKwargs + valid_kwargs = Mask2FormerImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Mask2FormerFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[Mask2FormerImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -259,7 +237,7 @@ def preprocess( images: ImageInput, segmentation_maps: Optional[ImageInput] = None, instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, - **kwargs: Unpack[Mask2FormerFastImageProcessorKwargs], + **kwargs: Unpack[Mask2FormerImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -282,7 +260,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[Mask2FormerFastImageProcessorKwargs], + **kwargs: Unpack[Mask2FormerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index c306093e116d..8353856fb868 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -42,6 +42,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -66,6 +67,25 @@ from torch import nn +class MaskFormerImageProcessorKwargs(ImagesKwargs): + r""" + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). + The background label will be replaced by `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. 
+ """ + + size_divisor: Optional[int] + ignore_index: Optional[int] + do_reduce_labels: Optional[bool] + num_labels: Optional[int] + + # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -446,6 +466,7 @@ class MaskFormerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = MaskFormerImageProcessorKwargs @filter_out_non_signature_kwargs(extra=["max_size", *INIT_SERVICE_KWARGS]) def __init__( diff --git a/src/transformers/models/maskformer/image_processing_maskformer_fast.py b/src/transformers/models/maskformer/image_processing_maskformer_fast.py index 9e15486cfa35..d174b4ada0a9 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer_fast.py +++ b/src/transformers/models/maskformer/image_processing_maskformer_fast.py @@ -25,7 +25,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -46,6 +45,7 @@ logging, ) from .image_processing_maskformer import ( + MaskFormerImageProcessorKwargs, compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, @@ -95,28 +95,6 @@ def convert_segmentation_map_to_binary_masks_fast( return binary_masks.float(), labels.long() -class MaskFormerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - size_divisor (`int`, *optional*, defaults to 32): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. 
- """ - - size_divisor: Optional[int] - ignore_index: Optional[int] - do_reduce_labels: Optional[bool] - num_labels: Optional[int] - - @auto_docstring class MaskFormerImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -132,9 +110,9 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast): model_input_names = ["pixel_values", "pixel_mask"] size_divisor = 32 do_reduce_labels = False - valid_kwargs = MaskFormerFastImageProcessorKwargs + valid_kwargs = MaskFormerImageProcessorKwargs - def __init__(self, **kwargs: Unpack[MaskFormerFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[MaskFormerImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -264,7 +242,7 @@ def preprocess( images: ImageInput, segmentation_maps: Optional[ImageInput] = None, instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, - **kwargs: Unpack[MaskFormerFastImageProcessorKwargs], + **kwargs: Unpack[MaskFormerImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -287,7 +265,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[MaskFormerFastImageProcessorKwargs], + **kwargs: Unpack[MaskFormerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/mllama/image_processing_mllama.py b/src/transformers/models/mllama/image_processing_mllama.py index 7578ac698db7..cd79f7de3121 100644 --- a/src/transformers/models/mllama/image_processing_mllama.py +++ b/src/transformers/models/mllama/image_processing_mllama.py @@ -38,6 +38,7 @@ to_numpy_array, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging @@ -49,6 +50,15 @@ logger = logging.get_logger(__name__) +class MllamaImageProcessorKwargs(ImagesKwargs): + """ + max_image_tiles (`int`, *optional*): + The maximum number of tiles allowed. 
+ """ + + max_image_tiles: Optional[int] + + @lru_cache(maxsize=10) def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]: """ @@ -567,6 +577,7 @@ class MllamaImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "num_tiles", "aspect_ratio_ids", "aspect_ratio_mask"] + valid_kwargs = MllamaImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 5420a2deeeb1..3955006a4f9e 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -21,17 +21,11 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -class MllamaImagesKwargs(ImagesKwargs, total=False): - max_image_tiles: Optional[int] - - class MllamaProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: MllamaImagesKwargs - _defaults = { "image_kwargs": { "max_image_tiles": 4, @@ -225,8 +219,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[MllamaProcessorKwargs], ) -> BatchFeature: """ @@ -267,10 +259,8 @@ def __call__( **kwargs, ) - text_kwargs = output_kwargs["text_kwargs"] - text_kwargs["return_tensors"] = None + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) images_kwargs = output_kwargs["images_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] data = {} if text is not None: @@ -280,8 +270,7 @@ def __call__( raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") n_images_in_text = [t.count(self.image_token) for t in text] text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text] - _ = text_kwargs.pop("padding_side", None) # hack until padding-side is an accepted kwarg by tokenizers - encoding = self.tokenizer(text, **text_kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) self._check_special_mm_tokens(text, encoding, modalities=["image"]) n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]] data.update(encoding) @@ -334,10 +323,7 @@ def __call__( ) data["cross_attention_mask"] = cross_attention_mask - return_tensors = common_kwargs.pop("return_tensors", None) - batch_feature = BatchFeature(data=data, tensor_type=return_tensors) - - return batch_feature + return BatchFeature(data=data, tensor_type=return_tensors) def post_process_image_text_to_text( self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs diff --git a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py index e716553a6d10..dc10170734ec 100644 --- a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +++ b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py @@ -16,8 +16,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - Unpack, ) from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling from ...utils import auto_docstring @@ -36,8 +34,5 @@ class MobileNetV1ImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True - def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> None: - super().__init__(**kwargs) - __all__ = ["MobileNetV1ImageProcessorFast"] diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py index 186dc3cf5772..e8dfe992544a 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, is_torch_tensor, logging @@ -50,6 +51,17 @@ logger = logging.get_logger(__name__) +class MobileNetV2ImageProcessorKwargs(ImagesKwargs): + """ + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
+ """ + + do_reduce_labels: Optional[bool] + + @requires(backends=("vision",)) class MobileNetV2ImageProcessor(BaseImageProcessor): r""" @@ -96,6 +108,7 @@ class MobileNetV2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = MobileNetV2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py index 6c40fbf3f9b8..2c8329a034c1 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -40,17 +39,7 @@ TensorType, auto_docstring, ) - - -class MobileNetV2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: Optional[bool] +from .image_processing_mobilenet_v2 import MobileNetV2ImageProcessorKwargs @auto_docstring @@ -66,9 +55,9 @@ class MobileNetV2ImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_reduce_labels = False - valid_kwargs = MobileNetV2FastImageProcessorKwargs + valid_kwargs = MobileNetV2ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[MobileNetV2FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[MobileNetV2ImageProcessorKwargs]): super().__init__(**kwargs) # Copied from transformers.models.beit.image_processing_beit_fast.BeitImageProcessorFast.reduce_label @@ -87,7 +76,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[MobileNetV2FastImageProcessorKwargs], + **kwargs: Unpack[MobileNetV2ImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -102,7 +91,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[MobileNetV2FastImageProcessorKwargs], + **kwargs: Unpack[MobileNetV2ImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 0ea7a0706cc4..576ef9f449dc 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -31,6 +31,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -52,6 +53,20 @@ logger = logging.get_logger(__name__) +class MobileVitImageProcessorKwargs(ImagesKwargs): + """ + do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`): + Whether to flip the color channels from RGB to BGR or vice versa. 
+ do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + """ + + do_flip_channel_order: Optional[bool] + do_reduce_labels: Optional[bool] + + @requires(backends=("vision",)) class MobileViTImageProcessor(BaseImageProcessor): r""" @@ -91,6 +106,7 @@ class MobileViTImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = MobileVitImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit_fast.py b/src/transformers/models/mobilevit/image_processing_mobilevit_fast.py index fab16ecfdc87..81d745c2b54d 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit_fast.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -38,20 +37,7 @@ TensorType, auto_docstring, ) - - -class MobileVitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`): - Whether to flip the color channels from RGB to BGR or vice versa. - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_flip_channel_order: Optional[bool] - do_reduce_labels: Optional[bool] +from .image_processing_mobilevit import MobileVitImageProcessorKwargs @auto_docstring @@ -67,9 +53,9 @@ class MobileViTImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = None do_flip_channel_order = True do_reduce_labels = False - valid_kwargs = MobileVitFastImageProcessorKwargs + valid_kwargs = MobileVitImageProcessorKwargs - def __init__(self, **kwargs: Unpack[MobileVitFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[MobileVitImageProcessorKwargs]): super().__init__(**kwargs) # Copied from transformers.models.beit.image_processing_beit_fast.BeitImageProcessorFast.reduce_label @@ -88,7 +74,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[MobileVitFastImageProcessorKwargs], + **kwargs: Unpack[MobileVitImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -103,7 +89,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[MobileVitFastImageProcessorKwargs], + **kwargs: Unpack[MobileVitImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. 
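Before the Nougat changes below, an illustrative usage sketch: caller-facing behavior is unchanged by these renames, because the public entry point is still the processor call itself. The checkpoint is the public `apple/mobilevit-small`; running the sketch assumes torch, torchvision, and Hub access are available.

# Usage sketch, not part of the patch.
from PIL import Image

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("apple/mobilevit-small", use_fast=True)
image = Image.new("RGB", (256, 256))  # stand-in input for the sketch

# Shared kwargs (e.g. `do_resize`) and model-specific ones (e.g. `do_flip_channel_order`)
# are both declared on MobileVitImageProcessorKwargs after this patch.
inputs = processor(image, do_resize=True, do_flip_channel_order=True, return_tensors="pt")
print(inputs["pixel_values"].shape)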
diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index b4ec184773d4..0a5c445645e0 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging from ...utils.import_utils import is_vision_available @@ -51,6 +52,21 @@ import PIL +class NougatImageProcessorKwargs(ImagesKwargs): + r""" + do_crop_margin (`bool`, *optional*, defaults to `True`): + Whether to crop the image margins. + do_thumbnail (`bool`, *optional*, defaults to `True`): + Whether to resize the image using thumbnail method. + do_align_long_axis (`bool`, *optional*, defaults to `False`): + Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. + """ + + do_crop_margin: Optional[bool] + do_thumbnail: Optional[bool] + do_align_long_axis: Optional[bool] + + class NougatImageProcessor(BaseImageProcessor): r""" Constructs a Nougat image processor. @@ -87,6 +103,7 @@ class NougatImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = NougatImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/nougat/image_processing_nougat_fast.py b/src/transformers/models/nougat/image_processing_nougat_fast.py index 15cee9051082..b059688d0046 100644 --- a/src/transformers/models/nougat/image_processing_nougat_fast.py +++ b/src/transformers/models/nougat/image_processing_nougat_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -42,22 +41,7 @@ TensorType, auto_docstring, ) - - -class NougatFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - do_crop_margin (`bool`, *optional*, defaults to `True`): - Whether to crop the image margins. - do_thumbnail (`bool`, *optional*, defaults to `True`): - Whether to resize the image using thumbnail method. - do_align_long_axis (`bool`, *optional*, defaults to `False`): - Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. 
- """ - - do_crop_margin: Optional[bool] - do_thumbnail: Optional[bool] - do_align_long_axis: Optional[bool] +from .image_processing_nougat import NougatImageProcessorKwargs @auto_docstring @@ -73,13 +57,13 @@ class NougatImageProcessorFast(BaseImageProcessorFast): do_pad: bool = True do_rescale = True do_crop_margin: bool = True - valid_kwargs = NougatFastImageProcessorKwargs + valid_kwargs = NougatImageProcessorKwargs - def __init__(self, **kwargs: Unpack[NougatFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[NougatImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[NougatFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[NougatImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def python_find_non_zero( diff --git a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py index 0c4cfd40eb62..842fe5d9bddf 100644 --- a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py @@ -65,7 +65,6 @@ class OmDetTurboProcessorKwargs(ProcessingKwargs, total=False): "verbose": True, "task": None, }, - "images_kwargs": {}, } @@ -227,8 +226,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[list[str], list[list[str]]]] = None, - audio=None, - videos=None, **kwargs: Unpack[OmDetTurboProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 756480d81e5a..86ce8abf084e 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -44,6 +44,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -63,6 +64,30 @@ from torch import nn +class OneFormerImageProcessorKwargs(ImagesKwargs): + r""" + repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`): + Path to a local directory or Hugging Face Hub repository containing model metadata. + class_info_file (`str`, *optional*): + Path to the JSON file within the repository that contains class metadata. + num_text (`int`, *optional*): + Number of text queries for the text encoder, used as task-guiding prompts. + num_labels (`int`, *optional*): + Number of semantic classes for segmentation, determining the output layer's size. + ignore_index (`int`, *optional*): + Label to ignore in segmentation maps, often used for padding. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether to decrement all label values by 1, mapping the background class to `ignore_index`. 
+ """ + + repo_path: Optional[str] + class_info_file: Optional[str] + num_text: Optional[int] + num_labels: Optional[int] + ignore_index: Optional[int] + do_reduce_labels: Optional[bool] + + # Copied from transformers.models.detr.image_processing_detr.max_across_indices def max_across_indices(values: Iterable[Any]) -> list[Any]: """ @@ -423,6 +448,7 @@ class OneFormerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask", "task_inputs"] + valid_kwargs = OneFormerImageProcessorKwargs @filter_out_non_signature_kwargs(extra=["max_size", "metadata", *INIT_SERVICE_KWARGS]) def __init__( diff --git a/src/transformers/models/oneformer/image_processing_oneformer_fast.py b/src/transformers/models/oneformer/image_processing_oneformer_fast.py index 4a20a04e70f2..a14b0015b498 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer_fast.py +++ b/src/transformers/models/oneformer/image_processing_oneformer_fast.py @@ -23,7 +23,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, get_max_height_width, group_images_by_shape, reorder_images, @@ -42,7 +41,7 @@ auto_docstring, logging, ) -from .image_processing_oneformer import load_metadata, prepare_metadata +from .image_processing_oneformer import OneFormerImageProcessorKwargs, load_metadata, prepare_metadata logger = logging.get_logger(__name__) @@ -300,30 +299,6 @@ def get_oneformer_resize_output_image_size( return (new_long, new_short) if width <= height else (new_short, new_long) -class OneFormerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`): - Path to a local directory or Hugging Face Hub repository containing model metadata. - class_info_file (`str`, *optional*): - Path to the JSON file within the repository that contains class metadata. - num_text (`int`, *optional*): - Number of text queries for the text encoder, used as task-guiding prompts. - num_labels (`int`, *optional*): - Number of semantic classes for segmentation, determining the output layer's size. - ignore_index (`int`, *optional*): - Label to ignore in segmentation maps, often used for padding. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether to decrement all label values by 1, mapping the background class to `ignore_index`. 
- """ - - repo_path: Optional[str] - class_info_file: Optional[str] - num_text: Optional[int] - num_labels: Optional[int] - ignore_index: Optional[int] - do_reduce_labels: Optional[bool] - - @auto_docstring class OneFormerImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -344,10 +319,10 @@ class OneFormerImageProcessorFast(BaseImageProcessorFast): class_info_file = None num_text = None num_labels = None - valid_kwargs = OneFormerFastImageProcessorKwargs + valid_kwargs = OneFormerImageProcessorKwargs model_input_names = ["pixel_values", "pixel_mask", "task_inputs"] - def __init__(self, **kwargs: Unpack[OneFormerFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[OneFormerImageProcessorKwargs]): super().__init__(**kwargs) if self.class_info_file: self.metadata = prepare_metadata(load_metadata(self.repo_path, self.class_info_file)) @@ -359,7 +334,7 @@ def preprocess( task_inputs: Optional[list[str]] = None, segmentation_maps: Optional[ImageInput] = None, instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, - **kwargs: Unpack[OneFormerFastImageProcessorKwargs], + **kwargs: Unpack[OneFormerImageProcessorKwargs], ) -> BatchFeature: r""" task_inputs (`list[str]`, *optional*): @@ -386,7 +361,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[OneFormerFastImageProcessorKwargs], + **kwargs: Unpack[OneFormerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/ovis2/image_processing_ovis2.py b/src/transformers/models/ovis2/image_processing_ovis2.py index 633a7fdee46c..5d0c2e2097d3 100644 --- a/src/transformers/models/ovis2/image_processing_ovis2.py +++ b/src/transformers/models/ovis2/image_processing_ovis2.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -43,6 +44,29 @@ logger = logging.get_logger(__name__) +class Ovis2ImageProcessorKwargs(ImagesKwargs): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. + use_covering_area_grid (`bool`, *optional*, defaults to `True`): + Whether to use the covering area grid to determine the number of patches. Only has an effect if + `crop_to_patches` is set to `True`. Can be overridden by the `use_covering_area_grid` parameter in the + `preprocess` method. 
+ """ + + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + use_covering_area_grid: Optional[bool] + + # Similar to image_processing_mllama.get_all_supported_aspect_ratios @lru_cache(maxsize=10) def get_all_supported_aspect_ratios(min_image_tiles: int, max_image_tiles: int) -> list[tuple[int, int]]: @@ -224,6 +248,7 @@ class Ovis2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = Ovis2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/ovis2/image_processing_ovis2_fast.py b/src/transformers/models/ovis2/image_processing_ovis2_fast.py index 04b79299e9e1..ea618e073526 100644 --- a/src/transformers/models/ovis2/image_processing_ovis2_fast.py +++ b/src/transformers/models/ovis2/image_processing_ovis2_fast.py @@ -21,7 +21,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -37,31 +36,7 @@ TensorType, auto_docstring, ) -from .image_processing_ovis2 import get_min_tile_covering_grid, get_optimal_tiled_canvas - - -class Ovis2ImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - crop_to_patches (`bool`, *optional*, defaults to `False`): - Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the - `preprocess` method. - min_patches (`int`, *optional*, defaults to 1): - The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. - max_patches (`int`, *optional*, defaults to 12): - The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. - use_covering_area_grid (`bool`, *optional*, defaults to `True`): - Whether to use the covering area grid to determine the number of patches. Only has an effect if - `crop_to_patches` is set to `True`. Can be overridden by the `use_covering_area_grid` parameter in the - `preprocess` method. 
- """ - - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - use_covering_area_grid: Optional[bool] +from .image_processing_ovis2 import Ovis2ImageProcessorKwargs, get_min_tile_covering_grid, get_optimal_tiled_canvas @auto_docstring diff --git a/src/transformers/models/owlv2/image_processing_owlv2_fast.py b/src/transformers/models/owlv2/image_processing_owlv2_fast.py index 417fc800ea88..25022f4f6c8c 100644 --- a/src/transformers/models/owlv2/image_processing_owlv2_fast.py +++ b/src/transformers/models/owlv2/image_processing_owlv2_fast.py @@ -25,17 +25,9 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import center_to_corners_format, group_images_by_shape, reorder_images -from ...image_utils import ( - OPENAI_CLIP_MEAN, - OPENAI_CLIP_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - SizeDict, -) -from ...processing_utils import Unpack +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension, PILImageResampling, SizeDict from ...utils import TensorType, auto_docstring from .image_processing_owlv2 import _scale_boxes, box_iou @@ -44,9 +36,6 @@ from .modeling_owlv2 import Owlv2ObjectDetectionOutput -class Owlv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... - - @auto_docstring class Owlv2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -63,7 +52,6 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast): model_input_names = ["pixel_values"] rescale_factor = 1 / 255 do_pad = True - valid_kwargs = Owlv2FastImageProcessorKwargs def post_process(self, outputs, target_sizes): """ @@ -240,13 +228,6 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh return results - def __init__(self, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): - super().__init__(**kwargs) - - @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): - return super().preprocess(images, **kwargs) - def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor": """ Pad an image with zeros to the given size. diff --git a/src/transformers/models/owlv2/modular_owlv2.py b/src/transformers/models/owlv2/modular_owlv2.py index 66acd2088399..c58db1efd46e 100644 --- a/src/transformers/models/owlv2/modular_owlv2.py +++ b/src/transformers/models/owlv2/modular_owlv2.py @@ -21,20 +21,16 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import ( - BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, ) from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import ( OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension, - ImageInput, PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, @@ -42,9 +38,6 @@ from ..owlvit.image_processing_owlvit_fast import OwlViTImageProcessorFast -class Owlv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... 
- - @auto_docstring class Owlv2ImageProcessorFast(OwlViTImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -56,17 +49,9 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast): do_rescale = True do_normalize = True do_pad = True - valid_kwargs = Owlv2FastImageProcessorKwargs crop_size = None do_center_crop = None - def __init__(self, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): - BaseImageProcessorFast.__init__(self, **kwargs) - - @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): - return BaseImageProcessorFast.preprocess(self, images, **kwargs) - def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor": """ Pad an image with zeros to the given size. diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index c22f9b045ae9..65f111e2ca79 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -47,7 +47,6 @@ class Owlv2ProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "padding": "max_length", }, - "images_kwargs": {}, "common_kwargs": { "return_tensors": "np", }, @@ -79,8 +78,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Owlv2ProcessorKwargs], ) -> BatchFeature: """ @@ -123,7 +120,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index e4feef67da9d..e7fb401d9a76 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -47,7 +47,6 @@ class OwlViTProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "padding": "max_length", }, - "images_kwargs": {}, "common_kwargs": { "return_tensors": "np", }, @@ -89,8 +88,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[OwlViTProcessorKwargs], ) -> BatchFeature: """ @@ -133,7 +130,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index c4c618a4d958..7fa636ab796b 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -23,7 +23,6 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image from ...processing_utils import ( - ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, @@ -44,13 +43,8 @@ class PaliGemmaTextKwargs(TextKwargs): suffix: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], 
list[PreTokenizedInput]]] -class PaliGemmaImagesKwargs(ImagesKwargs): - do_convert_rgb: Optional[bool] - - class PaliGemmaProcessorKwargs(ProcessingKwargs, total=False): text_kwargs: PaliGemmaTextKwargs - images_kwargs: PaliGemmaImagesKwargs _defaults = { "text_kwargs": { "padding": False, @@ -150,8 +144,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[PaliGemmaProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index c26132a48439..c6491b4bc703 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -25,7 +25,6 @@ ) from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, get_image_size, group_images_by_shape, reorder_images, @@ -36,11 +35,14 @@ ChannelDimension, PILImageResampling, ) -from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring +from ...processing_utils import ImagesKwargs, Unpack +from ...utils import ( + TensorType, + auto_docstring, +) -class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class PerceptionLMImageProcessorKwargs(ImagesKwargs): r""" vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`): Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for @@ -51,9 +53,9 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): Maximum number of tiles an image can be split into based on its aspect ratio. 
""" - vision_input_type: str = "thumb+tile" - tile_size: int = 448 - max_num_tiles: int = 36 + vision_input_type: Optional[str] + tile_size: Optional[int] + max_num_tiles: Optional[int] @auto_docstring @@ -66,14 +68,17 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True + vision_input_type = "thumb+tail" + tile_size = 448 + max_num_tiles = 36 size = {"width": 448, "height": 448} # for backward compatibility in tests - valid_kwargs = PerceptionLMFastImageProcessorKwargs + valid_kwargs = PerceptionLMImageProcessorKwargs - def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[PerceptionLMImageProcessorKwargs]) -> None: super().__init__(**kwargs) @auto_docstring - def preprocess(self, images, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images, **kwargs: Unpack[PerceptionLMImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) @staticmethod @@ -267,7 +272,7 @@ def _preprocess( max_num_tiles: int, return_tensors: Optional[Union[str, TensorType]], disable_grouping: bool, - **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs], + **kwargs: Unpack[PerceptionLMImageProcessorKwargs], ) -> BatchFeature: # Group images by size for batched transformation grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 35f0fef6c4ca..e57418ef92f7 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -89,7 +89,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[PerceptionLMProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/models/perception_lm/video_processing_perception_lm.py b/src/transformers/models/perception_lm/video_processing_perception_lm.py index 1023aa7c589d..9b5c5d0b67a0 100644 --- a/src/transformers/models/perception_lm/video_processing_perception_lm.py +++ b/src/transformers/models/perception_lm/video_processing_perception_lm.py @@ -14,13 +14,9 @@ """Video processor class for PerceptionLM.""" from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling -from ...processing_utils import Unpack, VideosKwargs from ...video_processing_utils import BaseVideoProcessor -class PerceptionLMFastVideoProcessorInitKwargs(VideosKwargs): ... 
- - class PerceptionLMVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = IMAGENET_STANDARD_MEAN @@ -31,11 +27,6 @@ class PerceptionLMVideoProcessor(BaseVideoProcessor): do_rescale = True do_normalize = True do_convert_rgb = True - valid_kwargs = PerceptionLMFastVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - - def __init__(self, **kwargs: Unpack[PerceptionLMFastVideoProcessorInitKwargs]): - super().__init__(**kwargs) __all__ = ["PerceptionLMVideoProcessor"] diff --git a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py index 4bd9928daa94..5c750fae953e 100644 --- a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +++ b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py @@ -21,10 +21,10 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, Unpack, ) from ...image_utils import ImageInput, PILImageResampling, SizeDict +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, auto_docstring, @@ -35,7 +35,7 @@ logger = logging.get_logger(__name__) -class Phi4MultimodalFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Phi4MultimodalImageProcessorKwargs(ImagesKwargs): r""" patch_size (`int`, *optional*): The size of the patch. @@ -59,10 +59,10 @@ class Phi4MultimodalImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True - valid_kwargs = Phi4MultimodalFastImageProcessorKwargs + valid_kwargs = Phi4MultimodalImageProcessorKwargs model_input_names = ["image_pixel_values", "image_sizes", "image_attention_mask"] - def __init__(self, **kwargs: Unpack[Phi4MultimodalFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Phi4MultimodalImageProcessorKwargs]): super().__init__(**kwargs) def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size): @@ -158,7 +158,7 @@ def pad_mask_to_max_num_crops(self, masks, max_crops=5): def preprocess( self, images: ImageInput, - **kwargs: Unpack[Phi4MultimodalFastImageProcessorKwargs], + **kwargs: Unpack[Phi4MultimodalImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py index 316f2021461c..e0c630369029 100644 --- a/src/transformers/models/pix2struct/image_processing_pix2struct.py +++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py @@ -32,6 +32,7 @@ to_numpy_array, valid_images, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_torch_available, is_vision_available, logging from ...utils.import_utils import requires_backends @@ -48,6 +49,18 @@ DEFAULT_FONT_PATH = "ybelkada/fonts" +class Pix2StructImageProcessorKwargs(ImagesKwargs): + """ + max_patches (`int`, *optional*): + Maximum number of patches to extract. + header_text (`Union[list[str], str]`, *optional*): + Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`. 
+ """ + + max_patches: Optional[int] + header_text: Optional[Union[list[str], str]] + + # adapted from: https://discuss.pytorch.org/t/tf-image-extract-patches-in-pytorch/171409/2 def torch_extract_patches(image_tensor, patch_height, patch_width): """ @@ -208,6 +221,7 @@ class Pix2StructImageProcessor(BaseImageProcessor): """ model_input_names = ["flattened_patches", "attention_mask"] + valid_kwargs = Pix2StructImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py index f21dd5d7a002..fba2fe93ef19 100644 --- a/src/transformers/models/pix2struct/processing_pix2struct.py +++ b/src/transformers/models/pix2struct/processing_pix2struct.py @@ -16,21 +16,15 @@ Processor class for Pix2Struct. """ -from typing import Optional, Union +from typing import Union from ...feature_extraction_utils import BatchFeature -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import logging -class Pix2StructImagesKwargs(ImagesKwargs, total=False): - max_patches: Optional[int] - header_text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] - - class Pix2StructProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Pix2StructImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -79,8 +73,6 @@ def __call__( self, images=None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Pix2StructProcessorKwargs], ) -> Union[BatchEncoding, BatchFeature]: """ diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 2be667e5dc6e..f5df895e66a4 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -38,6 +38,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging from ...utils.import_utils import requires_backends @@ -49,6 +50,15 @@ import PIL +class PixtralImageProcessorKwargs(ImagesKwargs): + """ + patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): + Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. + """ + + patch_size: Optional[dict[str, int]] + + # Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white. 
def convert_to_rgb(image: ImageInput) -> ImageInput: """ @@ -171,6 +181,7 @@ class PixtralImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_sizes"] + valid_kwargs = PixtralImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/pixtral/image_processing_pixtral_fast.py b/src/transformers/models/pixtral/image_processing_pixtral_fast.py index b31f910e4817..4a877d633048 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral_fast.py +++ b/src/transformers/models/pixtral/image_processing_pixtral_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -33,21 +32,12 @@ auto_docstring, logging, ) -from .image_processing_pixtral import get_resize_output_image_size +from .image_processing_pixtral import PixtralImageProcessorKwargs, get_resize_output_image_size logger = logging.get_logger(__name__) -class PixtralFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): - Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. - """ - - patch_size: Optional[dict[str, int]] - - @auto_docstring class PixtralImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -60,15 +50,15 @@ class PixtralImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True - valid_kwargs = PixtralFastImageProcessorKwargs + valid_kwargs = PixtralImageProcessorKwargs model_input_names = ["pixel_values", "image_sizes"] - def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[PixtralImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[PixtralImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index bf4eb9307c72..5bb9fd780328 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -45,7 +45,6 @@ class PixtralProcessorKwargs(ProcessingKwargs, total=False): "padding": False, "return_mm_token_type_ids": False, }, - "images_kwargs": {}, "common_kwargs": { "return_tensors": "pt", }, @@ -120,8 +119,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[PixtralProcessorKwargs], ) -> BatchFeature: """ @@ -164,7 +161,8 @@ def __call__( patch_size = self.patch_size * self.spatial_merge_size if images is not None: - image_inputs = self.image_processor(images, patch_size=patch_size, **output_kwargs["images_kwargs"]) + output_kwargs["images_kwargs"]["patch_size"] = patch_size + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) else: image_inputs = {} diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index 
ce3cd398745c..7d03f8281285 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -47,6 +48,15 @@ logger = logging.get_logger(__name__) +class PoolFormerImageProcessorKwargs(ImagesKwargs): + r""" + crop_pct (`float`, *optional*, defaults to `self.crop_pct`): + Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. + """ + + crop_pct: Optional[float] + + class PoolFormerImageProcessor(BaseImageProcessor): r""" Constructs a PoolFormer image processor. @@ -99,6 +109,7 @@ class PoolFormerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = PoolFormerImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/poolformer/image_processing_poolformer_fast.py b/src/transformers/models/poolformer/image_processing_poolformer_fast.py index 62d5f276859f..594d076a924c 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer_fast.py +++ b/src/transformers/models/poolformer/image_processing_poolformer_fast.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import ( ChannelDimension, get_resize_output_image_size, @@ -40,16 +40,7 @@ TensorType, auto_docstring, ) - - -class PoolFormerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - crop_pct (`float`, *optional*, defaults to `self.crop_pct`): - Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. 
- """ - - crop_pct: Optional[float] +from .image_processing_poolformer import PoolFormerImageProcessorKwargs @auto_docstring @@ -65,13 +56,13 @@ class PoolFormerImageProcessorFast(BaseImageProcessorFast): do_center_crop = True do_rescale = True do_normalize = True - valid_kwargs = PoolFormerFastImageProcessorKwargs + valid_kwargs = PoolFormerImageProcessorKwargs - def __init__(self, **kwargs: Unpack[PoolFormerFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[PoolFormerImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[PoolFormerFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[PoolFormerImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py index 7224aeef8612..b8220a30fa42 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py @@ -38,6 +38,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -53,6 +54,22 @@ logger = logging.get_logger(__name__) +class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs): + r""" + keep_aspect_ratio (`bool`, *optional*): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. + ensure_multiple_of (`int`, *optional*): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. + prompt_scale_to_meter (`float`, *optional*): + Scale factor to convert the prompt depth to meters. 
+ """ + + keep_aspect_ratio: Optional[bool] + ensure_multiple_of: Optional[int] + size_divisor: Optional[int] + prompt_scale_to_meter: Optional[float] + + def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): x = round(val / multiple) * multiple @@ -136,6 +153,7 @@ class PromptDepthAnythingImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "prompt_depth"] + valid_kwargs = PromptDepthAnythingImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py index 06d6ed156443..552d921700bc 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py @@ -28,7 +28,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -45,6 +44,7 @@ auto_docstring, requires_backends, ) +from .image_processing_prompt_depth_anything import PromptDepthAnythingImageProcessorKwargs def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): @@ -89,27 +89,6 @@ def _get_resize_output_image_size( return (new_height, new_width) -class PromptDepthAnythingFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - keep_aspect_ratio (`bool`, *optional*): - If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. - ensure_multiple_of (`int`, *optional*): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. - do_pad (`bool`, *optional*): - Whether to apply center padding. - size_divisor (`int`, *optional*): - If `do_pad` is `True`, pads the image dimensions to be divisible by this value. - prompt_scale_to_meter (`float`, *optional*): - Scale factor to convert the prompt depth to meters. 
- """ - - keep_aspect_ratio: Optional[bool] - ensure_multiple_of: Optional[int] - do_pad: Optional[bool] - size_divisor: Optional[int] - prompt_scale_to_meter: Optional[float] - - @auto_docstring class PromptDepthAnythingImageProcessorFast(BaseImageProcessorFast): model_input_names = ["pixel_values", "prompt_depth"] @@ -126,9 +105,9 @@ class PromptDepthAnythingImageProcessorFast(BaseImageProcessorFast): do_pad = False size_divisor = None prompt_scale_to_meter = 0.001 - valid_kwargs = PromptDepthAnythingFastImageProcessorKwargs + valid_kwargs = PromptDepthAnythingImageProcessorKwargs - def __init__(self, **kwargs: Unpack[PromptDepthAnythingFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[PromptDepthAnythingImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring @@ -136,7 +115,7 @@ def preprocess( self, images: ImageInput, prompt_depth: Optional[ImageInput] = None, - **kwargs: Unpack[PromptDepthAnythingFastImageProcessorKwargs], + **kwargs: Unpack[PromptDepthAnythingImageProcessorKwargs], ) -> BatchFeature: r""" prompt_depth (`ImageInput`, *optional*): @@ -213,7 +192,7 @@ def _preprocess_image_like_inputs( device: Optional[Union[str, "torch.device"]] = None, prompt_scale_to_meter: Optional[float] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs: Unpack[PromptDepthAnythingFastImageProcessorKwargs], + **kwargs: Unpack[PromptDepthAnythingImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs, including the main images and optional prompt depth. diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 5fcbb0c535f9..95f687e1414a 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -25,34 +25,29 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput from ...video_utils import VideoInput +# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni +# and does not use them in video processor class class Qwen2_5_OmniVideosKwargs(VideosKwargs): - fps: Optional[list[Union[int, float]]] - use_audio_in_video: Optional[bool] - seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - -class Qwen2_5_OmniImagesKwargs(ImagesKwargs): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] + min_frames: Optional[int] + max_frames: Optional[int] + use_audio_in_video: Optional[bool] + seconds_per_chunk: Optional[float] + position_id_per_seconds: Optional[int] class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): videos_kwargs: Qwen2_5_OmniVideosKwargs - images_kwargs: Qwen2_5_OmniImagesKwargs + _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 2a2ee775b7be..3d3f325e86f6 100644 --- 
a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -40,7 +40,7 @@ VisionAttention, VisionRotaryEmbedding, ) -from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLImagesKwargs, Qwen2VLProcessor +from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor from ...activations import ACT2FN from ...cache_utils import Cache @@ -49,7 +49,7 @@ from ...image_utils import ImageInput from ...modeling_flash_attention_utils import is_flash_attn_available from ...modeling_layers import GradientCheckpointingLayer -from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack, VideosKwargs +from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torchdynamo_compiling, logging from ...video_utils import VideoInput @@ -839,17 +839,7 @@ def prepare_inputs_for_generation( return model_inputs -class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] - - -class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): - pass - - class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen2_5_VLImagesKwargs - videos_kwargs: Qwen2_5_VLVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 0b2fc3dbfc38..8d249fb2d51c 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -30,26 +30,12 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...video_utils import VideoInput -class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] - - -class Qwen2_5_VLImagesKwargs(ImagesKwargs): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen2_5_VLImagesKwargs - videos_kwargs: Qwen2_5_VLVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index 36a58d68730b..fe218bd05b9d 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -44,6 +44,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging from ...video_utils import VideoInput, make_batched_videos @@ -51,6 +52,27 @@ logger = logging.get_logger(__name__) +class Qwen2VLImageProcessorKwargs(ImagesKwargs): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. 
+ patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: Optional[int] + max_pixels: Optional[int] + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -116,6 +138,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"] + valid_kwargs = Qwen2VLImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index ec9878da3222..bcbcaa6a8ad4 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -27,7 +27,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -46,33 +45,12 @@ logging, ) from ...video_utils import VideoInput, make_batched_videos -from .image_processing_qwen2_vl import smart_resize +from .image_processing_qwen2_vl import Qwen2VLImageProcessorKwargs, smart_resize logger = logging.get_logger(__name__) -class Qwen2VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - @auto_docstring class Qwen2VLImageProcessorFast(BaseImageProcessorFast): do_resize = True @@ -88,10 +66,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): merge_size = 2 min_pixels = None max_pixels = None - valid_kwargs = Qwen2VLFastImageProcessorKwargs + valid_kwargs = Qwen2VLImageProcessorKwargs model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"] - def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Qwen2VLImageProcessorKwargs]): size = kwargs.pop("size", None) min_pixels = kwargs.pop("min_pixels", None) max_pixels = kwargs.pop("max_pixels", None) @@ -136,7 +114,7 @@ def preprocess( self, images: ImageInput, videos: Optional[VideoInput] = None, - **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs], + **kwargs: Unpack[Qwen2VLImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, videos, **kwargs) @@ -147,7 +125,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[DefaultFastImageProcessorKwargs], + **kwargs: Unpack[Qwen2VLImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py index b237cb6079fb..f630d039edbd 100644 --- a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py @@ -27,7 +27,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging from ...video_utils import VideoInput @@ -36,16 +36,7 @@ logger = logging.get_logger(__name__) -class Qwen2VLImagesKwargs(ImagesKwargs): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Qwen2VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen2VLImagesKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index 86041fc3de16..a118f7d2260b 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -20,41 +20,35 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import re -from typing import Optional, Union +from typing import Optional import numpy as np from ...audio_utils import AudioInput from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, VideosKwargs +from ...processing_utils import ProcessingKwargs, ProcessorMixin, VideosKwargs from ...tokenization_utils_base import TextInput from ...video_utils import VideoInput, make_batched_videos +# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni +# and does not use them in video processor class class Qwen3OmniMoeVideosKwargs(VideosKwargs): - fps: Optional[list[Union[int, float]]] - use_audio_in_video: Optional[bool] - seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - -class Qwen3OmniMoeImagesKwargs(ImagesKwargs): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] + min_frames: Optional[int] + max_frames: Optional[int] + use_audio_in_video: Optional[bool] + seconds_per_chunk: Optional[float] + position_id_per_seconds: Optional[int] class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False): videos_kwargs: Qwen3OmniMoeVideosKwargs - images_kwargs: Qwen3OmniMoeImagesKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 9c479c6bdd23..ff5346fb94b6 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -31,7 +31,7 @@ from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update, rope_config_validation from ...modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...processing_utils import ProcessingKwargs, Unpack, VideosKwargs +from ...processing_utils import ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, is_torchdynamo_compiling, logging from ...utils.generic import check_model_inputs @@ -50,7 +50,7 @@ VisionAttention, VisionRotaryEmbedding, ) -from ..qwen2_vl.processing_qwen2_vl import Qwen2VLImagesKwargs, Qwen2VLProcessor +from ..qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor from ..qwen3.modeling_qwen3 import ( Qwen3Attention, Qwen3DecoderLayer, @@ -1252,17 +1252,7 @@ def prepare_inputs_for_generation( return model_inputs -class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False): - pass - - -class Qwen3VLImagesKwargs(Qwen2VLImagesKwargs): - pass - - class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen3VLImagesKwargs - videos_kwargs: Qwen3VLVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py index 03c709cb0b3f..49199f0c3ecc 100644 --- a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py @@ -19,13 +19,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Union +from typing import Union import numpy as np from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging from ...video_utils import VideoInput @@ -34,21 +34,7 @@ logger = logging.get_logger(__name__) -class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False): - pass - - -class Qwen3VLImagesKwargs(ImagesKwargs): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen3VLImagesKwargs - videos_kwargs: Qwen3VLVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index cf657867a9f8..ec65d0e7731d 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -48,6 +48,7 @@ validate_annotations, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( filter_out_non_signature_kwargs, is_torch_available, @@ -66,6 +67,29 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,) +class RTDetrImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. 
+ """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -406,6 +430,7 @@ class RTDetrImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = RTDetrImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py index 9aae271deacc..44946eeed9e3 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -13,7 +13,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -34,24 +33,7 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, requires_backends from ...utils.import_utils import requires -from .image_processing_rt_detr import get_size_with_aspect_ratio - - -class RTDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. - """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] +from .image_processing_rt_detr import RTDetrImageProcessorKwargs, get_size_with_aspect_ratio SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -130,10 +112,10 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast): size = {"height": 640, "width": 640} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = RTDetrFastImageProcessorKwargs + valid_kwargs = RTDetrImageProcessorKwargs do_convert_annotations = True - def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[RTDetrImageProcessorKwargs]) -> None: # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations") do_normalize = kwargs.get("do_normalize") @@ -356,26 +338,9 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[RTDetrFastImageProcessorKwargs], + **kwargs: Unpack[RTDetrImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. 
If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py index 61bd055144f0..0f72fdd52845 100644 --- a/src/transformers/models/rt_detr/modular_rt_detr.py +++ b/src/transformers/models/rt_detr/modular_rt_detr.py @@ -4,7 +4,7 @@ import torch from torchvision.transforms.v2 import functional as F -from transformers.models.detr.image_processing_detr_fast import DetrFastImageProcessorKwargs, DetrImageProcessorFast +from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict, get_max_height_width @@ -26,6 +26,7 @@ logging, requires_backends, ) +from .image_processing_rt_detr import RTDetrImageProcessorKwargs logger = logging.get_logger(__name__) @@ -92,10 +93,6 @@ def prepare_coco_detection_annotation( return new_target -class RTDetrFastImageProcessorKwargs(DetrFastImageProcessorKwargs): - pass - - class RTDetrImageProcessorFast(DetrImageProcessorFast): resample = PILImageResampling.BILINEAR image_mean = IMAGENET_DEFAULT_MEAN @@ -109,9 +106,9 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast): size = {"height": 640, "width": 640} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = RTDetrFastImageProcessorKwargs + valid_kwargs = RTDetrImageProcessorKwargs - def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[RTDetrImageProcessorKwargs]) -> None: # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations") do_normalize = kwargs.get("do_normalize") @@ -123,11 +120,9 @@ def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[RTDetrFastImageProcessorKwargs], + **kwargs: Unpack[RTDetrImageProcessorKwargs], ) -> BatchFeature: - return BaseImageProcessorFast.preprocess(self, images, annotations, masks_path, **kwargs) + return BaseImageProcessorFast.preprocess(self, images, **kwargs) def prepare_annotation( self, diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index 91d3867484ad..e9da260a6e9c 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -37,6 +37,7 @@ 
valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -57,6 +58,19 @@ logger = logging.get_logger(__name__) +class SamImageProcessorKwargs(ImagesKwargs): + r""" + mask_size (`dict[str, int]`, *optional*): + The size `{"longest_edge": int}` to resize the segmentation maps to. + mask_pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width": int}` to pad the segmentation maps to. Must be larger than any segmentation + map size provided for preprocessing. + """ + + mask_size: Optional[dict[str, int]] + mask_pad_size: Optional[dict[str, int]] + + class SamImageProcessor(BaseImageProcessor): r""" Constructs a SAM image processor. @@ -107,6 +121,7 @@ class SamImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = SamImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/sam/image_processing_sam_fast.py b/src/transformers/models/sam/image_processing_sam_fast.py index 7cb5f7b2bfc2..e77b69ee1e2b 100644 --- a/src/transformers/models/sam/image_processing_sam_fast.py +++ b/src/transformers/models/sam/image_processing_sam_fast.py @@ -26,10 +26,7 @@ from torchvision.transforms.v2 import functional as F_t from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, -) +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -41,19 +38,7 @@ ) from ...processing_utils import Unpack from ...utils import auto_docstring - - -class SamFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - mask_size (`dict[str, int]`, *optional*): - The size `{"longest_edge": int}` to resize the segmentation maps to. - mask_pad_size (`dict[str, int]`, *optional*): - The size `{"height": int, "width": int}` to pad the segmentation maps to. Must be larger than any segmentation - map size provided for preprocessing. - """ - - mask_size: Optional[dict[str, int]] - mask_pad_size: Optional[dict[str, int]] +from .image_processing_sam import SamImageProcessorKwargs @auto_docstring @@ -68,13 +53,13 @@ class SamImageProcessorFast(BaseImageProcessorFast): do_normalize = True do_convert_rgb = True - valid_kwargs = SamFastImageProcessorKwargs + valid_kwargs = SamImageProcessorKwargs do_pad = True pad_size = {"height": 1024, "width": 1024} mask_pad_size = {"height": 256, "width": 256} - def __init__(self, **kwargs: Unpack[SamFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[SamImageProcessorKwargs]): super().__init__(**kwargs) def _get_preprocess_shape(self, old_shape: tuple[int, int], longest_edge: int): @@ -172,7 +157,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[SamFastImageProcessorKwargs], + **kwargs: Unpack[SamImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -187,7 +172,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[SamFastImageProcessorKwargs], + **kwargs: Unpack[SamImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. 
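Note on the SAM changes above: `mask_size` and `mask_pad_size` now live on a shared `SamImageProcessorKwargs` imported by both the slow and fast processors, instead of a fast-only kwargs class. A minimal usage sketch, illustrative only; the random arrays stand in for real images and segmentation maps, and the call assumes this branch is installed:

```python
import numpy as np

from transformers import SamImageProcessor

# Dummy inputs: a random RGB image and an all-background segmentation map.
image = np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8)
segmentation_map = np.zeros((512, 512), dtype=np.uint8)

processor = SamImageProcessor()
inputs = processor(
    images=image,
    segmentation_maps=segmentation_map,
    # Both keys are declared on SamImageProcessorKwargs, so they are handled
    # like any other image kwarg rather than as fast-processor-only extras.
    mask_size={"longest_edge": 256},
    mask_pad_size={"height": 256, "width": 256},
    return_tensors="np",
)
print(sorted(inputs.keys()))
```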
diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index c9ccbc93a413..bc82daf2034d 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -23,9 +23,8 @@ from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import AudioInput, BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_torch_available -from ...video_utils import VideoInput if is_torch_available(): @@ -38,6 +37,8 @@ class SamImagesKwargs(ImagesKwargs): input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] point_pad_value: Optional[int] + mask_size: Optional[dict[str, int]] + mask_pad_size: Optional[dict[str, int]] class SamProcessorKwargs(ProcessingKwargs, total=False): @@ -73,8 +74,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio: Optional[AudioInput] = None, - video: Optional[VideoInput] = None, **kwargs, ) -> BatchEncoding: """ @@ -114,7 +113,7 @@ def __call__( input_points=input_points, input_labels=input_labels, input_boxes=input_boxes, - return_tensors=output_kwargs["common_kwargs"].get("return_tensors"), + return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), point_pad_value=point_pad_value, ) diff --git a/src/transformers/models/sam2/image_processing_sam2_fast.py b/src/transformers/models/sam2/image_processing_sam2_fast.py index 30e99980f4d1..5ae472f53638 100644 --- a/src/transformers/models/sam2/image_processing_sam2_fast.py +++ b/src/transformers/models/sam2/image_processing_sam2_fast.py @@ -29,7 +29,7 @@ from torchvision.ops.boxes import batched_nms from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -39,11 +39,11 @@ SizeDict, pil_torch_interpolation_mapping, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -class Sam2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Sam2FastImageProcessorKwargs(ImagesKwargs): r""" mask_size (`dict[str, int]`, *optional*): The size `{"height": int, "width": int}` to resize the segmentation maps to. 
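The `Sam2FastImageProcessorKwargs(ImagesKwargs)` change above is the pattern this diff applies everywhere: model-specific kwargs TypedDicts now extend the shared `ImagesKwargs` rather than `DefaultFastImageProcessorKwargs`. A self-contained sketch of the pattern, using hypothetical names that are not part of the library:

```python
from typing import Optional

from typing_extensions import TypedDict, Unpack


class SharedImagesKwargs(TypedDict, total=False):
    # Stand-in for the shared ImagesKwargs keys.
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]
    return_tensors: Optional[str]


class MyModelImageProcessorKwargs(SharedImagesKwargs, total=False):
    # Model-specific extras are simply layered on top of the shared keys.
    mask_size: Optional[dict[str, int]]


def preprocess(images: list, **kwargs: Unpack[MyModelImageProcessorKwargs]) -> dict:
    # A single Unpack annotation exposes both the shared and the model-specific
    # keys to type checkers and to docstring tooling.
    return {"num_images": len(images), **kwargs}


print(preprocess([object()], do_resize=True, mask_size={"height": 256, "width": 256}))
```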
diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py index daab10855512..40414566267f 100644 --- a/src/transformers/models/sam2/modular_sam2.py +++ b/src/transformers/models/sam2/modular_sam2.py @@ -24,7 +24,7 @@ from ...activations import ACT2FN from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -36,7 +36,7 @@ ) from ...modeling_layers import GradientCheckpointingLayer from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import ( ModelOutput, TensorType, @@ -70,7 +70,7 @@ logger = logging.get_logger(__name__) -class Sam2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Sam2FastImageProcessorKwargs(ImagesKwargs): r""" mask_size (`dict[str, int]`, *optional*): The size `{"height": int, "width": int}` to resize the segmentation maps to. @@ -118,6 +118,19 @@ def _preprocess( ) -> "torch.Tensor": return BaseImageProcessorFast._preprocess(self, images, return_tensors=return_tensors, **kwargs).pixel_values + @auto_docstring + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + **kwargs: Unpack[Sam2FastImageProcessorKwargs], + ) -> BatchFeature: + r""" + segmentation_maps (`ImageInput`, *optional*): + The segmentation maps to preprocess. + """ + return super().preprocess(images, segmentation_maps, **kwargs) + def _preprocess_image_like_inputs( self, images: ImageInput, diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 49681c7c6a26..902e68832836 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -23,9 +23,8 @@ from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack -from ...tokenization_utils_base import AudioInput, BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_torch_available -from ...video_utils import VideoInput if is_torch_available(): @@ -38,6 +37,8 @@ class SamHQImagesKwargs(ImagesKwargs): input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] point_pad_value: Optional[int] + mask_size: Optional[dict[str, int]] + mask_pad_size: Optional[dict[str, int]] class SamHQProcessorKwargs(ProcessingKwargs, total=False): @@ -78,8 +79,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio: Optional[AudioInput] = None, - video: Optional[VideoInput] = None, **kwargs: Unpack[SamHQProcessorKwargs], ) -> BatchEncoding: """ @@ -118,7 +117,7 @@ def __call__( input_points=input_points, input_labels=input_labels, input_boxes=input_boxes, - return_tensors=output_kwargs["common_kwargs"].get("return_tensors"), + return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), point_pad_value=output_kwargs["images_kwargs"].get("point_pad_value"), ) diff --git a/src/transformers/models/segformer/image_processing_segformer.py 
b/src/transformers/models/segformer/image_processing_segformer.py index 0894c352de8b..ce9ace8115a4 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -54,6 +55,17 @@ logger = logging.get_logger(__name__) +class SegformerImageProcessorKwargs(ImagesKwargs): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + """ + + do_reduce_labels: Optional[bool] + + @requires(backends=("vision",)) class SegformerImageProcessor(BaseImageProcessor): r""" @@ -92,6 +104,7 @@ class SegformerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = SegformerImageProcessorKwargs @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS) def __init__( diff --git a/src/transformers/models/segformer/image_processing_segformer_fast.py b/src/transformers/models/segformer/image_processing_segformer_fast.py index 11dfa3c42ab1..d3dc35e609de 100644 --- a/src/transformers/models/segformer/image_processing_segformer_fast.py +++ b/src/transformers/models/segformer/image_processing_segformer_fast.py @@ -25,12 +25,7 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - group_images_by_shape, - reorder_images, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -42,17 +37,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class SegformerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. 
- """ - - do_reduce_labels: Optional[bool] +from .image_processing_segformer import SegformerImageProcessorKwargs @auto_docstring @@ -68,10 +53,10 @@ class SegformerImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_reduce_labels = False - valid_kwargs = SegformerFastImageProcessorKwargs + valid_kwargs = SegformerImageProcessorKwargs rescale_factor = 1 / 255 - def __init__(self, **kwargs: Unpack[SegformerFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[SegformerImageProcessorKwargs]): super().__init__(**kwargs) def reduce_label(self, labels: list["torch.Tensor"]): @@ -89,7 +74,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[SegformerFastImageProcessorKwargs], + **kwargs: Unpack[SegformerImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -104,7 +89,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[SegformerFastImageProcessorKwargs], + **kwargs: Unpack[SegformerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/segformer/modular_segformer.py b/src/transformers/models/segformer/modular_segformer.py index 831d046fd9a7..6bbbe9ecd4fd 100644 --- a/src/transformers/models/segformer/modular_segformer.py +++ b/src/transformers/models/segformer/modular_segformer.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from transformers.models.beit.image_processing_beit_fast import BeitFastImageProcessorKwargs, BeitImageProcessorFast +from transformers.models.beit.image_processing_beit_fast import BeitImageProcessorFast from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -38,10 +38,7 @@ from ...utils import ( TensorType, ) - - -class SegformerFastImageProcessorKwargs(BeitFastImageProcessorKwargs): - pass +from .image_processing_segformer import SegformerImageProcessorKwargs class SegformerImageProcessorFast(BeitImageProcessorFast): @@ -64,7 +61,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[SegformerFastImageProcessorKwargs], + **kwargs: Unpack[SegformerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/shieldgemma2/processing_shieldgemma2.py b/src/transformers/models/shieldgemma2/processing_shieldgemma2.py index 4341d087361e..8c221e826167 100644 --- a/src/transformers/models/shieldgemma2/processing_shieldgemma2.py +++ b/src/transformers/models/shieldgemma2/processing_shieldgemma2.py @@ -87,8 +87,6 @@ def __call__( self, images: Optional[ImageInput] = None, text=None, - videos=None, - audio=None, **kwargs: Unpack[ShieldGemma2ProcessorKwargs], ) -> BatchFeature: """Generates a batch of inputs from the provided images. @@ -120,8 +118,6 @@ def __call__( `(len(images) * len(policies), )`, and the order within the batch will be img1_policy1, ... img1_policyN, ... imgM_policyN. 
""" - del text, videos, audio - if not images: raise ValueError("ShieldGemma 2 needs images to classify") elif not isinstance(images, Sequence): diff --git a/src/transformers/models/siglip2/image_processing_siglip2.py b/src/transformers/models/siglip2/image_processing_siglip2.py index 74a166c440b5..caff1bce0bc9 100644 --- a/src/transformers/models/siglip2/image_processing_siglip2.py +++ b/src/transformers/models/siglip2/image_processing_siglip2.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -47,6 +48,19 @@ from PIL import Image +class Siglip2ImageProcessorKwargs(ImagesKwargs): + """ + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch the image will be split to. + max_num_patches (`int`, *optional*, defaults to 256): + The image will be resized to have at most this number of patches, + and then padded in "patch" dimension to match this number exactly. + """ + + patch_size: Optional[int] + max_num_patches: Optional[int] + + @lru_cache(maxsize=256) def get_image_size_for_max_num_patches( image_height: int, image_width: int, patch_size: int, max_num_patches: int, eps: float = 1e-5 @@ -159,6 +173,7 @@ class Siglip2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"] + valid_kwargs = Siglip2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/siglip2/image_processing_siglip2_fast.py b/src/transformers/models/siglip2/image_processing_siglip2_fast.py index 45261fab2cd0..347ec372d410 100644 --- a/src/transformers/models/siglip2/image_processing_siglip2_fast.py +++ b/src/transformers/models/siglip2/image_processing_siglip2_fast.py @@ -20,22 +20,15 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - SizeDict, -) -from ...image_utils import ( - ImageInput, - PILImageResampling, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_utils import ImageInput, PILImageResampling from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, logging, ) -from .image_processing_siglip2 import get_image_size_for_max_num_patches +from .image_processing_siglip2 import Siglip2ImageProcessorKwargs, get_image_size_for_max_num_patches logger = logging.get_logger(__name__) @@ -71,19 +64,6 @@ def pad_along_first_dim( return tensor, mask -class Siglip2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch the image will be split to. - max_num_patches (`int`, *optional*, defaults to 256): - The image will be resized to have at most this number of patches, - and then padded in "patch" dimension to match this number exactly. 
- """ - - patch_size: Optional[int] - max_num_patches: Optional[int] - - @auto_docstring class Siglip2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -94,10 +74,10 @@ class Siglip2ImageProcessorFast(BaseImageProcessorFast): do_normalize = True patch_size = 16 max_num_patches = 256 - valid_kwargs = Siglip2FastImageProcessorKwargs + valid_kwargs = Siglip2ImageProcessorKwargs unused_kwargs = ["size", "do_center_crop", "crop_size"] - def __init__(self, **kwargs: Unpack[Siglip2FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Siglip2ImageProcessorKwargs]): super().__init__(**kwargs) def _validate_preprocess_kwargs(self, **kwargs) -> tuple: @@ -106,7 +86,7 @@ def _validate_preprocess_kwargs(self, **kwargs) -> tuple: return super()._validate_preprocess_kwargs(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Siglip2FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Siglip2ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/siglip2/processing_siglip2.py b/src/transformers/models/siglip2/processing_siglip2.py index 8e177b237b10..b16650303da4 100644 --- a/src/transformers/models/siglip2/processing_siglip2.py +++ b/src/transformers/models/siglip2/processing_siglip2.py @@ -16,19 +16,10 @@ Image/Text processor class for SigLIP2. """ -from typing import Optional - -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin - - -class Siglip2ImagesKwargs(ImagesKwargs, total=False): - max_num_patches: Optional[int] - patch_size: Optional[int] +from ...processing_utils import ProcessingKwargs, ProcessorMixin class Siglip2ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Siglip2ImagesKwargs - _defaults = { "text_kwargs": { "padding": "max_length", diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py index 8a8ee5d4aa14..e231c1ec6b07 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py @@ -41,6 +41,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -50,6 +51,24 @@ logger = logging.get_logger(__name__) + + +class SmolVLMImageProcessorKwargs(ImagesKwargs): + """ + do_image_splitting (`bool`, *optional*, defaults to `True`): + Whether to split the image into sub-images concatenated with the original image. They are split into patches + such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. + max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): + Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". + return_row_col_info (`bool`, *optional*, defaults to `False`): + Whether to return the row and column information of the images. 
+ """ + + do_image_splitting: Optional[bool] + max_image_size: Optional[dict[str, int]] + return_row_col_info: Optional[bool] + + MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum @@ -288,6 +307,7 @@ class SmolVLMImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_attention_mask"] + valid_kwargs = SmolVLMImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py index 4e24bc279543..5bff89823f32 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py @@ -25,13 +25,7 @@ import torch from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - SizeDict, - group_images_by_shape, - reorder_images, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict, group_images_by_shape, reorder_images from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, @@ -41,6 +35,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torchvision_available, logging +from .image_processing_smolvlm import SmolVLMImageProcessorKwargs if is_torchvision_available(): @@ -49,23 +44,6 @@ logger = logging.get_logger(__name__) - -class SmolVLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_image_splitting (`bool`, *optional*, defaults to `True`): - Whether to split the image into sub-images concatenated with the original image. They are split into patches - such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. - max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): - Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". - return_row_col_info (`bool`, *optional*, defaults to `False`): - Whether to return the row and column information of the images. 
- """ - - do_image_splitting: Optional[bool] - max_image_size: Optional[dict[str, int]] - return_row_col_info: Optional[bool] - - MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum @@ -189,7 +167,7 @@ class SmolVLMImageProcessorFast(BaseImageProcessorFast): do_image_splitting = True do_pad = True return_row_col_info = False - valid_kwargs = SmolVLMFastImageProcessorKwargs + valid_kwargs = SmolVLMImageProcessorKwargs def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput: """ @@ -357,7 +335,7 @@ def pad( return image, pixel_mask @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[SmolVLMFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[SmolVLMImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 42dcecce6a3b..86d07e238f1b 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -21,7 +21,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import AllKwargsForChatTemplate, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import AllKwargsForChatTemplate, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, TextInput from ...utils import is_num2words_available, is_vision_available, logging from ...video_utils import VideoInput @@ -103,14 +103,7 @@ def get_image_prompt_string( ) -class SmolVLMImagesKwargs(ImagesKwargs, total=False): - return_row_col_info: Optional[bool] - max_image_size: Optional[dict[str, int]] - - class SmolVLMProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: SmolVLMImagesKwargs - _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -248,7 +241,6 @@ def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[SmolVLMProcessorKwargs], ) -> BatchEncoding: diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index 522a344b09b5..ce73dfb4a82e 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -92,7 +92,6 @@ def get_resize_output_image_size( class SmolVLMVideoProcessorInitKwargs(VideosKwargs): max_image_size: Optional[dict[str, int]] - do_pad: Optional[bool] class SmolVLMVideoProcessor(BaseVideoProcessor): diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index dc2c6ab22419..633d9b0b16b9 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -30,6 +30,7 @@ to_numpy_array, valid_images, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging, requires_backends @@ -45,6 +46,15 @@ logger = logging.get_logger(__name__) +class SuperPointImageProcessorKwargs(ImagesKwargs): + r""" + do_grayscale (`bool`, *optional*, defaults to `True`): + 
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. + """ + + do_grayscale: Optional[bool] = True + + def is_grayscale( image: np.ndarray, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -120,6 +130,7 @@ class SuperPointImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = SuperPointImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/superpoint/image_processing_superpoint_fast.py b/src/transformers/models/superpoint/image_processing_superpoint_fast.py index 54f95fa75af6..3750441fc9f0 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint_fast.py +++ b/src/transformers/models/superpoint/image_processing_superpoint_fast.py @@ -21,7 +21,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -34,6 +33,7 @@ TensorType, auto_docstring, ) +from .image_processing_superpoint import SuperPointImageProcessorKwargs if TYPE_CHECKING: @@ -53,15 +53,6 @@ def is_grayscale( ) -class SuperPointFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_grayscale (`bool`, *optional*, defaults to `True`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. - """ - - do_grayscale: Optional[bool] = True - - def convert_to_grayscale( image: "torch.Tensor", ) -> "torch.Tensor": @@ -90,9 +81,9 @@ class SuperPointImageProcessorFast(BaseImageProcessorFast): do_rescale = True rescale_factor = 1 / 255 do_normalize = None - valid_kwargs = SuperPointFastImageProcessorKwargs + valid_kwargs = SuperPointImageProcessorKwargs - def __init__(self, **kwargs: Unpack[SuperPointFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[SuperPointImageProcessorKwargs]): super().__init__(**kwargs) def _preprocess( diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index b15e7a9d8f86..018a1bf0f4df 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -30,6 +30,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging from ...utils.deprecation import deprecate_kwarg @@ -37,6 +38,10 @@ logger = logging.get_logger(__name__) +class Swin2SRImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + class Swin2SRImageProcessor(BaseImageProcessor): r""" Constructs a Swin2SR image processor. 
@@ -51,6 +56,7 @@ class Swin2SRImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = Swin2SRImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py b/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py index 82c9d733d367..bee3da36c9b6 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature, ChannelDimension, get_image_size from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -34,30 +33,21 @@ logging, ) from ...utils.deprecation import deprecate_kwarg +from .image_processing_swin2sr import Swin2SRImageProcessorKwargs logger = logging.get_logger(__name__) -class Swin2SRFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - size_divisor (`int`, *optional*, defaults to `8`): - The size of the sliding window for the local attention. It will be used to pad the image - to the size divisible by `size_divisor` - """ - - size_divisor: Optional[int] - - @auto_docstring class Swin2SRImageProcessorFast(BaseImageProcessorFast): do_rescale = True rescale_factor = 1 / 255 do_pad = True size_divisor = 8 - valid_kwargs = Swin2SRFastImageProcessorKwargs + valid_kwargs = Swin2SRImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Swin2SRFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Swin2SRImageProcessorKwargs]): pad_size = kwargs.pop("pad_size", None) kwargs.setdefault("size_divisor", pad_size) super().__init__(**kwargs) @@ -76,7 +66,7 @@ def pad_size(self, value): ) self.size_divisor = value - def preprocess(self, images: ImageInput, **kwargs: Unpack[Swin2SRFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Swin2SRImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) @deprecate_kwarg("size", version="v5", new_name="size_divisor") diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index 578dabd3cb71..1a4d68522205 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -39,6 +39,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -48,6 +49,10 @@ import PIL +class TextNetImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + class TextNetImageProcessor(BaseImageProcessor): r""" Constructs a TextNet image processor. 
@@ -90,6 +95,7 @@ class TextNetImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = TextNetImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/textnet/image_processing_textnet_fast.py b/src/transformers/models/textnet/image_processing_textnet_fast.py index baa6276736f7..eba6e14e64bc 100644 --- a/src/transformers/models/textnet/image_processing_textnet_fast.py +++ b/src/transformers/models/textnet/image_processing_textnet_fast.py @@ -20,7 +20,7 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_transforms import ( get_resize_output_image_size, group_images_by_shape, @@ -39,15 +39,7 @@ TensorType, auto_docstring, ) - - -class TextNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - size_divisor (`int`, *optional*, defaults to 32): - Ensures height and width are rounded to a multiple of this value after resizing. - """ - - size_divisor: Optional[int] +from .image_processing_textnet import TextNetImageProcessorKwargs @auto_docstring @@ -64,13 +56,13 @@ class TextNetImageProcessorFast(BaseImageProcessorFast): do_normalize = True do_convert_rgb = True size_divisor = 32 - valid_kwargs = TextNetFastImageProcessorKwargs + valid_kwargs = TextNetImageProcessorKwargs - def __init__(self, **kwargs: Unpack[TextNetFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[TextNetImageProcessorKwargs]) -> None: super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[TextNetFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[TextNetImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py index 049ed96c1749..81c7db2850b3 100644 --- a/src/transformers/models/trocr/processing_trocr.py +++ b/src/transformers/models/trocr/processing_trocr.py @@ -69,8 +69,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[TrOCRProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 2c4b066b6225..67c1ffe4fae8 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -39,6 +39,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -49,6 +50,21 @@ logger = logging.get_logger(__name__) +class TvpImageProcessorKwargs(ImagesKwargs): + r""" + do_flip_channel_order (`bool`, *optional*): + Whether to flip the channel order of the image from RGB to BGR. + constant_values (`float` or `List[float]`, *optional*): + Value used to fill the padding area when `pad_mode` is `'constant'`. + pad_mode (`str`, *optional*): + Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`. 
+ """ + + do_flip_channel_order: Optional[bool] + constant_values: Optional[Union[float, list[float]]] + pad_mode: Optional[str] + + # Copied from transformers.models.vivit.image_processing_vivit.make_batched def make_batched(videos) -> list[list[ImageInput]]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): @@ -133,6 +149,7 @@ class TvpImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = TvpImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/tvp/image_processing_tvp_fast.py b/src/transformers/models/tvp/image_processing_tvp_fast.py index 5d74e6efb71f..7cd550d75194 100644 --- a/src/transformers/models/tvp/image_processing_tvp_fast.py +++ b/src/transformers/models/tvp/image_processing_tvp_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -36,21 +35,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class TvpFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_flip_channel_order (`bool`, *optional*): - Whether to flip the channel order of the image from RGB to BGR. - constant_values (`float` or `List[float]`, *optional*): - Value used to fill the padding area when `pad_mode` is `'constant'`. - pad_mode (`str`, *optional*): - Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`. - """ - - do_flip_channel_order: Optional[bool] - constant_values: Optional[Union[float, list[float]]] - pad_mode: Optional[str] +from .image_processing_tvp import TvpImageProcessorKwargs @auto_docstring @@ -71,16 +56,16 @@ class TvpImageProcessorFast(BaseImageProcessorFast): pad_mode = "constant" do_normalize = True do_flip_channel_order = True - valid_kwargs = TvpFastImageProcessorKwargs + valid_kwargs = TvpImageProcessorKwargs - def __init__(self, **kwargs: Unpack[TvpFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[TvpImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring def preprocess( self, videos: Union[ImageInput, list[ImageInput], list[list[ImageInput]]], - **kwargs: Unpack[TvpFastImageProcessorKwargs], + **kwargs: Unpack[TvpImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(videos, **kwargs) diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 94b1565c9a22..1be71aea63e2 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -48,7 +48,6 @@ class UdopProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": {}, } @@ -85,8 +84,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[UdopProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/video_llava/video_processing_video_llava.py b/src/transformers/models/video_llava/video_processing_video_llava.py index 1e5deb543654..371a419d4a36 100644 --- a/src/transformers/models/video_llava/video_processing_video_llava.py +++ b/src/transformers/models/video_llava/video_processing_video_llava.py @@ -15,13 +15,9 @@ """Video processor class for Video-LLaVA.""" from ...image_utils import 
OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling -from ...processing_utils import Unpack, VideosKwargs from ...video_processing_utils import BaseVideoProcessor -class VideoLlavaFastVideoProcessorInitKwargs(VideosKwargs): ... - - class VideoLlavaVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -35,11 +31,6 @@ class VideoLlavaVideoProcessor(BaseVideoProcessor): do_normalize = True do_convert_rgb = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models - valid_kwargs = VideoLlavaFastVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - - def __init__(self, **kwargs: Unpack[VideoLlavaFastVideoProcessorInitKwargs]): - super().__init__(**kwargs) __all__ = ["VideoLlavaVideoProcessor"] diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index c7013e660332..bb29e1d1ee30 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -35,6 +35,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging from ...utils.import_utils import requires @@ -46,6 +47,10 @@ logger = logging.get_logger(__name__) +class ViltImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + def max_across_indices(values: Iterable[Any]) -> list[Any]: """ Return the maximum value across all indices of an iterable of values. @@ -162,6 +167,7 @@ class ViltImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = ViltImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/vilt/image_processing_vilt_fast.py b/src/transformers/models/vilt/image_processing_vilt_fast.py index 6926b655ce45..11537f74656d 100644 --- a/src/transformers/models/vilt/image_processing_vilt_fast.py +++ b/src/transformers/models/vilt/image_processing_vilt_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, get_max_height_width, group_images_by_shape, reorder_images, @@ -32,6 +31,7 @@ TensorType, auto_docstring, ) +from .image_processing_vilt import ViltImageProcessorKwargs # Set maximum size based on the typical aspect ratio of the COCO dataset @@ -39,19 +39,6 @@ MAX_SHORTER_EDGE = 800 -class ViltFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - size_divisor (`int`, *optional*, defaults to 32): - The size to make the height and width divisible by. - rescale_factor (`float`, *optional*, defaults to 1/255): - The factor to rescale the image by. 
- """ - - size_divisor: Optional[int] - rescale_factor: Optional[float] - - @auto_docstring class ViltImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -65,7 +52,7 @@ class ViltImageProcessorFast(BaseImageProcessorFast): do_pad = True default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = ViltFastImageProcessorKwargs + valid_kwargs = ViltImageProcessorKwargs def _preprocess( self, diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index f4f9fc9a746d..5b5126ad4a85 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -17,17 +17,11 @@ """ import warnings -from typing import Optional -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin - - -class ViltImagesKwargs(ImagesKwargs): - size_divisor: Optional[int] +from ...processing_utils import ProcessingKwargs, ProcessorMixin class ViltProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: ViltImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte.py b/src/transformers/models/vitmatte/image_processing_vitmatte.py index 87b6d2662ef4..95933c053ce5 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging from ...utils.deprecation import deprecate_kwarg @@ -40,6 +41,10 @@ logger = logging.get_logger(__name__) +class VitMatteImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + class VitMatteImageProcessor(BaseImageProcessor): r""" Constructs a ViTMatte image processor. @@ -68,6 +73,7 @@ class VitMatteImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = VitMatteImageProcessorKwargs def __init__( self, @@ -107,7 +113,7 @@ def size_divisibility(self, value): def pad_image( self, image: np.ndarray, - size_divisibility: int = 32, + size_divisor: int = 32, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: @@ -115,7 +121,7 @@ def pad_image( Args: image (`np.ndarray`): Image to pad. - size_divisibility (`int`, *optional*, defaults to 32): + size_divisor (`int`, *optional*, defaults to 32): The width and height of the image will be padded to be divisible by this number. data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): The channel dimension format for the output image. 
Can be one of: @@ -134,8 +140,8 @@ def pad_image( height, width = get_image_size(image, input_data_format) - pad_height = 0 if height % size_divisibility == 0 else size_divisibility - height % size_divisibility - pad_width = 0 if width % size_divisibility == 0 else size_divisibility - width % size_divisibility + pad_height = 0 if height % size_divisor == 0 else size_divisor - height % size_divisor + pad_width = 0 if width % size_divisor == 0 else size_divisor - width % size_divisor if pad_width + pad_height > 0: padding = ((0, pad_height), (0, pad_width)) image = pad(image, padding=padding, data_format=data_format, input_data_format=input_data_format) @@ -265,7 +271,7 @@ def preprocess( if do_pad: images = [ - self.pad_image(image, size_divisibility=size_divisor, input_data_format=input_data_format) + self.pad_image(image, size_divisor=size_divisor, input_data_format=input_data_format) for image in images ] diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py b/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py index c5a7256a612b..dd09b987090d 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -40,20 +39,12 @@ filter_out_non_signature_kwargs, logging, ) +from .image_processing_vitmatte import VitMatteImageProcessorKwargs logger = logging.get_logger(__name__) -class VitMatteFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - size_divisor (`int`, *optional*, defaults to 32): - The width and height of the image will be padded to be divisible by this number. - """ - - size_divisor: Optional[int] - - @auto_docstring class VitMatteImageProcessorFast(BaseImageProcessorFast): do_rescale: bool = True @@ -63,9 +54,9 @@ class VitMatteImageProcessorFast(BaseImageProcessorFast): image_std: Optional[Union[float, list[float]]] = IMAGENET_STANDARD_STD do_pad: bool = True size_divisor: int = 32 - valid_kwargs = VitMatteFastImageProcessorKwargs + valid_kwargs = VitMatteImageProcessorKwargs - def __init__(self, **kwargs: Unpack[VitMatteFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[VitMatteImageProcessorKwargs]) -> None: size_divisibility = kwargs.pop("size_divisibility", None) kwargs.setdefault("size_divisor", size_divisibility) super().__init__(**kwargs) @@ -87,21 +78,21 @@ def size_divisibility(self, value): def _pad_image( self, images: torch.Tensor, - size_divisibility: int = 32, + size_divisor: int = 32, ) -> torch.Tensor: """ - Pads an image or batched images constantly so that width and height are divisible by size_divisibility + Pads an image or batched images constantly so that width and height are divisible by size_divisor Args: image (`torch.Tensor`): Image to pad. - size_divisibility (`int`, *optional*, defaults to 32): + size_divisor (`int`, *optional*, defaults to 32): The width and height of the image will be padded to be divisible by this number. 
""" height, width = get_image_size(images, channel_dim=ChannelDimension.FIRST) - pad_height = 0 if height % size_divisibility == 0 else size_divisibility - height % size_divisibility - pad_width = 0 if width % size_divisibility == 0 else size_divisibility - width % size_divisibility + pad_height = 0 if height % size_divisor == 0 else size_divisor - height % size_divisor + pad_width = 0 if width % size_divisor == 0 else size_divisor - width % size_divisor if pad_width + pad_height > 0: padding = (0, 0, pad_width, pad_height) @@ -114,7 +105,7 @@ def preprocess( self, images: list["torch.Tensor"], trimaps: list["torch.Tensor"], - **kwargs: Unpack[VitMatteFastImageProcessorKwargs], + **kwargs: Unpack[VitMatteImageProcessorKwargs], ) -> BatchFeature: r""" trimaps (`list[torch.Tensor]`): @@ -129,7 +120,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[VitMatteFastImageProcessorKwargs], + **kwargs: Unpack[VitMatteImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/vjepa2/video_processing_vjepa2.py b/src/transformers/models/vjepa2/video_processing_vjepa2.py index 3a5f5509ba6b..ad806ba4cc3d 100644 --- a/src/transformers/models/vjepa2/video_processing_vjepa2.py +++ b/src/transformers/models/vjepa2/video_processing_vjepa2.py @@ -19,9 +19,6 @@ from ...video_processing_utils import BaseVideoProcessor -class VJEPA2VideoProcessorInitKwargs(VideosKwargs): ... - - class VJEPA2VideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BILINEAR image_mean = IMAGENET_DEFAULT_MEAN @@ -32,10 +29,8 @@ class VJEPA2VideoProcessor(BaseVideoProcessor): do_rescale = True do_center_crop = True do_normalize = True - valid_kwargs = VJEPA2VideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - def __init__(self, **kwargs: Unpack[VJEPA2VideoProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[VideosKwargs]): crop_size = kwargs.get("crop_size", 256) if not isinstance(crop_size, int): if not isinstance(crop_size, dict) or "height" not in crop_size: diff --git a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py index 1b812ba60a4b..124835e35338 100644 --- a/src/transformers/models/voxtral/processing_voxtral.py +++ b/src/transformers/models/voxtral/processing_voxtral.py @@ -198,9 +198,8 @@ def apply_chat_template( ) text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] + return_tensors = text_kwargs.get("return_tensors", None) - return_tensors = common_kwargs.pop("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") @@ -270,16 +269,10 @@ def __call__( f"{self.audio_token} is present in the provided text which is not supported by VoxtralProcessor. Please use the `apply_chat_template` method instead." 
) - output_kwargs = self._merge_kwargs( - VoxtralProcessorKwargs, - **kwargs, - ) - text_kwargs = output_kwargs["text_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - out = self.tokenizer(text, **text_kwargs) + output_kwargs = self._merge_kwargs(VoxtralProcessorKwargs, **kwargs) + out = self.tokenizer(text, **output_kwargs["text_kwargs"]) - return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None)) + return BatchFeature(data=out, tensor_type=output_kwargs["text_kwargs"].get("return_tensors", None)) # TODO: @eustlb, this should be moved to mistral_common + testing def apply_transcription_request( @@ -327,7 +320,6 @@ def apply_transcription_request( ) text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] is_str = isinstance(audio, str) is_list_of_str = all(isinstance(el, str) for el in audio) @@ -344,15 +336,14 @@ def apply_transcription_request( ) sampling_rate = audio_kwargs["sampling_rate"] - return_dict = common_kwargs.pop("return_dict", False) - tokenize = common_kwargs.pop("tokenize", False) # make sure to remove from text_kwargs and audio_kwargs - for k in ("return_dict", "tokenize"): - text_kwargs.pop(k, None) - audio_kwargs.pop(k, None) + return_dict = text_kwargs.pop("return_dict", False) + tokenize = text_kwargs.pop("tokenize", False) + _ = audio_kwargs.pop("return_dict", False) + _ = audio_kwargs.pop("tokenize", False) - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = text_kwargs.pop("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index 1dc382d6f68a..ee8e3abd195d 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -76,8 +76,6 @@ def __call__( self, audio: Optional[AudioInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - images=None, - videos=None, **kwargs: Unpack[Wav2Vec2ProcessorKwargs], ): """ @@ -112,7 +110,6 @@ def __call__( audio, **output_kwargs["audio_kwargs"], **output_kwargs["text_kwargs"], - **output_kwargs["common_kwargs"], ) if audio is not None: diff --git a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py index ead53edb101a..fc95fc04c754 100644 --- a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py @@ -73,8 +73,6 @@ def __call__( self, audio: Optional[AudioInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - images=None, - videos=None, **kwargs: Unpack[Wav2Vec2BertProcessorKwargs], ): """ diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 50da604db8d3..21aac76adac8 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -51,6 +51,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, is_scipy_available, @@ -80,6 +81,29 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class 
YolosImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + # Copied from transformers.models.detr.image_processing_detr.get_max_height_width def get_max_height_width( images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None @@ -744,6 +768,7 @@ class YolosImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = YolosImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/yolos/image_processing_yolos_fast.py b/src/transformers/models/yolos/image_processing_yolos_fast.py index 59bb3868e75e..fc1f1852862f 100644 --- a/src/transformers/models/yolos/image_processing_yolos_fast.py +++ b/src/transformers/models/yolos/image_processing_yolos_fast.py @@ -14,7 +14,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -35,28 +34,11 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires +from .image_processing_yolos import YolosImageProcessorKwargs logger = logging.get_logger(__name__) - -class YolosFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. 
- """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -320,9 +302,9 @@ class YolosImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = YolosFastImageProcessorKwargs + valid_kwargs = YolosImageProcessorKwargs - def __init__(self, **kwargs: Unpack[YolosFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[YolosImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -584,25 +566,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[YolosFastImageProcessorKwargs], + **kwargs: Unpack[YolosImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -617,7 +582,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index 1ef2b8a59ec1..e8ad44dd76c3 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -61,6 +62,25 @@ logger = logging.get_logger(__name__) +class ZoeDepthImageProcessorKwargs(ImagesKwargs): + """ + keep_aspect_ratio (`bool`, *optional*, defaults to `True`): + If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it + for both dimensions. This ensures that the image is scaled down as little as possible while still fitting + within the desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a + size that is a multiple of this value by flooring the height and width to the nearest multiple of this value. 
+ Can be overridden by `keep_aspect_ratio` in `preprocess`. + ensure_multiple_of (`int`, *optional*, defaults to 32): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring + the height and width to the nearest multiple of this value. + Works both with and without `keep_aspect_ratio` being set to `True`. + Can be overridden by `ensure_multiple_of` in `preprocess`. + """ + + keep_aspect_ratio: Optional[bool] + ensure_multiple_of: Optional[int] + + def get_resize_output_image_size( input_image: np.ndarray, output_size: Union[int, Iterable[int]], @@ -145,6 +165,7 @@ class ZoeDepthImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = ZoeDepthImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth_fast.py b/src/transformers/models/zoedepth/image_processing_zoedepth_fast.py index 045dbfdacd4d..852ee161aff1 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth_fast.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth_fast.py @@ -28,7 +28,6 @@ ) from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -48,32 +47,13 @@ logging, requires_backends, ) -from .image_processing_zoedepth import get_resize_output_image_size +from .image_processing_zoedepth import ZoeDepthImageProcessorKwargs, get_resize_output_image_size from .modeling_zoedepth import ZoeDepthDepthEstimatorOutput logger = logging.get_logger(__name__) -class ZoeDepthFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - keep_aspect_ratio (`bool`, *optional*, defaults to `True`): - If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it - for both dimensions. This ensures that the image is scaled down as little as possible while still fitting - within the desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a - size that is a multiple of this value by flooring the height and width to the nearest multiple of this value. - Can be overridden by `keep_aspect_ratio` in `preprocess`. - ensure_multiple_of (`int`, *optional*, defaults to 32): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring - the height and width to the nearest multiple of this value. - Works both with and without `keep_aspect_ratio` being set to `True`. - Can be overridden by `ensure_multiple_of` in `preprocess`. 
- """ - - keep_aspect_ratio: Optional[bool] - ensure_multiple_of: Optional[int] - - @auto_docstring class ZoeDepthImageProcessorFast(BaseImageProcessorFast): do_pad = True @@ -86,16 +66,16 @@ class ZoeDepthImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR keep_aspect_ratio = True ensure_multiple_of = 1 / 32 - valid_kwargs = ZoeDepthFastImageProcessorKwargs + valid_kwargs = ZoeDepthImageProcessorKwargs - def __init__(self, **kwargs: Unpack[ZoeDepthFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[ZoeDepthImageProcessorKwargs]) -> None: super().__init__(**kwargs) @auto_docstring def preprocess( self, images: ImageInput, - **kwargs: Unpack[ZoeDepthFastImageProcessorKwargs], + **kwargs: Unpack[ZoeDepthImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 952bc65ce706..e7786d1ba61d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -72,6 +72,8 @@ if is_torch_available(): + import torch + from .modeling_utils import PreTrainedAudioTokenizerBase @@ -156,6 +158,7 @@ class TextKwargs(TypedDict, total=False): verbose: Optional[bool] padding_side: Optional[str] return_mm_token_type_ids: Optional[bool] + return_tensors: Optional[Union[str, TensorType]] class ImagesKwargs(TypedDict, total=False): @@ -164,6 +167,8 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: + do_convert_rgb (`bool`, *optional*): + Whether to convert the image to RGB format. do_resize (`bool`, *optional*): Whether to resize the image. size (`dict[str, int]`, *optional*): @@ -183,7 +188,7 @@ class methods and docstrings. image_std (`float` or `list[float]`, *optional*): Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): - Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + Whether to pad the images in the batch. pad_size (`dict[str, int]`, *optional*): The size `{"height": int, "width" int}` to pad the images to. do_center_crop (`bool`, *optional*): @@ -192,10 +197,13 @@ class methods and docstrings. The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. - device (`str`, *optional*): + device (`Union[str, torch.device]`, *optional*): The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. + disable_grouping (`bool`, *optional*): + Whether to group images by shapes when processing or not, only relevant for fast image processing. """ + do_convert_rgb: Optional[bool] do_resize: Optional[bool] size: Optional[dict[str, int]] crop_size: Optional[dict[str, int]] @@ -210,7 +218,9 @@ class methods and docstrings. do_center_crop: Optional[bool] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional[str] + device: Optional[Union[str, "torch.device"]] + disable_grouping: Optional[bool] + return_tensors: Optional[Union[str, TensorType]] class VideosKwargs(TypedDict, total=False): @@ -240,6 +250,8 @@ class VideosKwargs(TypedDict, total=False): Standard deviation to use if normalizing the video. do_center_crop (`bool`, *optional*): Whether to center crop the video. + do_pad (`bool`, *optional*): + Whether to pad the images in the batch.
@@ -240,6 +250,8 @@ class VideosKwargs(TypedDict, total=False):
             Standard deviation to use if normalizing the video.
         do_center_crop (`bool`, *optional*):
             Whether to center crop the video.
+        do_pad (`bool`, *optional*):
+            Whether to pad the videos in the batch.
         do_sample_frames (`bool`, *optional*):
             Whether to sample frames from the video before processing or to process the whole video.
         video_metadata (`Union[VideoMetadata, dict]`, *optional*):
@@ -254,6 +266,8 @@ class VideosKwargs(TypedDict, total=False):
             The channel dimension format for the output video.
         input_data_format (`ChannelDimension` or `str`, *optional*):
             The channel dimension format for the input video.
+        device (`Union[str, torch.device]`, *optional*):
+            The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast video processing.
         return_metadata (`ChannelDimension` or `str`, *optional*):
             Whether to return video metadata or not.
     """
@@ -269,15 +283,17 @@ class VideosKwargs(TypedDict, total=False):
     image_mean: Optional[Union[float, list[float]]]
     image_std: Optional[Union[float, list[float]]]
     do_center_crop: Optional[bool]
+    do_pad: Optional[bool]
     crop_size: Optional[dict[str, int]]
     data_format: Optional[ChannelDimension]
     input_data_format: Optional[Union[str, ChannelDimension]]
-    device: Optional[str]
+    device: Optional[Union[str, "torch.device"]]
     do_sample_frames: Optional[bool]
     video_metadata: Optional[Union[VideoMetadata, dict]]
     fps: Optional[Union[int, float]]
     num_frames: Optional[int]
     return_metadata: Optional[bool]
+    return_tensors: Optional[Union[str, TensorType]]


 class AudioKwargs(TypedDict, total=False):
@@ -317,9 +333,6 @@ class AudioKwargs(TypedDict, total=False):
     truncation: Optional[bool]
     pad_to_multiple_of: Optional[int]
     return_attention_mask: Optional[bool]
-
-
-class CommonKwargs(TypedDict, total=False):
     return_tensors: Optional[Union[str, TensorType]]


@@ -364,9 +377,6 @@ class CustomProcessorKwargs(ProcessingKwargs, total=False):

     _defaults = {}

-    common_kwargs: CommonKwargs = {
-        **CommonKwargs.__annotations__,
-    }
     text_kwargs: TextKwargs = {
         **TextKwargs.__annotations__,
     }
@@ -1245,7 +1255,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
             "images_kwargs": {},
             "audio_kwargs": {},
             "videos_kwargs": {},
-            "common_kwargs": {},
         }

         default_kwargs = {
@@ -1253,7 +1262,13 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
             "images_kwargs": {},
             "audio_kwargs": {},
             "videos_kwargs": {},
-            "common_kwargs": {},
+        }
+
+        map_preprocessor_kwargs = {
+            "text_kwargs": "tokenizer",
+            "images_kwargs": "image_processor",
+            "audio_kwargs": "feature_extractor",
+            "videos_kwargs": "video_processor",
         }

         possible_modality_keywords = {"text", "audio", "videos", "images"}
@@ -1262,8 +1277,22 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
         # get defaults from set model processor kwargs if they exist
         for modality in default_kwargs:
             default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
+            # Some preprocessors define a set of accepted "valid_kwargs" (currently only vision).
+            # In those cases, we don't declare a `ModalityKwargs` attribute in the TypedDict.
+            # Instead, we dynamically obtain the kwargs from the preprocessor and merge them
+            # with the general kwargs set. This ensures consistency between preprocessor and
+            # processor classes, and helps prevent accidental mismatches.
+            modality_valid_kwargs = set(ModelProcessorKwargs.__annotations__[modality].__annotations__)
+            if modality in map_preprocessor_kwargs:
+                preprocessor = getattr(self, map_preprocessor_kwargs[modality], None)
+                preprocessor_valid_kwargs = (
+                    getattr(preprocessor, "valid_kwargs", None) if preprocessor is not None else None
+                )
+                modality_valid_kwargs.update(
+                    set(preprocessor_valid_kwargs.__annotations__ if preprocessor_valid_kwargs is not None else [])
+                )
             # update defaults with arguments from tokenizer init
-            for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__:
+            for modality_key in modality_valid_kwargs:
                 # init with tokenizer init kwargs if necessary
                 if tokenizer_init_kwargs is not None and modality_key in tokenizer_init_kwargs:
                     value = (
@@ -1279,7 +1308,16 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
         # update modality kwargs with passed kwargs
         non_modality_kwargs = set(kwargs) - set(output_kwargs)
         for modality, output_kwarg in output_kwargs.items():
-            for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__:
+            modality_valid_kwargs = set(ModelProcessorKwargs.__annotations__[modality].__annotations__)
+            if modality in map_preprocessor_kwargs:
+                preprocessor = getattr(self, map_preprocessor_kwargs[modality], None)
+                preprocessor_valid_kwargs = (
+                    getattr(preprocessor, "valid_kwargs", None) if preprocessor is not None else None
+                )
+                modality_valid_kwargs.update(
+                    set(preprocessor_valid_kwargs.__annotations__ if preprocessor_valid_kwargs is not None else [])
+                )
+            for modality_key in modality_valid_kwargs:
                 # check if we received a structured kwarg dict or not to handle it correctly
                 if modality in kwargs:
                     kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
@@ -1311,17 +1349,18 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
         else:
             # kwargs is a flat dictionary
             for key, kwarg in kwargs.items():
-                if key not in used_keys:
-                    if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__:
-                        output_kwargs["common_kwargs"][key] = kwarg
-                    elif key not in possible_modality_keywords:
-                        logger.warning_once(
-                            f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
-                        )
+                if key not in used_keys and key not in possible_modality_keywords:
+                    logger.warning_once(
+                        f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
+                    )
+
+        # For `common_kwargs` just update all modality-specific kwargs with same key/values
+        common_kwargs = kwargs.get("common_kwargs", {})
+        common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {}))
+        if common_kwargs:
+            for kwarg in output_kwargs.values():
+                kwarg.update(common_kwargs)

-        # all modality-specific kwargs are updated with common kwargs
-        for kwarg in output_kwargs.values():
-            kwarg.update(output_kwargs["common_kwargs"])
         return output_kwargs

     @classmethod
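# A small standalone sketch (hypothetical toy classes, not the transformers implementation) of the
# merging rule above: the processor collects the keys of its image processor's `valid_kwargs`
# TypedDict and treats them as valid `images_kwargs`, keeping preprocessor and processor in sync.
from typing import Optional, TypedDict


class ToyImagesKwargs(TypedDict, total=False):  # stands in for ImagesKwargs
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


class ToyImageProcessor:  # stands in for an image processor exposing `valid_kwargs`
    valid_kwargs = ToyImagesKwargs


def collect_valid_keys(processor_annotations: dict, preprocessor) -> set:
    # Start from the keys declared on the processor's own kwargs TypedDict ...
    valid = set(processor_annotations)
    # ... and merge in whatever the attached preprocessor declares as `valid_kwargs`.
    preprocessor_kwargs = getattr(preprocessor, "valid_kwargs", None)
    if preprocessor_kwargs is not None:
        valid.update(preprocessor_kwargs.__annotations__)
    return valid


print(collect_valid_keys({"do_convert_rgb": Optional[bool]}, ToyImageProcessor()))
# contains 'do_convert_rgb', 'do_resize' and 'size'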
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index ac72c6617d7b..cd3b9a18b1c5 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -913,7 +913,7 @@ def add_special_tokens(
                 Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                 assign the index of the `unk_token` to them).
-            replace_additional_special_tokens (`bool`, *optional*,, defaults to `True`):
+            replace_additional_special_tokens (`bool`, *optional*, defaults to `True`):
                 If `True`, the existing list of additional special tokens will be replaced by the list provided in
                 `special_tokens_dict`. Otherwise, `self._special_tokens_map["additional_special_tokens"]` is just extended. In the former
                 case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
diff --git a/src/transformers/utils/auto_docstring.py b/src/transformers/utils/auto_docstring.py
index 15882c6b63cc..135f20bf4cf9 100644
--- a/src/transformers/utils/auto_docstring.py
+++ b/src/transformers/utils/auto_docstring.py
@@ -102,6 +102,13 @@ class ImageProcessorArgs:
         "shape": None,
     }

+    size_divisor = {
+        "description": """
+        The size by which both the height and width must be divisible.
+        """,
+        "shape": None,
+    }
+
     default_to_square = {
         "description": """
         Whether to default to a square image when resizing, if size is an int.
diff --git a/tests/models/got_ocr2/test_image_processing_got_ocr2.py b/tests/models/got_ocr2/test_image_processing_got_ocr2.py
index 4228ffe4dcba..a9420a671094 100644
--- a/tests/models/got_ocr2/test_image_processing_got_ocr2.py
+++ b/tests/models/got_ocr2/test_image_processing_got_ocr2.py
@@ -44,7 +44,6 @@ def __init__(
         do_resize=True,
         size=None,
         do_normalize=True,
-        do_pad=False,
         image_mean=[0.48145466, 0.4578275, 0.40821073],
         image_std=[0.26862954, 0.26130258, 0.27577711],
         do_convert_rgb=True,
@@ -62,7 +61,6 @@ def __init__(
         self.do_normalize = do_normalize
         self.image_mean = image_mean
         self.image_std = image_std
-        self.do_pad = do_pad
         self.do_convert_rgb = do_convert_rgb

     def prepare_image_processor_dict(self):
@@ -73,7 +71,6 @@ def prepare_image_processor_dict(self):
             "image_mean": self.image_mean,
             "image_std": self.image_std,
             "do_convert_rgb": self.do_convert_rgb,
-            "do_pad": self.do_pad,
         }

     def expected_output_image_shape(self, images):
diff --git a/tests/utils/test_add_new_model_like.py b/tests/utils/test_add_new_model_like.py
index 5ba84bab5501..9796d67cc5f6 100644
--- a/tests/utils/test_add_new_model_like.py
+++ b/tests/utils/test_add_new_model_like.py
@@ -473,8 +473,8 @@ def test_phi4_with_all_processors(self):
         )
         from ..phi4_multimodal.feature_extraction_phi4_multimodal import Phi4MultimodalFeatureExtractor
         from ..phi4_multimodal.image_processing_phi4_multimodal_fast import (
-            Phi4MultimodalFastImageProcessorKwargs,
             Phi4MultimodalImageProcessorFast,
+            Phi4MultimodalImageProcessorKwargs,
         )
         from ..phi4_multimodal.modeling_phi4_multimodal import (
             Phi4MultimodalAttention,
@@ -643,7 +643,7 @@ class MyTest2ForCausalLM(Phi4MultimodalForCausalLM):
             pass


-        class MyTest2FastImageProcessorKwargs(Phi4MultimodalFastImageProcessorKwargs):
+        class MyTest2ImageProcessorKwargs(Phi4MultimodalImageProcessorKwargs):
             pass


diff --git a/utils/check_modular_conversion.py b/utils/check_modular_conversion.py
index 41650aedd0e2..a566025d2e1c 100644
--- a/utils/check_modular_conversion.py
+++ b/utils/check_modular_conversion.py
@@ -30,8 +30,8 @@ def process_file(
     file_type="modeling_",
     show_diff=True,
 ):
-    file_name_prefix = file_type.split("*")[0]
-    file_name_suffix = file_type.split("*")[-1] if "*" in file_type else ""
+    file_name_prefix = file_type.split(".*")[0]
+    file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
     file_path = modular_file_path.replace("modular_", f"{file_name_prefix}_").replace(".py", f"{file_name_suffix}.py")
     # Read the actual modeling file
     with open(file_path, "r", encoding="utf-8") as modeling_file:
open(file_path, "r", encoding="utf-8") as modeling_file: diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index e2a19aa611b9..18c3a729368b 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -499,6 +499,7 @@ def augmented_dependencies_for_class_node( "configuration", "tokenization", "processing", + "image_processing.*_fast", "image_processing", "video_processing", "feature_extraction", @@ -538,7 +539,7 @@ def visit_ImportFrom(self, node): to be added (because it will be part of the imports)""" import_module = self.python_module.code_for_node(node.module) import_statement = "." * len(node.relative) + import_module - if re.search(rf"^\.({self.match_patterns})_.*", import_statement): + if re.search(rf"^\.({self.match_patterns}).*", import_statement): for imported_object in node.names: # If an alias is present, we record it and not the original name if imported_object.evaluated_alias is not None: @@ -1056,10 +1057,11 @@ def replace_class_node( "Tokenizer": "tokenization", "Processor": "processing", "ImageProcessor": "image_processing", - "ImageProcessorFast": "image_processing*_fast", # "*" indicates where to insert the model name before the "_fast" suffix + "ImageProcessorFast": "image_processing.*_fast", # "*" indicates where to insert the model name before the "_fast" suffix "VideoProcessor": "video_processing", "VideoProcessorInitKwargs": "video_processing", - "FastImageProcessorKwargs": "image_processing*_fast", + "FastImageProcessorKwargs": "image_processing.*_fast", + "ImageProcessorKwargs": "image_processing", "FeatureExtractor": "feature_extraction", "ProcessorKwargs": "processing", "VideosKwargs": "processing", @@ -1208,7 +1210,7 @@ def visit_ImportFrom(self, node: cst.ImportFrom) -> None: if m.matches(node.module, m.Attribute()): for imported_ in node.names: _import = re.search( - rf"(?:transformers\.models\.)|(?:\.\.\.models\.)|(?:\.\.)\w+\.({self.match_patterns})_.*", + rf"(?:transformers\.models\.)|(?:\.\.\.models\.)|(?:\.\.)\w+\.({self.match_patterns}).*", import_statement, ) if _import: @@ -1257,7 +1259,7 @@ def visit_SimpleStatementLine(self, node): import_module = self.python_module.code_for_node(node.body[0].module) import_statement = "." 
             if not (
-                re.search(rf"(?:transformers\.models\.)|(?:\.\.)\w+\.({self.match_patterns})_.*", import_statement)
+                re.search(rf"(?:transformers\.models\.)|(?:\.\.)\w+\.({self.match_patterns}).*", import_statement)
                 and not any(import_to_skip in import_statement for import_to_skip in IMPORTS_TO_SKIP_IN_MODULAR)
             ):
                 self.imports.append(node)
@@ -1320,7 +1322,7 @@ def leave_Module(self, node):
         # Note that we may visit several of the same file types, thus we save them per file type, not file
         self.imported_objects_per_file = defaultdict(set)
         for file, mapper in self.visited_modules.items():
-            file_type = re.search(rf"^transformers\.models\.\w+\.({self.match_patterns})_.*", file).group(1)
+            file_type = re.search(rf"^transformers\.models\.\w+\.({self.match_patterns})", file).group(1)
             self.imported_objects_per_file[file_type].update(mapper.objects_imported_from_modeling)

     def merge_model_specific_imports(self, visited_modules):
@@ -1716,8 +1718,8 @@ def convert_modular_file(modular_file: str) -> dict[str, str]:
 def save_modeling_files(modular_file: str, converted_files: dict[str, str]):
     """Save all the `converted_files` from the `modular_file`."""
     for file_type in converted_files:
-        file_name_prefix = file_type.split("*")[0]
-        file_name_suffix = file_type.split("*")[-1] if "*" in file_type else ""
+        file_name_prefix = file_type.split(".*")[0]
+        file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
         new_file_name = modular_file.replace("modular_", f"{file_name_prefix}_").replace(
             ".py", f"{file_name_suffix}.py"
         )
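# A small standalone sketch (not part of the diff) of the file-type naming convention used above:
# the ".*" marker in a file type such as "image_processing.*_fast" is where the model name is
# inserted, and the prefix/suffix split mirrors the logic in process_file/save_modeling_files.
def expand_file_type(modular_file: str, file_type: str) -> str:
    # Split the file type around the ".*" placeholder, exactly like the utilities above.
    file_name_prefix = file_type.split(".*")[0]
    file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
    return modular_file.replace("modular_", f"{file_name_prefix}_").replace(".py", f"{file_name_suffix}.py")


print(expand_file_type("modular_zoedepth.py", "image_processing.*_fast"))
# image_processing_zoedepth_fast.py
print(expand_file_type("modular_zoedepth.py", "image_processing"))
# image_processing_zoedepth.py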