diff --git a/docs/source/en/auto_docstring.md b/docs/source/en/auto_docstring.md index e6c753419978..97a1c7d0ac92 100644 --- a/docs/source/en/auto_docstring.md +++ b/docs/source/en/auto_docstring.md @@ -292,7 +292,7 @@ The `@auto_docstring` decorator automatically generates docstrings by: 8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentations from the `TypedDict` and adds each parameter to the function's docstring. - Currently only supported for [`FastImageProcessorKwargs`]. + Currently only supported for [`ImagesKwargs`]. ## Best practices diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index 52b798c09f84..3227b08cf031 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -20,7 +20,8 @@ from .image_processing_base import BatchFeature, ImageProcessingMixin from .image_transforms import center_crop, normalize, rescale -from .image_utils import ChannelDimension, get_image_size +from .image_utils import ChannelDimension, ImageInput, get_image_size +from .processing_utils import ImagesKwargs, Unpack from .utils import logging from .utils.import_utils import requires @@ -36,6 +37,8 @@ @requires(backends=("vision",)) class BaseImageProcessor(ImageProcessingMixin): + valid_kwargs = ImagesKwargs + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -46,9 +49,9 @@ def is_fast(self) -> bool: """ return False - def __call__(self, images, **kwargs) -> BatchFeature: + def __call__(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature: """Preprocess an image or a batch of images.""" - return self.preprocess(images, **kwargs) + return self.preprocess(images, *args, **kwargs) def preprocess(self, images, **kwargs) -> BatchFeature: raise NotImplementedError("Each image processor must implement its own preprocess method") diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 4dfa7f08b0db..a9f6900a1046 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,7 +15,7 @@ from collections.abc import Iterable from copy import deepcopy from functools import lru_cache, partial -from typing import Any, Optional, TypedDict, Union +from typing import Any, Optional, Union import numpy as np @@ -40,7 +40,7 @@ validate_kwargs, validate_preprocess_arguments, ) -from .processing_utils import Unpack +from .processing_utils import ImagesKwargs, Unpack from .utils import ( TensorType, auto_docstring, @@ -163,28 +163,6 @@ def divide_to_patches( return patches -class DefaultFastImageProcessorKwargs(TypedDict, total=False): - do_resize: Optional[bool] - size: Optional[dict[str, int]] - default_to_square: Optional[bool] - resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] - do_center_crop: Optional[bool] - crop_size: Optional[dict[str, int]] - do_rescale: Optional[bool] - rescale_factor: Optional[Union[int, float]] - do_normalize: Optional[bool] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] - do_pad: Optional[bool] - pad_size: Optional[dict[str, int]] - do_convert_rgb: Optional[bool] - return_tensors: Optional[Union[str, TensorType]] - data_format: 
Optional[ChannelDimension] - input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional["torch.device"] - disable_grouping: Optional[bool] - - @auto_docstring class BaseImageProcessorFast(BaseImageProcessor): resample = None @@ -206,10 +184,10 @@ class BaseImageProcessorFast(BaseImageProcessor): input_data_format = None device = None model_input_names = ["pixel_values"] - valid_kwargs = DefaultFastImageProcessorKwargs + valid_kwargs = ImagesKwargs unused_kwargs = None - def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[ImagesKwargs]): super().__init__(**kwargs) kwargs = self.filter_out_unused_kwargs(kwargs) size = kwargs.pop("size", self.size) @@ -728,11 +706,8 @@ def _validate_preprocess_kwargs( data_format=data_format, ) - def __call__(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: - return self.preprocess(images, *args, **kwargs) - @auto_docstring - def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, *args, **kwargs: Unpack[ImagesKwargs]) -> BatchFeature: # args are not validated, but their order in the `preprocess` and `_preprocess` signatures must be the same validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_kwargs_names) # Set default kwargs from self. This ensures that if a kwarg is not provided @@ -765,7 +740,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[DefaultFastImageProcessorKwargs], + **kwargs: Unpack[ImagesKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. 
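# --- Illustrative sketch (not part of the patch) of the pattern this refactor standardizes on ---
# With `DefaultFastImageProcessorKwargs` removed, a fast image processor that needs extra
# options subclasses `ImagesKwargs` from `processing_utils`, points `valid_kwargs` at it, and
# types `__init__`/`preprocess` with `Unpack[...]` so `@auto_docstring` can unroll the documented
# kwargs. The names `ExampleImageProcessorKwargs`/`ExampleImageProcessorFast` and the `sharpness`
# option are hypothetical and do not correspond to any model touched in this diff.
from typing import Optional

from transformers.image_processing_utils import BatchFeature
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from transformers.image_utils import ImageInput
from transformers.processing_utils import ImagesKwargs, Unpack
from transformers.utils import auto_docstring


class ExampleImageProcessorKwargs(ImagesKwargs):
    r"""
    sharpness (`float`, *optional*):
        Hypothetical model-specific option; documenting it here lets `@auto_docstring`
        surface it in the generated `preprocess` docstring.
    """

    sharpness: Optional[float]


@auto_docstring
class ExampleImageProcessorFast(BaseImageProcessorFast):
    do_resize = True
    size = {"height": 224, "width": 224}
    do_rescale = True
    do_normalize = False
    # Sharing one kwargs class between the slow and fast processors is what the diff below
    # does for BEiT, BridgeTower, ConvNext, the DETR family, etc.
    valid_kwargs = ExampleImageProcessorKwargs

    def __init__(self, **kwargs: Unpack[ExampleImageProcessorKwargs]):
        super().__init__(**kwargs)

    @auto_docstring
    def preprocess(self, images: ImageInput, **kwargs: Unpack[ExampleImageProcessorKwargs]) -> BatchFeature:
        # A real implementation would also override `_preprocess` to consume `sharpness`.
        return super().preprocess(images, **kwargs)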
diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 749a4c036ed1..e0c2b67fcc90 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -959,8 +959,6 @@ def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], images: Optional[ImageInput] = None, - audio=None, - videos=None, **kwargs: Unpack[AriaProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index 9264776e80fd..976d2b983ee9 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -85,8 +85,6 @@ def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], images: Optional[ImageInput] = None, - audio=None, - videos=None, **kwargs: Unpack[AriaProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index aaede4e8e80e..882a85d40946 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -19,18 +19,11 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput, make_flat_list_of_images -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -class AyaVisionImagesKwargs(ImagesKwargs, total=False): - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - - class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: AyaVisionImagesKwargs _defaults = { "text_kwargs": { "padding_side": "left", @@ -140,8 +133,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[AyaVisionProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 984eac3bf67e..f65709168379 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -54,6 +55,17 @@ logger = logging.get_logger(__name__) +class BeitImageProcessorKwargs(ImagesKwargs): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
+ """ + + do_reduce_labels: Optional[bool] + + @requires(backends=("vision",)) class BeitImageProcessor(BaseImageProcessor): r""" @@ -99,6 +111,7 @@ class BeitImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = BeitImageProcessorKwargs @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS) def __init__( diff --git a/src/transformers/models/beit/image_processing_beit_fast.py b/src/transformers/models/beit/image_processing_beit_fast.py index 7ff894127ecd..5d89120283a5 100644 --- a/src/transformers/models/beit/image_processing_beit_fast.py +++ b/src/transformers/models/beit/image_processing_beit_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -40,17 +39,7 @@ TensorType, auto_docstring, ) - - -class BeitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: Optional[bool] +from .image_processing_beit import BeitImageProcessorKwargs @auto_docstring @@ -66,9 +55,9 @@ class BeitImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_reduce_labels = False - valid_kwargs = BeitFastImageProcessorKwargs + valid_kwargs = BeitImageProcessorKwargs - def __init__(self, **kwargs: Unpack[BeitFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[BeitImageProcessorKwargs]): super().__init__(**kwargs) def reduce_label(self, labels: list["torch.Tensor"]): @@ -86,7 +75,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[BeitFastImageProcessorKwargs], + **kwargs: Unpack[BeitImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -101,7 +90,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[BeitFastImageProcessorKwargs], + **kwargs: Unpack[BeitImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. 
diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 4ac741f84f46..7fc154f3483e 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -36,7 +36,6 @@ class BlipProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": {}, } @@ -67,8 +66,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[BlipProcessorKwargs], ) -> BatchEncoding: """ diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 71f79583c77e..abbbeb6ae0a4 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -41,7 +41,6 @@ class Blip2ProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": {}, } @@ -81,8 +80,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Blip2ProcessorKwargs], ) -> BatchEncoding: """ diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 75b4e2b4238c..cad23d02893f 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -35,6 +35,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -122,6 +123,10 @@ def get_resize_output_image_size( return new_height, new_width +class BridgeTowerImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + class BridgeTowerImageProcessor(BaseImageProcessor): r""" Constructs a BridgeTower image processor. @@ -169,6 +174,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = BridgeTowerImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py b/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py index 5be6f9f6c54b..76a76b4b0a47 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower_fast.py @@ -23,7 +23,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, ImageInput, SizeDict, TensorType, @@ -33,6 +32,7 @@ ) from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling from ...utils import auto_docstring +from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs def make_pixel_mask( @@ -85,17 +85,6 @@ def get_resize_output_image_size( return new_height, new_width -class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - size_divisor (`int`, *optional*, defaults to 32): - The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` - is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. 
- """ - - size_divisor: Optional[int] - - @auto_docstring class BridgeTowerImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -110,14 +99,14 @@ class BridgeTowerImageProcessorFast(BaseImageProcessorFast): do_normalize = True do_pad = True size_divisor = 32 - valid_kwargs = BridgeTowerFastImageProcessorKwargs + valid_kwargs = BridgeTowerImageProcessorKwargs model_input_names = ["pixel_values", "pixel_mask"] - def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 6d7059c4c5a5..030c578c49cd 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -16,17 +16,10 @@ Processor class for BridgeTower. """ -from typing import Optional - -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin - - -class BridgeTowerImagesKwargs(ImagesKwargs): - size_divisor: Optional[int] +from ...processing_utils import ProcessingKwargs, ProcessorMixin class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: BridgeTowerImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index bf4441c00a2e..247f72322a2d 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -92,8 +92,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[ChameleonProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py index 322e98dbd0f5..358d84ac6d7c 100644 --- a/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +++ b/src/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py @@ -27,18 +27,13 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - group_images_by_shape, - reorder_images, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs): """ crop_to_patches (`bool`, *optional*, defaults to `False`): Whether to crop the image to patches. 
Can be overridden by the `crop_to_patches` parameter in the diff --git a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py index 7ef20305b99e..2b7867d0eae3 100644 --- a/src/transformers/models/cohere2_vision/modular_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/modular_cohere2_vision.py @@ -30,8 +30,10 @@ from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast from ...cache_utils import Cache +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TransformersKwargs, auto_docstring, logging from ...utils.generic import check_model_inputs from .configuration_cohere2_vision import Cohere2VisionConfig @@ -301,6 +303,24 @@ def get_optimal_tiled_canvas( return best_grid +class Cohere2VisionFastImageProcessorKwargs(ImagesKwargs): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
+ """ + + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + @auto_docstring class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast): size = {"height": 512, "width": 512} @@ -308,6 +328,14 @@ class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast): max_patches = 12 crop_to_patches = True patch_size = 16 + valid_kwargs = Cohere2VisionFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @auto_docstring + def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) __all__ = [ diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py index cde77af658bc..d4fcec4da875 100644 --- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -19,16 +19,11 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -class Cohere2VisionImagesKwargs(ImagesKwargs, total=False): - max_patches: Optional[int] - - class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Cohere2VisionImagesKwargs _defaults = { "text_kwargs": { "padding_side": "left", diff --git a/src/transformers/models/colpali/modular_colpali.py b/src/transformers/models/colpali/modular_colpali.py index 8136f560f18e..176b3e6a15ee 100644 --- a/src/transformers/models/colpali/modular_colpali.py +++ b/src/transformers/models/colpali/modular_colpali.py @@ -90,8 +90,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[ColPaliProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index 1d76a74e1ab8..032cc70d4482 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -131,8 +131,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[ColPaliProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/colqwen2/modular_colqwen2.py b/src/transformers/models/colqwen2/modular_colqwen2.py index a9a1f8ce3e1e..adea1617e459 100644 --- a/src/transformers/models/colqwen2/modular_colqwen2.py +++ b/src/transformers/models/colqwen2/modular_colqwen2.py @@ -93,8 +93,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[ColQwen2ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index e8f7e057247c..2eb9fed873a8 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ 
b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -94,8 +94,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[ColQwen2ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index cf506b834918..163224edb34f 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -53,6 +53,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, is_scipy_available, @@ -774,6 +775,29 @@ def compute_segments( return segmentation, segments +class ConditionalDetrImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. 
+ """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + @requires(backends=("vision",)) class ConditionalDetrImageProcessor(BaseImageProcessor): r""" @@ -829,6 +853,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = ConditionalDetrImageProcessorKwargs # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ def __init__( diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py index 351d4fa1470f..4c5b8602c0cc 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr_fast.py @@ -15,7 +15,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -37,6 +36,7 @@ from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires from .image_processing_conditional_detr import ( + ConditionalDetrImageProcessorKwargs, compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, @@ -46,24 +46,6 @@ logger = logging.get_logger(__name__) - -class ConditionalDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the CONDITIONAL_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. 
- """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -278,9 +260,9 @@ class ConditionalDetrImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = ConditionalDetrFastImageProcessorKwargs + valid_kwargs = ConditionalDetrImageProcessorKwargs - def __init__(self, **kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[ConditionalDetrImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -542,25 +524,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[ConditionalDetrFastImageProcessorKwargs], + **kwargs: Unpack[ConditionalDetrImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -575,7 +540,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index ae0be69a5621..d2e180de2464 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -38,6 +38,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging from ...utils.import_utils import requires @@ -49,6 +50,16 @@ logger = logging.get_logger(__name__) +class ConvNextImageProcessorKwargs(ImagesKwargs): + """ + crop_pct (`float`, *optional*): + Percentage of the image to crop. Only has an effect if size < 384. Can be + overridden by `crop_pct` in the`preprocess` method. 
+ """ + + crop_pct: Optional[float] + + @requires(backends=("vision",)) class ConvNextImageProcessor(BaseImageProcessor): r""" @@ -87,6 +98,7 @@ class ConvNextImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = ConvNextImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/convnext/image_processing_convnext_fast.py b/src/transformers/models/convnext/image_processing_convnext_fast.py index 3ab00c0fd091..035b92f8b7d2 100644 --- a/src/transformers/models/convnext/image_processing_convnext_fast.py +++ b/src/transformers/models/convnext/image_processing_convnext_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -39,16 +38,7 @@ TensorType, auto_docstring, ) - - -class ConvNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - crop_pct (`float`, *optional*): - Percentage of the image to crop. Only has an effect if size < 384. Can be - overridden by `crop_pct` in the`preprocess` method. - """ - - crop_pct: Optional[float] +from .image_processing_convnext import ConvNextImageProcessorKwargs @auto_docstring @@ -62,13 +52,13 @@ class ConvNextImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True crop_pct = 224 / 256 - valid_kwargs = ConvNextFastImageProcessorKwargs + valid_kwargs = ConvNextImageProcessorKwargs - def __init__(self, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[ConvNextImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[ConvNextImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index cbf7e44aa8d3..172016f6431d 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -246,9 +246,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = text_kwargs.get("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py index 45f4fd2bdb93..c41ac586753e 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl.py @@ -38,6 +38,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -48,6 +49,16 @@ logger = logging.get_logger(__name__) +class DeepseekVLImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. 
+ """ + + min_size: int + + class DeepseekVLImageProcessor(BaseImageProcessor): r""" Constructs a DEEPSEEK_VL image processor. @@ -90,6 +101,8 @@ class DeepseekVLImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] + valid_kwargs = DeepseekVLImageProcessorKwargs + def __init__( self, do_resize: bool = True, diff --git a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py index 896e91f0692c..6eaa15d827d9 100644 --- a/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +++ b/src/transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py @@ -24,25 +24,11 @@ import torch.nn.functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - group_images_by_shape, - reorder_images, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling, SizeDict from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class DeepseekVLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - """ - - min_size: int +from .image_processing_deepseek_vl import DeepseekVLImageProcessorKwargs @auto_docstring @@ -56,9 +42,9 @@ class DeepseekVLImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_pad = True - valid_kwargs = DeepseekVLFastImageProcessorKwargs + valid_kwargs = DeepseekVLImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DeepseekVLFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DeepseekVLImageProcessorKwargs]): super().__init__(**kwargs) if kwargs.get("image_mean") is None: background_color = (127, 127, 127) diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py index 241c12923bdb..8b93f7fa6c94 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py @@ -39,6 +39,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -49,6 +50,32 @@ logger = logging.get_logger(__name__) +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. 
+ high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + """ + + min_size: int + high_res_size: dict + high_res_resample: "PILImageResampling" + high_res_image_mean: list[float] + high_res_image_std: list[float] + + class DeepseekVLHybridImageProcessor(BaseImageProcessor): r""" Constructs a DEEPSEEK_VL_HYBRID image processor. @@ -102,6 +129,7 @@ class DeepseekVLHybridImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "high_res_pixel_values"] + valid_kwargs = DeepseekVLHybridImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py index c04e006e358d..ff5e7f2e3c73 100644 --- a/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +++ b/src/transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py @@ -26,7 +26,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, get_size_dict, group_images_by_shape, reorder_images, @@ -41,32 +40,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): - Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` - method. - high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be - overridden by the `high_res_resample` parameter in the `preprocess` method. - high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): - Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. - high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): - Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. 
- """ - - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] +from .image_processing_deepseek_vl_hybrid import DeepseekVLHybridImageProcessorKwargs @auto_docstring @@ -80,14 +54,14 @@ class DeepseekVLHybridImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_pad = True - valid_kwargs = DeepseekVLHybridFastImageProcessorKwargs + valid_kwargs = DeepseekVLHybridImageProcessorKwargs high_res_image_mean = OPENAI_CLIP_MEAN high_res_image_std = OPENAI_CLIP_STD high_res_size = {"height": 1024, "width": 1024} high_res_resample = PILImageResampling.BICUBIC model_input_names = ["pixel_values", "high_res_pixel_values"] - def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]): if kwargs.get("image_mean") is None: background_color = (127, 127, 127) else: diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 18b416a57df2..1507c9f3d028 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -22,7 +22,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, get_size_dict, group_images_by_shape, reorder_images, @@ -43,7 +42,7 @@ valid_images, validate_preprocess_arguments, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...tokenization_utils_base import ( PreTokenizedInput, TextInput, @@ -430,6 +429,32 @@ def prepare_inputs_for_generation( return model_inputs +class DeepseekVLHybridImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): + Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` + method. + high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `high_res_resample` parameter in the `preprocess` method. + high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. + high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): + Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. + """ + + min_size: int + high_res_size: dict + high_res_resample: "PILImageResampling" + high_res_image_mean: list[float] + high_res_image_std: list[float] + + class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): r""" Constructs a DEEPSEEK_VL_HYBRID image processor. 
@@ -483,6 +508,7 @@ class DeepseekVLHybridImageProcessor(DeepseekVLImageProcessor): """ model_input_names = ["pixel_values", "high_res_pixel_values"] + valid_kwargs = DeepseekVLHybridImageProcessorKwargs def __init__( self, @@ -727,32 +753,6 @@ def preprocess( return BatchFeature(data=data, tensor_type=return_tensors) -class DeepseekVLHybridFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - high_res_size (`dict`, *optional*, defaults to `{"height": 1024, "width": 1024}`): - Size of the high resolution output image after resizing. Can be overridden by the `high_res_size` parameter in the `preprocess` - method. - high_res_resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be - overridden by the `high_res_resample` parameter in the `preprocess` method. - high_res_image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): - Mean to use if normalizing the high resolution image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `high_res_image_mean` parameter in the `preprocess` method. - high_res_image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`): - Standard deviation to use if normalizing the high resolution image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `high_res_image_std` parameter in the `preprocess` method. - """ - - min_size: int - high_res_size: dict - high_res_resample: "PILImageResampling" - high_res_image_mean: list[float] - high_res_image_std: list[float] - - class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast): high_res_image_mean = OPENAI_CLIP_MEAN high_res_image_std = OPENAI_CLIP_STD @@ -760,7 +760,7 @@ class DeepseekVLHybridImageProcessorFast(DeepseekVLImageProcessorFast): high_res_resample = PILImageResampling.BICUBIC model_input_names = ["pixel_values", "high_res_pixel_values"] - def __init__(self, **kwargs: Unpack[DeepseekVLHybridFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DeepseekVLHybridImageProcessorKwargs]): if kwargs.get("image_mean") is None: background_color = (127, 127, 127) else: diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index ef028eda1ed1..8249c079f5fa 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -53,6 +53,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, is_scipy_available, @@ -79,6 +80,30 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +class DeformableDetrImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. 
Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -827,6 +852,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = DeformableDetrImageProcessorKwargs # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ def __init__( diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py index 8458d02d58a5..916ad3dee0e6 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py @@ -14,7 +14,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -35,29 +34,11 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires -from .image_processing_deformable_detr import get_size_with_aspect_ratio +from .image_processing_deformable_detr import DeformableDetrImageProcessorKwargs, get_size_with_aspect_ratio logger = logging.get_logger(__name__) - -class DeformableDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the DEFORMABLE_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. 
- """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -272,9 +253,9 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = DeformableDetrFastImageProcessorKwargs + valid_kwargs = DeformableDetrImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[DeformableDetrImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -536,25 +517,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[DeformableDetrFastImageProcessorKwargs], + **kwargs: Unpack[DeformableDetrImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -569,7 +533,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 7a2e67f83de6..5e0622601ac9 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -52,6 +52,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, is_scipy_available, @@ -82,6 +83,29 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class DetrImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. 
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + # From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76 def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -811,6 +835,7 @@ class DetrImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = DetrImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index a2ac8d03eed3..190d01ab5590 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -28,7 +28,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -54,6 +53,7 @@ ) from ...utils.import_utils import requires from .image_processing_detr import ( + DetrImageProcessorKwargs, compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, @@ -263,23 +263,6 @@ def prepare_coco_panoptic_annotation( return new_target -class DetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. 
- """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - @auto_docstring @requires(backends=("torchvision", "torch")) class DetrImageProcessorFast(BaseImageProcessorFast): @@ -294,9 +277,9 @@ class DetrImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = DetrFastImageProcessorKwargs + valid_kwargs = DetrImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[DetrImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -558,25 +541,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[DetrFastImageProcessorKwargs], + **kwargs: Unpack[DetrImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. 
- """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -591,7 +557,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 402f5152a64b..812a4149cb3f 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -111,9 +111,7 @@ def __call__( text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = text_kwargs.get("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index f49cc964080d..5af365099724 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging from ...utils.import_utils import is_vision_available, requires @@ -51,6 +52,18 @@ import PIL +class DonutImageProcessorKwargs(ImagesKwargs): + """ + do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`): + Whether to resize the image using thumbnail method. + do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`): + Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. + """ + + do_thumbnail: Optional[bool] + do_align_long_axis: Optional[bool] + + @requires(backends=("vision",)) class DonutImageProcessor(BaseImageProcessor): r""" @@ -90,6 +103,7 @@ class DonutImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = DonutImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/donut/image_processing_donut_fast.py b/src/transformers/models/donut/image_processing_donut_fast.py index 29e06831b1b4..9a150f4df75f 100644 --- a/src/transformers/models/donut/image_processing_donut_fast.py +++ b/src/transformers/models/donut/image_processing_donut_fast.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack @@ -28,24 +28,12 @@ auto_docstring, logging, ) +from .image_processing_donut import DonutImageProcessorKwargs logger = logging.get_logger(__name__) -class DonutFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - do_thumbnail (`bool`, *optional*, defaults to `self.do_thumbnail`): - Whether to resize the image using thumbnail method. 
- do_align_long_axis (`bool`, *optional*, defaults to `self.do_align_long_axis`): - Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. - """ - - do_thumbnail: Optional[bool] - do_align_long_axis: Optional[bool] - - @auto_docstring class DonutImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -58,9 +46,9 @@ class DonutImageProcessorFast(BaseImageProcessorFast): do_thumbnail = True do_align_long_axis = False do_pad = True - valid_kwargs = DonutFastImageProcessorKwargs + valid_kwargs = DonutImageProcessorKwargs - def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DonutImageProcessorKwargs]): size = kwargs.pop("size", None) if isinstance(size, (tuple, list)): size = size[::-1] @@ -68,7 +56,7 @@ def __init__(self, **kwargs: Unpack[DonutFastImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[DonutImageProcessorKwargs]) -> BatchFeature: if "size" in kwargs: size = kwargs.pop("size") if isinstance(size, (tuple, list)): diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index c75e2fcaa542..a545c90539b9 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -74,8 +74,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[DonutProcessorKwargs], ): """ diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 0ec3eaed1c43..3ba5a6e30c21 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -44,6 +44,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -63,6 +64,26 @@ logger = logging.get_logger(__name__) +class DPTImageProcessorKwargs(ImagesKwargs): + """ + ensure_multiple_of (`int`, *optional*, defaults to 1): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden + by `ensure_multiple_of` in `preprocess`. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can + be overridden by `keep_aspect_ratio` in `preprocess`. + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
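+    size_divisor (`int`, *optional*):
+        If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
+        DINOv2 paper, which uses the model in combination with DPT.
+
+    Example (illustrative sketch; `Intel/dpt-large` is a placeholder for any DPT checkpoint):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import AutoImageProcessor
+
+    >>> image = np.zeros((480, 640, 3), dtype=np.uint8)
+    >>> processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
+    >>> inputs = processor(image, keep_aspect_ratio=True, ensure_multiple_of=32, return_tensors="pt")
+    >>> list(inputs.keys())
+    ['pixel_values']
+    ```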
+ """ + + ensure_multiple_of: Optional[int] + size_divisor: Optional[int] + keep_aspect_ratio: Optional[bool] + do_reduce_labels: Optional[bool] + + def get_resize_output_image_size( input_image: np.ndarray, output_size: Union[int, Iterable[int]], @@ -151,6 +172,7 @@ class DPTImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = DPTImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/dpt/image_processing_dpt_fast.py b/src/transformers/models/dpt/image_processing_dpt_fast.py index 892ddd7c3d6f..ba0a6d28c56c 100644 --- a/src/transformers/models/dpt/image_processing_dpt_fast.py +++ b/src/transformers/models/dpt/image_processing_dpt_fast.py @@ -28,7 +28,7 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_base import BatchFeature -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import ( IMAGENET_STANDARD_MEAN, @@ -41,35 +41,13 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, requires_backends +from .image_processing_dpt import DPTImageProcessorKwargs if TYPE_CHECKING: from ...modeling_outputs import DepthEstimatorOutput -class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - ensure_multiple_of (`int`, *optional*, defaults to 1): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden - by `ensure_multiple_of` in `preprocess`. - size_divisor (`int`, *optional*): - If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the - DINOv2 paper, which uses the model in combination with DPT. - keep_aspect_ratio (`bool`, *optional*, defaults to `False`): - If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can - be overridden by `keep_aspect_ratio` in `preprocess`. - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. 
- """ - - ensure_multiple_of: Optional[int] - size_divisor: Optional[int] - keep_aspect_ratio: Optional[bool] - do_reduce_labels: Optional[bool] - - def get_resize_output_image_size( input_image: "torch.Tensor", output_size: Union[int, Iterable[int]], @@ -123,13 +101,13 @@ class DPTImageProcessorFast(BaseImageProcessorFast): do_normalize = True do_reduce_labels = None - valid_kwargs = DPTFastImageProcessorKwargs + valid_kwargs = DPTImageProcessorKwargs do_pad = False rescale_factor = 1 / 255 ensure_multiple_of = 1 keep_aspect_ratio = False - def __init__(self, **kwargs: Unpack[DPTFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[DPTImageProcessorKwargs]): super().__init__(**kwargs) def reduce_label(self, labels: list["torch.Tensor"]): @@ -147,7 +125,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[DPTFastImageProcessorKwargs], + **kwargs: Unpack[DPTImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -162,7 +140,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[DPTFastImageProcessorKwargs], + **kwargs: Unpack[DPTImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/dpt/modular_dpt.py b/src/transformers/models/dpt/modular_dpt.py index 34eb08f39b68..241d8de122b2 100644 --- a/src/transformers/models/dpt/modular_dpt.py +++ b/src/transformers/models/dpt/modular_dpt.py @@ -21,7 +21,7 @@ import torch from ...image_processing_base import BatchFeature -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import ( IMAGENET_STANDARD_MEAN, @@ -35,6 +35,7 @@ requires_backends, ) from ..beit.image_processing_beit_fast import BeitImageProcessorFast +from .image_processing_dpt import DPTImageProcessorKwargs if TYPE_CHECKING: @@ -82,29 +83,6 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): return SizeDict(height=new_height, width=new_width) -class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - ensure_multiple_of (`int`, *optional*, defaults to 1): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden - by `ensure_multiple_of` in `preprocess`. - size_divisor (`int`, *optional*): - If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the - DINOv2 paper, which uses the model in combination with DPT. - keep_aspect_ratio (`bool`, *optional*, defaults to `False`): - If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can - be overridden by `keep_aspect_ratio` in `preprocess`. - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. 
- """ - - ensure_multiple_of: Optional[int] - size_divisor: Optional[int] - keep_aspect_ratio: Optional[bool] - do_reduce_labels: Optional[bool] - - @auto_docstring class DPTImageProcessorFast(BeitImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -123,7 +101,7 @@ class DPTImageProcessorFast(BeitImageProcessorFast): do_center_crop = None do_reduce_labels = None - valid_kwargs = DPTFastImageProcessorKwargs + valid_kwargs = DPTImageProcessorKwargs def resize( self, diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py index 5b87278683ac..d1beabb6c2b9 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr.py @@ -34,6 +34,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging, requires_backends @@ -49,6 +50,15 @@ logger = logging.get_logger(__name__) +class EfficientLoFTRImageProcessorKwargs(ImagesKwargs): + r""" + do_grayscale (`bool`, *optional*, defaults to `True`): + Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. + """ + + do_grayscale: Optional[bool] = True + + # Copied from transformers.models.superpoint.image_processing_superpoint.is_grayscale def is_grayscale( image: np.ndarray, @@ -155,6 +165,7 @@ class EfficientLoFTRImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = EfficientLoFTRImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py index 1463ef405f37..994a10f04ee1 100644 --- a/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +++ b/src/transformers/models/efficientloftr/image_processing_efficientloftr_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -40,6 +39,7 @@ TensorType, auto_docstring, ) +from .image_processing_efficientloftr import EfficientLoFTRImageProcessorKwargs if TYPE_CHECKING: @@ -108,15 +108,6 @@ def convert_to_grayscale( return F.rgb_to_grayscale(image, num_output_channels=3) -class EfficientLoFTRFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_grayscale (`bool`, *optional*, defaults to `True`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. 
- """ - - do_grayscale: Optional[bool] = True - - @auto_docstring class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -126,13 +117,13 @@ class EfficientLoFTRImageProcessorFast(BaseImageProcessorFast): do_rescale = True rescale_factor = 1 / 255 do_normalize = None - valid_kwargs = EfficientLoFTRFastImageProcessorKwargs + valid_kwargs = EfficientLoFTRImageProcessorKwargs - def __init__(self, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientLoFTRImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _prepare_images_structure( diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet.py b/src/transformers/models/efficientnet/image_processing_efficientnet.py index eaad420b31f8..f5a69eff70e4 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -43,6 +44,18 @@ logger = logging.get_logger(__name__) +class EfficientNetImageProcessorKwargs(ImagesKwargs): + """ + rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`): + Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range]. + include_top (`bool`, *optional*, defaults to `self.include_top`): + Normalize the image again with the standard deviation only for image classification if set to True. + """ + + rescale_offset: bool + include_top: bool + + class EfficientNetImageProcessor(BaseImageProcessor): r""" Constructs a EfficientNet image processor. @@ -83,6 +96,7 @@ class EfficientNetImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = EfficientNetImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py index 77e787614a10..5f3439aaa273 100644 --- a/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py +++ b/src/transformers/models/efficientnet/image_processing_efficientnet_fast.py @@ -20,7 +20,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack @@ -28,19 +28,7 @@ TensorType, auto_docstring, ) - - -class EfficientNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - rescale_offset (`bool`, *optional*, defaults to `self.rescale_offset`): - Whether to rescale the image between [-max_range/2, scale_range/2] instead of [0, scale_range]. 
- include_top (`bool`, *optional*, defaults to `self.include_top`): - Normalize the image again with the standard deviation only for image classification if set to True. - """ - - rescale_offset: bool - include_top: bool +from .image_processing_efficientnet import EfficientNetImageProcessorKwargs @auto_docstring @@ -57,9 +45,9 @@ class EfficientNetImageProcessorFast(BaseImageProcessorFast): rescale_offset = False do_normalize = True include_top = True - valid_kwargs = EfficientNetFastImageProcessorKwargs + valid_kwargs = EfficientNetImageProcessorKwargs - def __init__(self, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[EfficientNetImageProcessorKwargs]): super().__init__(**kwargs) def rescale( @@ -195,7 +183,7 @@ def _preprocess( return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[EfficientNetImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index b876a9de96bf..fca5316a3fca 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -46,6 +47,11 @@ logger = logging.get_logger(__name__) +class Emu3ImageProcessorKwargs(ImagesKwargs): + ratio: Optional[str] + image_area: Optional[int] + + def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -108,6 +114,7 @@ class Emu3ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_sizes"] + valid_kwargs = Emu3ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index ef2681d2385b..b7ed8e9074f0 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -20,7 +20,7 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_vision_available @@ -33,14 +33,8 @@ class Emu3TextKwargs(TextKwargs, total=False): return_for_image_generation: bool -class Emu3ImagesKwargs(ImagesKwargs, total=False): - ratio: str - image_area: int - - class Emu3ProcessorKwargs(ProcessingKwargs, total=False): text_kwargs: Emu3TextKwargs - images_kwargs: Emu3ImagesKwargs _defaults = { "text_kwargs": { "return_for_image_generation": False, @@ -95,8 +89,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[Emu3ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/eomt/image_processing_eomt.py b/src/transformers/models/eomt/image_processing_eomt.py 
index b8e0058eee8d..189aaaf41d4d 100644 --- a/src/transformers/models/eomt/image_processing_eomt.py +++ b/src/transformers/models/eomt/image_processing_eomt.py @@ -36,6 +36,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -53,6 +54,21 @@ import torch.nn.functional as F +class EomtImageProcessorKwargs(ImagesKwargs): + """ + do_split_image (`bool`, *optional*, defaults to `False`): + Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the + input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. + Otherwise, the input images will be padded to the target size. + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + """ + + do_split_image: bool + ignore_index: Optional[int] = None + + # Adapted from transformers.models.maskformer.image_processing_maskformer.convert_segmentation_map_to_binary_masks def convert_segmentation_map_to_binary_masks( segmentation_map: np.ndarray, diff --git a/src/transformers/models/eomt/image_processing_eomt_fast.py b/src/transformers/models/eomt/image_processing_eomt_fast.py index ca80231d3a76..68fd7bb00744 100644 --- a/src/transformers/models/eomt/image_processing_eomt_fast.py +++ b/src/transformers/models/eomt/image_processing_eomt_fast.py @@ -24,7 +24,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -43,6 +42,7 @@ filter_out_non_signature_kwargs, ) from .image_processing_eomt import ( + EomtImageProcessorKwargs, compute_segments, convert_segmentation_map_to_binary_masks, get_size_with_aspect_ratio, @@ -50,25 +50,6 @@ ) -class EomtImageProcessorFastKwargs(DefaultFastImageProcessorKwargs): - """ - do_split_image (`bool`, *optional*, defaults to `False`): - Whether to split the input images into overlapping patches for semantic segmentation. If set to `True`, the - input images will be split into patches of size `size["shortest_edge"]` with an overlap between patches. - Otherwise, the input images will be padded to the target size. - do_pad (`bool`, *optional*, defaults to `False`): - Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest - number of patches in the batch. Padding will be applied to the bottom and right with zeros. - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. 
- """ - - do_split_image: bool - do_pad: bool - ignore_index: Optional[int] = None - - def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]: """Returns the height and width from a size dict.""" target_height = size_dict["shortest_edge"] @@ -102,9 +83,9 @@ class EomtImageProcessorFast(BaseImageProcessorFast): do_split_image = False do_pad = False ignore_index = None - valid_kwargs = EomtImageProcessorFastKwargs + valid_kwargs = EomtImageProcessorKwargs - def __init__(self, **kwargs: Unpack[EomtImageProcessorFastKwargs]): + def __init__(self, **kwargs: Unpack[EomtImageProcessorKwargs]): super().__init__(**kwargs) def _split_image(self, images: torch.Tensor, size: dict, image_indices: int) -> tuple[list, list]: @@ -153,7 +134,7 @@ def preprocess( images: ImageInput, segmentation_maps: Optional[list[torch.Tensor]] = None, instance_id_to_semantic_id: Optional[dict[int, int]] = None, - **kwargs: Unpack[EomtImageProcessorFastKwargs], + **kwargs: Unpack[EomtImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -171,7 +152,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[EomtImageProcessorFastKwargs], + **kwargs: Unpack[EomtImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index 9d67ac841124..3c19a2405169 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging from ...utils.import_utils import requires @@ -56,6 +57,89 @@ LOGIT_LAPLACE_EPS: float = 0.1 +class FlavaImageProcessorKwargs(ImagesKwargs): + """ + return_image_mask (`bool`, *optional*, defaults to `False`): + Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`. + input_size_patches (`int`, *optional*, defaults to 14): + Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden + by the `input_size_patches` parameter in `preprocess`. + total_mask_patches (`int`, *optional*, defaults to 75): + Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in + `preprocess`. + mask_group_min_patches (`int`, *optional*, defaults to 16): + Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches` + parameter in `preprocess`. + mask_group_max_patches (`int`, *optional*): + Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches` + parameter in `preprocess`. + mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3): + Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter + in `preprocess`. + mask_group_max_aspect_ratio (`float`, *optional*): + Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter + in `preprocess`. + return_codebook_pixels (`bool`, *optional*, defaults to `False`): + Whether to return the codebook pixel values. 
+    codebook_do_resize (`bool`, *optional*, defaults to `True`):
+        Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
+        `codebook_do_resize` parameter in `preprocess`.
+    codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+        Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
+        `preprocess`.
+    codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+        Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
+        parameter in `preprocess`.
+    codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
+        Whether to crop the input for codebook at the center. If the input size is smaller than
+        `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
+        overridden by the `codebook_do_center_crop` parameter in `preprocess`.
+    codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+        Desired output size for codebook input when applying center-cropping. Can be overridden by the
+        `codebook_crop_size` parameter in `preprocess`.
+    codebook_do_rescale (`bool`, *optional*, defaults to `True`):
+        Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
+        overridden by the `codebook_do_rescale` parameter in `preprocess`.
+    codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+        Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
+        `codebook_rescale_factor` parameter in `preprocess`.
+    codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
+        Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
+        `codebook_do_map_pixels` parameter in `preprocess`.
+    codebook_do_normalize (`bool`, *optional*, defaults to `True`):
+        Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
+        be overridden by the `codebook_do_normalize` parameter in `preprocess`.
+    codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
+        The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
+        by the `codebook_image_mean` parameter in `preprocess`.
+    codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+        The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
+        be overridden by the `codebook_image_std` parameter in `preprocess`.
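+
+    Example (illustrative sketch; `facebook/flava-full` is a placeholder checkpoint):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import AutoImageProcessor
+
+    >>> image = np.zeros((224, 224, 3), dtype=np.uint8)
+    >>> processor = AutoImageProcessor.from_pretrained("facebook/flava-full")
+    >>> outputs = processor(image, return_image_mask=True, return_codebook_pixels=True, return_tensors="pt")
+    >>> sorted(outputs.keys())
+    ['bool_masked_pos', 'codebook_pixel_values', 'pixel_values']
+    ```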
+ """ + + # Mask related params + return_image_mask: Optional[bool] + input_size_patches: Optional[int] + total_mask_patches: Optional[int] + mask_group_min_patches: Optional[int] + mask_group_max_patches: Optional[int] + mask_group_min_aspect_ratio: Optional[float] + mask_group_max_aspect_ratio: Optional[float] + # Codebook related params + return_codebook_pixels: Optional[bool] + codebook_do_resize: Optional[bool] + codebook_size: Optional[bool] + codebook_resample: Optional[int] + codebook_do_center_crop: Optional[bool] + codebook_crop_size: Optional[int] + codebook_do_rescale: Optional[bool] + codebook_rescale_factor: Optional[Union[int, float]] + codebook_do_map_pixels: Optional[bool] + codebook_do_normalize: Optional[bool] + codebook_image_mean: Optional[Union[float, Iterable[float]]] + codebook_image_std: Optional[Union[float, Iterable[float]]] + + # Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py class FlavaMaskingGenerator: def __init__( @@ -225,6 +309,7 @@ class FlavaImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = FlavaImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/flava/image_processing_flava_fast.py b/src/transformers/models/flava/image_processing_flava_fast.py index 732d25e71f69..0dfbd07f17a7 100644 --- a/src/transformers/models/flava/image_processing_flava_fast.py +++ b/src/transformers/models/flava/image_processing_flava_fast.py @@ -16,7 +16,6 @@ import math import random -from collections.abc import Iterable from functools import lru_cache from typing import Any, Optional, Union @@ -26,7 +25,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, get_size_dict, ) from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images @@ -42,6 +40,7 @@ FLAVA_IMAGE_MEAN, FLAVA_IMAGE_STD, LOGIT_LAPLACE_EPS, + FlavaImageProcessorKwargs, ) @@ -121,90 +120,6 @@ def __call__(self): return mask -class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - return_image_mask (`bool`, *optional*, defaults to `False`): - Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`. - input_size_patches (`int`, *optional*, defaults to 14): - Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden - by the `input_size_patches` parameter in `preprocess`. - total_mask_patches (`int`, *optional*, defaults to 75): - Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in - `preprocess`. - mask_group_min_patches (`int`, *optional*, defaults to 16): - Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches` - parameter in `preprocess`. - mask_group_max_patches (`int`, *optional*): - Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches` - parameter in `preprocess`. - mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3): - Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter - in `preprocess`. - mask_group_max_aspect_ratio (`float`, *optional*): - Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter - in `preprocess`. 
- return_codebook_pixels (`bool`, *optional*, defaults to `False`): - Whether to return the codebook pixel values. - codebook_do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input for codebook to a certain. Can be overridden by the `codebook_do_resize` - parameter in `preprocess`. `codebook_size`. - codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): - Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in - `preprocess`. - codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`): - Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample` - parameter in `preprocess`. - codebook_do_center_crop (`bool`, *optional*, defaults to `True`): - Whether to crop the input for codebook at the center. If the input size is smaller than - `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be - overridden by the `codebook_do_center_crop` parameter in `preprocess`. - codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): - Desired output size for codebook input when applying center-cropping. Can be overridden by the - `codebook_crop_size` parameter in `preprocess`. - codebook_do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be - overridden by the `codebook_do_rescale` parameter in `preprocess`. - codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Defines the scale factor to use if rescaling the codebook image. Can be overridden by the - `codebook_rescale_factor` parameter in `preprocess`. - codebook_do_map_pixels (`bool`, *optional*, defaults to `True`): - Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the - `codebook_do_map_pixels` parameter in `preprocess`. - codebook_do_normalize (`bool`, *optional*, defaults to `True`): - Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can - be overridden by the `codebook_do_normalize` parameter in `preprocess`. - codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`): - The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden - by the `codebook_image_mean` parameter in `preprocess`. - codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`): - The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can - be overridden by the `codebook_image_std` parameter in `preprocess`. 
- """ - - # Mask related params - return_image_mask: Optional[bool] - input_size_patches: Optional[int] - total_mask_patches: Optional[int] - mask_group_min_patches: Optional[int] - mask_group_max_patches: Optional[int] - mask_group_min_aspect_ratio: Optional[float] - mask_group_max_aspect_ratio: Optional[float] - # Codebook related params - return_codebook_pixels: Optional[bool] - codebook_do_resize: Optional[bool] - codebook_size: Optional[bool] - codebook_resample: Optional[int] - codebook_do_center_crop: Optional[bool] - codebook_crop_size: Optional[int] - codebook_do_rescale: Optional[bool] - codebook_rescale_factor: Optional[Union[int, float]] - codebook_do_map_pixels: Optional[bool] - codebook_do_normalize: Optional[bool] - codebook_image_mean: Optional[Union[float, Iterable[float]]] - codebook_image_std: Optional[Union[float, Iterable[float]]] - - @auto_docstring class FlavaImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -239,13 +154,13 @@ class FlavaImageProcessorFast(BaseImageProcessorFast): codebook_do_normalize = True codebook_image_mean = FLAVA_CODEBOOK_MEAN codebook_image_std = FLAVA_CODEBOOK_STD - valid_kwargs = FlavaFastImageProcessorKwargs + valid_kwargs = FlavaImageProcessorKwargs - def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[FlavaImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[FlavaImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) @classmethod diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index ceebdb6efa49..8e8a806e8615 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -17,39 +17,8 @@ """ import warnings -from collections.abc import Iterable -from typing import Optional, Union -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin - - -class FlavaImagesKwargs(ImagesKwargs): - # Mask related params - return_image_mask: Optional[bool] - input_size_patches: Optional[int] - total_mask_patches: Optional[int] - mask_group_min_patches: Optional[int] - mask_group_max_patches: Optional[int] - mask_group_min_aspect_ratio: Optional[float] - mask_group_max_aspect_ratio: Optional[float] - # Codebook related params - return_codebook_pixels: Optional[bool] - codebook_do_resize: Optional[bool] - codebook_size: Optional[bool] - codebook_resample: Optional[int] - codebook_do_center_crop: Optional[bool] - codebook_crop_size: Optional[int] - codebook_do_rescale: Optional[bool] - codebook_rescale_factor: Optional[Union[int, float]] - codebook_do_map_pixels: Optional[bool] - codebook_do_normalize: Optional[bool] - codebook_image_mean: Optional[Union[float, Iterable[float]]] - codebook_image_std: Optional[Union[float, Iterable[float]]] - - -class FlavaProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: FlavaImagesKwargs - _defaults = {} +from ...processing_utils import ProcessorMixin class FlavaProcessor(ProcessorMixin): @@ -67,7 +36,6 @@ class FlavaProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "FlavaImageProcessor" tokenizer_class = ("BertTokenizer", "BertTokenizerFast") - valid_processor_kwargs = FlavaProcessorKwargs def __init__(self, 
image_processor=None, tokenizer=None, **kwargs): feature_extractor = None diff --git a/src/transformers/models/florence2/processing_florence2.py b/src/transformers/models/florence2/processing_florence2.py index 5ae0f4828bc1..1c25ddceeafc 100644 --- a/src/transformers/models/florence2/processing_florence2.py +++ b/src/transformers/models/florence2/processing_florence2.py @@ -39,7 +39,6 @@ class Florence2ProcessorKwargs(ProcessingKwargs, total=False): _defaults = { "text_kwargs": {"padding": False, "return_mm_token_type_ids": False}, - "images_kwargs": {}, } diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index debbcb23aac1..75b2bbad926e 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -71,7 +71,6 @@ class FuyuProcessorKwargs(ProcessingKwargs, total=False): "verbose": True, "return_mm_token_type_ids": False, }, - "images_kwargs": {}, } @@ -487,8 +486,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[FuyuProcessorKwargs], ) -> "FuyuBatchFeature": """ diff --git a/src/transformers/models/gemma3/image_processing_gemma3.py b/src/transformers/models/gemma3/image_processing_gemma3.py index efa65a6d2bf2..5206a13a04a3 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3.py +++ b/src/transformers/models/gemma3/image_processing_gemma3.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -50,6 +51,24 @@ import PIL +class Gemma3ImageProcessorKwargs(ImagesKwargs): + """ + do_pan_and_scan (`bool`, *optional*): + Whether to apply `pan_and_scan` to images. + pan_and_scan_min_crop_size (`int`, *optional*): + Minimum size of each crop in pan and scan. + pan_and_scan_max_num_crops (`int`, *optional*): + Maximum number of crops per image in pan and scan. + pan_and_scan_min_ratio_to_activate (`float`, *optional*): + Minimum aspect ratio to activate pan and scan. + """ + + do_pan_and_scan: Optional[bool] + pan_and_scan_min_crop_size: Optional[int] + pan_and_scan_max_num_crops: Optional[int] + pan_and_scan_min_ratio_to_activate: Optional[float] + + class Gemma3ImageProcessor(BaseImageProcessor): r""" Constructs a SigLIP image processor. @@ -91,6 +110,7 @@ class Gemma3ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "num_crops"] + valid_kwargs = Gemma3ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/gemma3/image_processing_gemma3_fast.py b/src/transformers/models/gemma3/image_processing_gemma3_fast.py index c61152bc6b22..bfb58be2a8e1 100644 --- a/src/transformers/models/gemma3/image_processing_gemma3_fast.py +++ b/src/transformers/models/gemma3/image_processing_gemma3_fast.py @@ -24,7 +24,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -35,29 +34,12 @@ auto_docstring, logging, ) +from .image_processing_gemma3 import Gemma3ImageProcessorKwargs logger = logging.get_logger(__name__) -class Gemma3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_pan_and_scan (`bool`, *optional*): - Whether to apply `pan_and_scan` to images. 
- pan_and_scan_min_crop_size (`int`, *optional*): - Minimum size of each crop in pan and scan. - pan_and_scan_max_num_crops (`int`, *optional*): - Maximum number of crops per image in pan and scan. - pan_and_scan_min_ratio_to_activate (`float`, *optional*): - Minimum aspect ratio to activate pan and scan. - """ - - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] - - @auto_docstring class Gemma3ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -73,9 +55,9 @@ class Gemma3ImageProcessorFast(BaseImageProcessorFast): pan_and_scan_min_crop_size = None pan_and_scan_max_num_crops = None pan_and_scan_min_ratio_to_activate = None - valid_kwargs = Gemma3FastImageProcessorKwargs + valid_kwargs = Gemma3ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Gemma3FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Gemma3ImageProcessorKwargs]): super().__init__(**kwargs) def pan_and_scan_batched( @@ -167,7 +149,7 @@ def _process_images_for_pan_and_scan( def preprocess( self, images: ImageInput, - **kwargs: Unpack[Gemma3FastImageProcessorKwargs], + **kwargs: Unpack[Gemma3ImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index 791c47833a4e..a9bac5b69e47 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -20,21 +20,12 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import to_py_obj -class Gemma3ImagesKwargs(ImagesKwargs): - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] - do_convert_rgb: Optional[bool] - - class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Gemma3ImagesKwargs _defaults = { "text_kwargs": { "padding": False, @@ -81,8 +72,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - videos=None, - audio=None, **kwargs: Unpack[Gemma3ProcessorKwargs], ) -> BatchFeature: if text is None and images is None: diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py index e2c2c3ae10f8..105b1983b7c7 100644 --- a/src/transformers/models/gemma3n/processing_gemma3n.py +++ b/src/transformers/models/gemma3n/processing_gemma3n.py @@ -19,21 +19,13 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import AudioKwargs, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -class Gemma3nImagesKwargs(ImagesKwargs): - do_convert_rgb: Optional[bool] - - class Gemma3nProcessorKwargs(ProcessingKwargs, total=False): - 
audio_kwargs: AudioKwargs - images_kwargs: Gemma3nImagesKwargs _defaults = { - "text_kwargs": { - "padding": False, - }, + "text_kwargs": {"padding": False}, } @@ -101,7 +93,6 @@ def __call__( images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, audio: Optional[Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]]] = None, - videos=None, **kwargs: Unpack[Gemma3nProcessorKwargs], ) -> BatchFeature: if text is None and images is None and audio is None: diff --git a/src/transformers/models/glm4v/image_processing_glm4v.py b/src/transformers/models/glm4v/image_processing_glm4v.py index e35699005116..13f4472e61f3 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v.py +++ b/src/transformers/models/glm4v/image_processing_glm4v.py @@ -39,6 +39,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging from ...video_utils import VideoInput @@ -46,6 +47,21 @@ logger = logging.get_logger(__name__) +class Glm4vImageProcessorKwargs(ImagesKwargs): + """ + patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + def smart_resize( num_frames: int, height: int, @@ -120,6 +136,7 @@ class Glm4vImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_grid_thw"] + valid_kwargs = Glm4vImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/glm4v/image_processing_glm4v_fast.py b/src/transformers/models/glm4v/image_processing_glm4v_fast.py index 8cdf31a437ae..92e8a5df9137 100644 --- a/src/transformers/models/glm4v/image_processing_glm4v_fast.py +++ b/src/transformers/models/glm4v/image_processing_glm4v_fast.py @@ -24,7 +24,6 @@ ) from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -41,27 +40,12 @@ auto_docstring, logging, ) -from .image_processing_glm4v import smart_resize +from .image_processing_glm4v import Glm4vImageProcessorKwargs, smart_resize logger = logging.get_logger(__name__) -class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - @auto_docstring class Glm4vImageProcessorFast(BaseImageProcessorFast): do_resize = True @@ -75,10 +59,10 @@ class Glm4vImageProcessorFast(BaseImageProcessorFast): patch_size = 14 temporal_patch_size = 2 merge_size = 2 - valid_kwargs = Glm4vFastImageProcessorKwargs + valid_kwargs = Glm4vImageProcessorKwargs model_input_names = ["pixel_values", "image_grid_thw"] - def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Glm4vImageProcessorKwargs]): super().__init__(**kwargs) if self.size is not None and ( self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None @@ -205,7 +189,7 @@ def _preprocess( def preprocess( self, images: ImageInput, - **kwargs: Unpack[Glm4vFastImageProcessorKwargs], + **kwargs: Unpack[Glm4vImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index b8600713a5ed..ac2885a4a9f8 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -32,7 +32,7 @@ from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import rope_config_validation from ...modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...processing_utils import ImagesKwargs, Unpack +from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging from ...utils.generic import check_model_inputs @@ -52,7 +52,6 @@ Qwen2_5_VLVisionAttention, Qwen2_5_VLVisionBlock, ) -from ..qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLVideosProcessorKwargs from ..qwen2_vl.processing_qwen2_vl import ( Qwen2_VLProcessor, Qwen2_VLProcessorKwargs, @@ -1508,19 +1507,7 @@ def _get_image_nums_and_video_nums( return image_counts, video_counts -class Glm4vVideosProcessorKwargs(Qwen2_5_VLVideosProcessorKwargs): - pass - - -class Glm4vImagesKwargs(ImagesKwargs): - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Glm4vProcessorKwargs(Qwen2_VLProcessorKwargs): - images_kwargs: Glm4vImagesKwargs - videos_kwargs: Glm4vVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index bd7d24e800f7..ad97a10efd73 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -24,7 +24,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging from ...video_utils import VideoInput @@ -33,18 +33,7 @@ logger = logging.get_logger(__name__) -class Glm4vVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] - - -class Glm4vImagesKwargs(ImagesKwargs): - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Glm4vProcessorKwargs(ProcessingKwargs, 
total=False): - images_kwargs: Glm4vImagesKwargs _defaults = { "text_kwargs": { "padding": False, @@ -53,7 +42,6 @@ class Glm4vProcessorKwargs(ProcessingKwargs, total=False): }, "videos_kwargs": {"return_metadata": True}, } - videos_kwargs: Glm4vVideosProcessorKwargs class Glm4vProcessor(ProcessorMixin): diff --git a/src/transformers/models/glm4v/video_processing_glm4v.py b/src/transformers/models/glm4v/video_processing_glm4v.py index 0986c414f1d3..8324ad482baa 100644 --- a/src/transformers/models/glm4v/video_processing_glm4v.py +++ b/src/transformers/models/glm4v/video_processing_glm4v.py @@ -37,12 +37,11 @@ class Glm4vVideoProcessorInitKwargs(VideosKwargs): - max_image_size: dict[str, int] = None - patch_size: Optional[int] = None - temporal_patch_size: Optional[int] = None - merge_size: Optional[int] = None - image_mean: Optional[list[float]] = None - image_std: Optional[list[float]] = None + max_image_size: Optional[dict[str, int]] + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + max_duration: Optional[int] @add_start_docstrings( diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py index 43bf8b520ffa..3424020c65b3 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2.py @@ -38,6 +38,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -48,6 +49,24 @@ logger = logging.get_logger(__name__) +class GotOcr2ImageProcessorKwargs(ImagesKwargs): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
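+
+    Example (illustrative sketch; `stepfun-ai/GOT-OCR-2.0-hf` is a placeholder checkpoint):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import AutoImageProcessor
+
+    >>> image = np.zeros((768, 1024, 3), dtype=np.uint8)
+    >>> processor = AutoImageProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+    >>> inputs = processor(image, crop_to_patches=True, max_patches=6, return_tensors="pt")
+    >>> inputs["pixel_values"].shape  # one row per extracted patch  # doctest: +SKIP
+    ```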
+ """ + + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + # Similar to image_processing_mllama.get_all_supported_aspect_ratios @lru_cache(maxsize=10) def get_all_supported_aspect_ratios(min_image_tiles: int, max_image_tiles: int) -> list[tuple[int, int]]: @@ -168,6 +187,7 @@ class GotOcr2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = GotOcr2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py index a47a1422a5dc..210a18a406be 100644 --- a/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +++ b/src/transformers/models/got_ocr2/image_processing_got_ocr2_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -32,25 +31,7 @@ TensorType, auto_docstring, ) -from .image_processing_got_ocr2 import get_optimal_tiled_canvas - - -class GotOcr2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - crop_to_patches (`bool`, *optional*, defaults to `False`): - Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the - `preprocess` method. - min_patches (`int`, *optional*, defaults to 1): - The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. - max_patches (`int`, *optional*, defaults to 12): - The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. 
- """ - - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] +from .image_processing_got_ocr2 import GotOcr2ImageProcessorKwargs, get_optimal_tiled_canvas @auto_docstring @@ -66,13 +47,13 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast): crop_to_patches = False min_patches = 1 max_patches = 12 - valid_kwargs = GotOcr2FastImageProcessorKwargs + valid_kwargs = GotOcr2ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[GotOcr2ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def crop_image_to_patches( diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 35df3b5a3f05..447122e18c22 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -18,11 +18,10 @@ import numpy as np -from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack -from transformers.tokenization_utils_base import PreTokenizedInput, TextInput - from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput +from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_vision_available, logging @@ -37,13 +36,13 @@ class GotOcr2TextKwargs(TextKwargs, total=False): class GotOcr2ImagesKwargs(ImagesKwargs, total=False): + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]] color: Optional[str] num_image_tokens: Optional[int] multi_page: Optional[bool] - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False): @@ -136,8 +135,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[GotOcr2ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 84515d173c47..0b76ccfe75db 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -49,8 +49,6 @@ def __call__( text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], audio: Union["torch.Tensor", list["torch.Tensor"]] = None, device: str = "cpu", - images=None, - videos=None, **kwargs, ) -> BatchFeature: requires_backends(self, ["torch"]) diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 737cf2e670ee..e4e17d4d8ddf 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -51,6 +51,7 
@@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( ExplicitEnum, TensorType, @@ -91,6 +92,29 @@ class AnnotationFormat(ExplicitEnum): SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class GroundingDinoImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -865,6 +889,7 @@ class GroundingDinoImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = GroundingDinoImageProcessorKwargs # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__ def __init__( diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py index 744cb5f92923..ee303ec47fc4 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino_fast.py @@ -4,6 +4,26 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_grounding_dino.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import pathlib from typing import TYPE_CHECKING, Any, Optional, Union @@ -14,7 +34,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -35,7 +54,7 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires -from .image_processing_grounding_dino import get_size_with_aspect_ratio +from .image_processing_grounding_dino import GroundingDinoImageProcessorKwargs, get_size_with_aspect_ratio if TYPE_CHECKING: @@ -44,24 +63,6 @@ logger = logging.get_logger(__name__) - -class GroundingDinoFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the GROUNDING_DINO model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. - """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -304,9 +305,9 @@ class GroundingDinoImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = GroundingDinoFastImageProcessorKwargs + valid_kwargs = GroundingDinoImageProcessorKwargs - def __init__(self, **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[GroundingDinoImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -568,25 +569,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[GroundingDinoFastImageProcessorKwargs], + **kwargs: Unpack[GroundingDinoImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. 
- masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -601,7 +585,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/grounding_dino/modular_grounding_dino.py b/src/transformers/models/grounding_dino/modular_grounding_dino.py index a7b9c570e7b0..ded6435508a5 100644 --- a/src/transformers/models/grounding_dino/modular_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modular_grounding_dino.py @@ -1,3 +1,23 @@ +# coding=utf-8 +# Copyright 2025 the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import TYPE_CHECKING, Optional, Union import torch diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index ea0e288f3eec..5f2f900451b2 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -16,13 +16,12 @@ Processor class for Grounding DINO. 
""" -import pathlib import warnings from typing import TYPE_CHECKING, Optional, Union from ...image_transforms import center_to_corners_format -from ...image_utils import AnnotationFormat, ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import TensorType, is_torch_available @@ -99,16 +98,7 @@ def get(self, key, *args, **kwargs): return super().get(key, *args, **kwargs) -class GroundingDinoImagesKwargs(ImagesKwargs, total=False): - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] - return_segmentation_masks: Optional[bool] - masks_path: Optional[Union[str, pathlib.Path]] - do_convert_annotations: Optional[bool] - format: Optional[Union[str, AnnotationFormat]] - - class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: GroundingDinoImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/models/idefics/image_processing_idefics.py b/src/transformers/models/idefics/image_processing_idefics.py index 6ef5b39afeeb..7fda46e3a990 100644 --- a/src/transformers/models/idefics/image_processing_idefics.py +++ b/src/transformers/models/idefics/image_processing_idefics.py @@ -28,6 +28,7 @@ to_numpy_array, valid_images, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_torch_available @@ -35,6 +36,20 @@ IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711] +class IdeficsImageProcessorKwargs(ImagesKwargs): + """ + transform (`Callable`, *optional*): + A custom transform function that accepts a single image can be passed for training. For example, + `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is + assumed - and then a preset of inference-specific transforms will be applied to the images + image_size (`dict[str, int]`, *optional*): + Resize to image size + """ + + transform: Optional[Callable] + image_size: Optional[dict[str, int]] + + def convert_to_rgb(image): # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background # for transparent images. The call to `alpha_composite` handles this case @@ -74,6 +89,7 @@ class IdeficsImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = IdeficsImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 5ab7e480c8ea..4b5ccaffe5c8 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -16,13 +16,12 @@ Processor class for IDEFICS. 
""" -from typing import Callable, Optional, Union +from typing import Optional, Union from urllib.parse import urlparse from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ( - ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, @@ -40,13 +39,6 @@ IMAGE_TOKEN = "" -class IdeficsImagesKwargs(ImagesKwargs, total=False): - transform: Optional[Callable] - image_size: Optional[dict[str, int]] - image_mean: Optional[Union[float, list[float]]] - image_std: Optional[Union[float, list[float]]] - - class IdeficsTextKwargs(TextKwargs, total=False): add_eos_token: Optional[bool] add_end_of_utterance_token: Optional[bool] @@ -54,14 +46,12 @@ class IdeficsTextKwargs(TextKwargs, total=False): class IdeficsProcessorKwargs(ProcessingKwargs, total=False): text_kwargs: IdeficsTextKwargs - images_kwargs: IdeficsImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": False, "padding": "longest", "add_eos_token": False, }, - "images_kwargs": {}, "common_kwargs": {"return_tensors": "pt"}, } @@ -198,8 +188,6 @@ def __call__( list[list[TextInput]], list[list[PreTokenizedInput]], ] = None, - audio=None, - videos=None, **kwargs: Unpack[IdeficsProcessorKwargs], ) -> BatchFeature: """This method takes batched or non-batched prompts made of text and images and converts them into prompts that diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index 15a04a887e87..b9b741a9704b 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -35,6 +35,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -46,6 +47,15 @@ from PIL import Image +class Idefics2ImageProcessorKwargs(ImagesKwargs): + """ + do_image_splitting (`bool`, *optional*, defaults to `False`): + Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. + """ + + do_image_splitting: Optional[bool] + + def get_resize_output_image_size(image, size, input_data_format) -> tuple[int, int]: """ Get the output size of the image after resizing given a dictionary specifying the max and min sizes. 
@@ -186,6 +196,7 @@ class Idefics2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_attention_mask"] + valid_kwargs = Idefics2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/idefics2/image_processing_idefics2_fast.py b/src/transformers/models/idefics2/image_processing_idefics2_fast.py index 5348bda389ed..36ae6ea5fbc7 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2_fast.py +++ b/src/transformers/models/idefics2/image_processing_idefics2_fast.py @@ -21,7 +21,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, SizeDict, group_images_by_shape, reorder_images, @@ -35,7 +34,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torchvision_available, logging -from .image_processing_idefics2 import convert_to_rgb +from .image_processing_idefics2 import Idefics2ImageProcessorKwargs, convert_to_rgb if is_torchvision_available(): @@ -105,15 +104,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor return mask -class Idefics2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_image_splitting (`bool`, *optional*, defaults to `False`): - Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. - """ - - do_image_splitting: Optional[bool] - - @auto_docstring class Idefics2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -127,7 +117,7 @@ class Idefics2ImageProcessorFast(BaseImageProcessorFast): do_image_splitting = False size = {"shortest_edge": 378, "longest_edge": 980} model_input_names = ["pixel_values", "pixel_attention_mask"] - valid_kwargs = Idefics2FastImageProcessorKwargs + valid_kwargs = Idefics2ImageProcessorKwargs def convert_to_rgb(self, image: ImageInput) -> ImageInput: """ @@ -214,7 +204,7 @@ def pad( return image, pixel_mask @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics2ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 550ca8774095..c419a3641254 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -22,7 +22,6 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image from ...processing_utils import ( - ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, @@ -46,20 +45,13 @@ def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) -class Idefics2ImagesKwargs(ImagesKwargs, total=False): - image_seq_len: Optional[int] - - class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Idefics2ImagesKwargs - _defaults = { "text_kwargs": { "add_special_tokens": True, "padding": False, "is_split_into_words": False, }, - "images_kwargs": {}, } @@ -123,8 +115,6 @@ def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None, - audio=None, - videos=None, **kwargs: Unpack[Idefics2ProcessorKwargs], ) -> BatchFeature: """ @@ -181,8 +171,6 @@ 
def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None) - image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) n_images_in_text = [] @@ -197,12 +185,11 @@ def __call__( # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` fake_image_token = self.fake_image_token image_token = self.image_token - image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}" + image_str = f"{fake_image_token}{image_token * self.image_seq_len}{fake_image_token}" if self.image_processor.do_image_splitting: # A single image token is split into 4 patches + 1 original image image_str = image_str * 5 - image_seq_len *= 5 prompt_strings = [] for sample in text: diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index c7526f30993a..f098a9f54dc1 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -35,6 +35,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -47,6 +48,22 @@ from PIL import Image +class Idefics3ImageProcessorKwargs(ImagesKwargs): + """ + do_image_splitting (`bool`, *optional*, defaults to `True`): + Whether to split the image into sub-images concatenated with the original image. They are split into patches + such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. + max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): + Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". + return_row_col_info (`bool`, *optional*, defaults to `False`): + Whether to return the row and column information of the images. 
+ """ + + do_image_splitting: Optional[bool] + max_image_size: Optional[dict[str, int]] + return_row_col_info: Optional[bool] + + def _resize_output_size_rescale_to_max_len( height: int, width: int, min_len: Optional[int] = 1, max_len: Optional[int] = None ) -> tuple[int, int]: @@ -291,6 +308,7 @@ class Idefics3ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_attention_mask"] + valid_kwargs = Idefics3ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/idefics3/image_processing_idefics3_fast.py b/src/transformers/models/idefics3/image_processing_idefics3_fast.py index 5b0c0e6180f9..2f325f77931a 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3_fast.py +++ b/src/transformers/models/idefics3/image_processing_idefics3_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, SizeDict, group_images_by_shape, reorder_images, @@ -36,6 +35,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torchvision_available, logging +from .image_processing_idefics3 import Idefics3ImageProcessorKwargs if is_torchvision_available(): @@ -169,22 +169,6 @@ def make_pixel_mask(image: "torch.Tensor", output_size: tuple[int, int]) -> "tor return mask -class Idefics3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_image_splitting (`bool`, *optional*, defaults to `True`): - Whether to split the image into sub-images concatenated with the original image. They are split into patches - such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. - max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): - Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". - return_row_col_info (`bool`, *optional*, defaults to `False`): - Whether to return the row and column information of the images. 
- """ - - do_image_splitting: Optional[bool] - max_image_size: Optional[dict[str, int]] - return_row_col_info: Optional[bool] - - @auto_docstring class Idefics3ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.LANCZOS @@ -199,7 +183,7 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast): do_image_splitting = True do_pad = True return_row_col_info = False - valid_kwargs = Idefics3FastImageProcessorKwargs + valid_kwargs = Idefics3ImageProcessorKwargs def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput: """ @@ -367,7 +351,7 @@ def pad( return image, pixel_mask @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Idefics3ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 00ee8df6d414..451af1d8a38f 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -24,7 +24,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image, load_image -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput from ...utils import logging @@ -87,14 +87,7 @@ def get_image_prompt_string( ) -class Idefics3ImagesKwargs(ImagesKwargs, total=False): - return_row_col_info: Optional[bool] - max_image_size: Optional[dict[str, int]] - - class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Idefics3ImagesKwargs - _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -179,8 +172,6 @@ def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None, - audio=None, - videos=None, image_seq_len: Optional[int] = None, **kwargs: Unpack[Idefics3ProcessorKwargs], ) -> BatchEncoding: diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index ee8fe04771b7..8f79cd58ec5f 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -31,17 +31,34 @@ valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging +from ...processing_utils import ImagesKwargs +from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, is_vision_available, logging from ...utils.import_utils import requires if is_vision_available(): import PIL +if is_torch_available(): + import torch logger = logging.get_logger(__name__) +class ImageGPTImageProcessorKwargs(ImagesKwargs): + """ + clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): + The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` + in `preprocess`. 
+ do_color_quantize (`bool`, *optional*, defaults to `True`): + Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices. + When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling. + """ + + clusters: Optional[Union[np.ndarray, list[list[int]], "torch.Tensor"]] + do_color_quantize: Optional[bool] + + def squared_euclidean_distance(a, b): b = b.T a2 = np.sum(np.square(a), axis=1) @@ -83,6 +100,7 @@ class ImageGPTImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = ImageGPTImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py index 7a6bcc53ae1a..1be050b5ecf9 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt_fast.py @@ -23,7 +23,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, ) from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import PILImageResampling @@ -32,6 +31,7 @@ TensorType, auto_docstring, ) +from .image_processing_imagegpt import ImageGPTImageProcessorKwargs def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: @@ -68,20 +68,6 @@ def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tenso return torch.argmin(d, dim=1) -class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): - The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters` - in `preprocess`. - do_color_quantize (`bool`, *optional*, defaults to `True`): - Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices. - When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling. 
- """ - - clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]] - do_color_quantize: Optional[bool] - - @auto_docstring class ImageGPTImageProcessorFast(BaseImageProcessorFast): model_input_names = ["input_ids"] @@ -92,12 +78,12 @@ class ImageGPTImageProcessorFast(BaseImageProcessorFast): image_std = [0.5, 0.5, 0.5] do_rescale = True do_normalize = True - valid_kwargs = ImageGPTFastImageProcessorKwargs + valid_kwargs = ImageGPTImageProcessorKwargs def __init__( self, clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None, # keep as arg for backwards compatibility - **kwargs: Unpack[ImageGPTFastImageProcessorKwargs], + **kwargs: Unpack[ImageGPTImageProcessorKwargs], ): r""" clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*): diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index 122fc11622ff..afe43c1fc7a7 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -43,7 +43,6 @@ class InstructBlipProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": {}, } @@ -85,8 +84,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[InstructBlipProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/internvl/processing_internvl.py b/src/transformers/models/internvl/processing_internvl.py index f68d48502b6c..12e0d395b05c 100644 --- a/src/transformers/models/internvl/processing_internvl.py +++ b/src/transformers/models/internvl/processing_internvl.py @@ -19,19 +19,12 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput, concatenate_list, make_flat_list_of_images -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...video_utils import VideoInput -class InternVLImagesKwargs(ImagesKwargs, total=False): - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - - class InternVLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: InternVLImagesKwargs _defaults = { "text_kwargs": { "padding_side": "left", @@ -159,7 +152,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[InternVLProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/models/internvl/video_processing_internvl.py b/src/transformers/models/internvl/video_processing_internvl.py index 96d7d3067f73..d16c57522d1c 100644 --- a/src/transformers/models/internvl/video_processing_internvl.py +++ b/src/transformers/models/internvl/video_processing_internvl.py @@ -43,7 +43,6 @@ class InternVLVideoProcessor(BaseVideoProcessor): initial_shift = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models valid_kwargs = InternVLVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] def __init__(self, **kwargs: Unpack[InternVLVideoProcessorInitKwargs]): super().__init__(**kwargs) 
diff --git a/src/transformers/models/janus/image_processing_janus.py b/src/transformers/models/janus/image_processing_janus.py index a56cae7ba975..06ea0fe0e4d1 100644 --- a/src/transformers/models/janus/image_processing_janus.py +++ b/src/transformers/models/janus/image_processing_janus.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -50,6 +51,16 @@ logger = logging.get_logger(__name__) +class JanusImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + class JanusImageProcessor(BaseImageProcessor): r""" Constructs a JANUS image processor. @@ -92,6 +103,8 @@ class JanusImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] + valid_kwargs = JanusImageProcessorKwargs + def __init__( self, do_resize: bool = True, diff --git a/src/transformers/models/janus/image_processing_janus_fast.py b/src/transformers/models/janus/image_processing_janus_fast.py index 6cbca591626e..4de23e80e63a 100644 --- a/src/transformers/models/janus/image_processing_janus_fast.py +++ b/src/transformers/models/janus/image_processing_janus_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -38,16 +37,7 @@ TensorType, auto_docstring, ) - - -class JanusFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - min_size (`int`, *optional*, defaults to 14): - The minimum allowed size for the resized image. Ensures that neither the height nor width - falls below this value after resizing. - """ - - min_size: int +from .image_processing_janus import JanusImageProcessorKwargs @auto_docstring @@ -61,9 +51,9 @@ class JanusImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_pad = True - valid_kwargs = JanusFastImageProcessorKwargs + valid_kwargs = JanusImageProcessorKwargs - def __init__(self, **kwargs: Unpack[JanusFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[JanusImageProcessorKwargs]): if kwargs.get("image_mean") is None: background_color = (127, 127, 127) else: diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py index a8e24a86b0d3..332dc689dc62 100644 --- a/src/transformers/models/janus/modular_janus.py +++ b/src/transformers/models/janus/modular_janus.py @@ -47,7 +47,7 @@ ) from ...modeling_outputs import ModelOutput from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import ( TensorType, TransformersKwargs, @@ -1289,6 +1289,16 @@ def generate( return generated_tokens +class JanusImageProcessorKwargs(ImagesKwargs): + r""" + min_size (`int`, *optional*, defaults to 14): + The minimum allowed size for the resized image. Ensures that neither the height nor width + falls below this value after resizing. + """ + + min_size: int + + class JanusImageProcessor(BlipImageProcessor): r""" Constructs a JANUS image processor. @@ -1329,6 +1339,8 @@ class JanusImageProcessor(BlipImageProcessor): Whether to pad the image to square or not. 
""" + valid_kwargs = JanusImageProcessorKwargs + def __init__( self, do_resize: bool = True, diff --git a/src/transformers/models/janus/processing_janus.py b/src/transformers/models/janus/processing_janus.py index c2413e705756..15c237c4ced4 100644 --- a/src/transformers/models/janus/processing_janus.py +++ b/src/transformers/models/janus/processing_janus.py @@ -81,8 +81,6 @@ def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, images: Optional[ImageInput] = None, - videos=None, - audio=None, **kwargs: Unpack[JanusProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 58b3dff1e07a..98f8925e8a69 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -136,8 +136,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, list[TextInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Kosmos2ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py index b768205da2a4..5f337e4b04c9 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5.py @@ -34,6 +34,7 @@ to_numpy_array, valid_images, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_torch_available, logging from ...utils.import_utils import requires_backends @@ -45,6 +46,19 @@ DEFAULT_FONT_PATH = "ybelkada/fonts" +class Kosmos2_5ImageProcessorKwargs(ImagesKwargs): + r""" + patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`): + The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16. + max_patches (`int`, *optional*, defaults to 4096): + The maximum number of patches to extract from the image as per the + [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419). + """ + + patch_size: Optional[dict[str, int]] + max_patches: Optional[int] + + # Copied from transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches def torch_extract_patches(image_tensor, patch_height, patch_width): """ @@ -92,6 +106,7 @@ class Kosmos2_5ImageProcessor(BaseImageProcessor): """ model_input_names = ["flattened_patches"] + valid_kwargs = Kosmos2_5ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py index b7adeb2c86c2..d892436ea652 100644 --- a/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +++ b/src/transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py @@ -22,13 +22,13 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) from ...image_utils import ChannelDimension, ImageInput, get_image_size from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring +from .image_processing_kosmos2_5 import Kosmos2_5ImageProcessorKwargs # Similar to transformers.models.pix2struct.image_processing_pix2struct.torch_extract_patches but dealing with a batch of images directly. 
@@ -56,19 +56,6 @@ def torch_extract_patches(image_tensor, patch_height, patch_width): return patches -class Kosmos2_5FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`): - The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16. - max_patches (`int`, *optional*, defaults to 4096): - The maximum number of patches to extract from the image as per the - [KOSMOS 2.5 paper](https://huggingface.co/papers/2309.11419). - """ - - patch_size: Optional[dict[str, int]] - max_patches: Optional[int] - - @auto_docstring class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast): # To be checked against the slow image processor @@ -78,13 +65,13 @@ class Kosmos2_5ImageProcessorFast(BaseImageProcessorFast): patch_size = {"height": 16, "width": 16} max_patches = 4096 rescale_factor = None - valid_kwargs = Kosmos2_5FastImageProcessorKwargs + valid_kwargs = Kosmos2_5ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Kosmos2_5ImageProcessorKwargs]) -> BatchFeature: r""" patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`): The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16. diff --git a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py index 0e3c70c80234..cb6f27777a0f 100644 --- a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py @@ -20,7 +20,7 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import TextInput from ...utils import is_torch_available @@ -29,14 +29,7 @@ import torch -class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False): - max_patches: Optional[int] - num_image_tokens: Optional[int] - - class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False): - text_kwargs: TextKwargs - images_kwargs: Kosmos2_5ImagesKwargs _defaults = { "text_kwargs": { "padding": True, @@ -46,7 +39,6 @@ class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False): }, "images_kwargs": { "max_patches": 4096, - "num_image_tokens": 2048, }, "common_kwargs": {"return_tensors": "pt"}, } @@ -65,24 +57,25 @@ class Kosmos2_5Processor(ProcessorMixin): An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input. tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]): An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input. + num_image_tokens (`int`, *optional*, defaults to 2048): + Number of image tokens used as a placeholder. 
""" attributes = ["image_processor", "tokenizer"] image_processor_class = "AutoImageProcessor" tokenizer_class = "PreTrainedTokenizerFast" - def __init__(self, image_processor, tokenizer): + def __init__(self, image_processor, tokenizer, num_image_tokens: int = 2048): self.image_start_token = tokenizer.boi_token # "" : fixed token for the start of image self.image_end_token = tokenizer.eoi_token # "" : fixed token for the end of image self.image_token = tokenizer.image_token # "" : within a ... pair, these tokens indicate they are positions reserved for an image + self.num_image_tokens = num_image_tokens super().__init__(image_processor, tokenizer) def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, list[TextInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Kosmos2_5ProcessorKwargs], ) -> BatchFeature: """ @@ -104,8 +97,6 @@ def __call__( tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) - num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None) - encoding = BatchFeature() if images is not None: @@ -114,7 +105,7 @@ def __call__( image_encoding.pop("cols") encoding.update(image_encoding) - prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * num_image_tokens}{self.image_end_token}" + prompt = f"{self.tokenizer.bos_token}{self.image_start_token}{self.image_token * self.num_image_tokens}{self.image_end_token}" if text is not None: if isinstance(text, str): @@ -124,7 +115,7 @@ def __call__( input = self.tokenizer(text, **output_kwargs["text_kwargs"]) batch_size, seq_len = input.input_ids.shape - image_embeds_position_mask = [0, -1] + [1] * num_image_tokens + [-1] + image_embeds_position_mask = [0, -1] + [1] * self.num_image_tokens + [-1] image_embeds_position_mask += [0] * (seq_len - len(image_embeds_position_mask)) image_embeds_position_mask = ( torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1) diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index 76fc752bbeea..d5a7e95537c5 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -30,6 +30,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -51,6 +52,25 @@ logger = logging.get_logger(__name__) +class LayoutLMv2ImageProcessorKwargs(ImagesKwargs): + r""" + apply_ocr (`bool`, *optional*, defaults to `True`): + Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by + the `apply_ocr` parameter in the `preprocess` method. + ocr_lang (`str`, *optional*): + The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is + used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. + tesseract_config (`str`, *optional*): + Any additional custom configuration flags that are forwarded to the `config` parameter when calling + Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the + `preprocess` method. 
+ """ + + apply_ocr: Optional[bool] + ocr_lang: Optional[str] + tesseract_config: Optional[str] + + def normalize_box(box, width, height): return [ int(1000 * (box[0] / width)), @@ -125,6 +145,7 @@ class LayoutLMv2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = LayoutLMv2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py index 354bbe21c4db..2d6e6bc21cb3 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images from ...image_utils import ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack @@ -29,32 +29,12 @@ logging, requires_backends, ) -from .image_processing_layoutlmv2 import apply_tesseract +from .image_processing_layoutlmv2 import LayoutLMv2ImageProcessorKwargs, apply_tesseract logger = logging.get_logger(__name__) -class LayoutLMv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - apply_ocr (`bool`, *optional*, defaults to `True`): - Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by - the `apply_ocr` parameter in the `preprocess` method. - ocr_lang (`str`, *optional*): - The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is - used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. - tesseract_config (`str`, *optional*): - Any additional custom configuration flags that are forwarded to the `config` parameter when calling - Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the - `preprocess` method. 
- """ - - apply_ocr: Optional[bool] - ocr_lang: Optional[str] - tesseract_config: Optional[str] - - @auto_docstring class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -64,13 +44,13 @@ class LayoutLMv2ImageProcessorFast(BaseImageProcessorFast): apply_ocr = True ocr_lang = None tesseract_config = "" - valid_kwargs = LayoutLMv2FastImageProcessorKwargs + valid_kwargs = LayoutLMv2ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv2ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py index 0ce7f5ce6968..b9273dc75cad 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -34,6 +34,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -55,6 +56,25 @@ logger = logging.get_logger(__name__) +class LayoutLMv3ImageProcessorKwargs(ImagesKwargs): + r""" + apply_ocr (`bool`, *optional*, defaults to `True`): + Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by + the `apply_ocr` parameter in the `preprocess` method. + ocr_lang (`str`, *optional*): + The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is + used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. + tesseract_config (`str`, *optional*): + Any additional custom configuration flags that are forwarded to the `config` parameter when calling + Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the + `preprocess` method. 
+ """ + + apply_ocr: Optional[bool] + ocr_lang: Optional[str] + tesseract_config: Optional[str] + + def normalize_box(box, width, height): return [ int(1000 * (box[0] / width)), @@ -143,6 +163,7 @@ class LayoutLMv3ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = LayoutLMv3ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py index caefa9b89660..b614c5ec9449 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageInput, PILImageResampling, SizeDict from ...processing_utils import Unpack @@ -29,32 +29,12 @@ logging, requires_backends, ) -from .image_processing_layoutlmv3 import apply_tesseract +from .image_processing_layoutlmv3 import LayoutLMv3ImageProcessorKwargs, apply_tesseract logger = logging.get_logger(__name__) -class LayoutLMv3FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - apply_ocr (`bool`, *optional*, defaults to `True`): - Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by - the `apply_ocr` parameter in the `preprocess` method. - ocr_lang (`str`, *optional*): - The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is - used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method. - tesseract_config (`str`, *optional*): - Any additional custom configuration flags that are forwarded to the `config` parameter when calling - Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the - `preprocess` method. 
- """ - - apply_ocr: Optional[bool] - ocr_lang: Optional[str] - tesseract_config: Optional[str] - - @auto_docstring class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -67,13 +47,13 @@ class LayoutLMv3ImageProcessorFast(BaseImageProcessorFast): apply_ocr = True ocr_lang = None tesseract_config = "" - valid_kwargs = LayoutLMv3FastImageProcessorKwargs + valid_kwargs = LayoutLMv3ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LayoutLMv3ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py index 4081c86e108a..ad99504fcad6 100755 --- a/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +++ b/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -33,9 +32,7 @@ PILImageResampling, SizeDict, ) -from ...processing_utils import ( - Unpack, -) +from ...processing_utils import ImagesKwargs, Unpack from ...utils import ( TensorType, auto_docstring, @@ -172,7 +169,7 @@ def pad_along_first_dim( return images, pixel_mask -class Lfm2VlFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Lfm2VlImageProcessorKwargs(ImagesKwargs): """ downsample_factor (`int`, *optional*, defaults to `2`): The downsampling factor for images used when resizing the image. 
@@ -214,10 +211,10 @@ class Lfm2VlImageProcessorFast(BaseImageProcessorFast): return_row_col_info = False image_mean = IMAGENET_STANDARD_STD image_std = IMAGENET_STANDARD_MEAN - valid_kwargs = Lfm2VlFastImageProcessorKwargs + valid_kwargs = Lfm2VlImageProcessorKwargs model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"] - def __init__(self, **kwargs: Unpack[Lfm2VlFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Lfm2VlImageProcessorKwargs]): super().__init__(**kwargs) max_thumbnail_image_patches = self.max_image_tokens * self.downsample_factor**2 diff --git a/src/transformers/models/llama4/image_processing_llama4_fast.py b/src/transformers/models/llama4/image_processing_llama4_fast.py index 6506d5749d94..e2678f556d02 100644 --- a/src/transformers/models/llama4/image_processing_llama4_fast.py +++ b/src/transformers/models/llama4/image_processing_llama4_fast.py @@ -25,12 +25,11 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) from ...image_utils import ImageInput, PILImageResampling, SizeDict -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import ( TensorType, auto_docstring, @@ -309,8 +308,8 @@ def get_best_fit( return optimal_canvas -class Llama4ImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ +class Llama4ImageProcessorKwargs(ImagesKwargs): + r""" max_patches (`int`, *optional*, defaults to 16): The maximum number of patches to be extracted from the image. Can be overridden by the `max_patches` parameter in the `preprocess` method. diff --git a/src/transformers/models/llama4/processing_llama4.py b/src/transformers/models/llama4/processing_llama4.py index 47a0b4cd99fb..df371bdfd710 100644 --- a/src/transformers/models/llama4/processing_llama4.py +++ b/src/transformers/models/llama4/processing_llama4.py @@ -16,20 +16,14 @@ from typing import Optional, Union -from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from transformers.tokenization_utils_base import PreTokenizedInput, TextInput from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput, make_flat_list_of_images -class Llama4ImagesKwargs(ImagesKwargs, total=False): - max_patches: Optional[int] - resize_to_max_canvas: Optional[bool] - - class Llama4ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Llama4ImagesKwargs _defaults = { "text_kwargs": { "padding_side": "left", @@ -139,8 +133,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[Llama4ProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/llava/image_processing_llava_fast.py b/src/transformers/models/llava/image_processing_llava_fast.py index 596070040549..66ccb49c3671 100644 --- a/src/transformers/models/llava/image_processing_llava_fast.py +++ b/src/transformers/models/llava/image_processing_llava_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -30,21 +29,16 @@ OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, 
ChannelDimension, - ImageInput, PILImageResampling, SizeDict, get_image_size, ) -from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, ) -class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... - - @auto_docstring class LlavaImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -59,14 +53,6 @@ class LlavaImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True - valid_kwargs = LlavaFastImageProcessorKwargs - - def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None: - super().__init__(**kwargs) - - @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature: - return super().preprocess(images, **kwargs) def pad_to_square( self, diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 398bd9d8d065..6f8d9e3a14cc 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -38,7 +38,6 @@ class LlavaProcessorKwargs(ProcessingKwargs, total=False): _defaults = { "text_kwargs": {"padding": False, "return_mm_token_type_ids": False}, - "images_kwargs": {}, } @@ -94,8 +93,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[LlavaProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 60d36abb748b..56ebc10f391d 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -48,6 +48,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -58,6 +59,17 @@ from PIL import Image +class LlavaNextImageProcessorKwargs(ImagesKwargs): + r""" + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. + """ + + image_grid_pinpoints: Optional[list[list[int]]] + + def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: """ Divides an image into patches of a specified size. 
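The `Unpack[...]` annotations used throughout these hunks are typing metadata only; at runtime the kwargs still arrive as an ordinary dict. A self-contained sketch of the mechanism, using stand-in TypedDicts rather than the real `ImagesKwargs` (names ending in `Sketch` are invented for illustration):

# Standalone sketch of the Unpack/TypedDict mechanism; runs on Python 3.9+ with typing_extensions.
from typing import Optional

from typing_extensions import TypedDict, Unpack


class ImagesKwargsSketch(TypedDict, total=False):
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


class LlavaNextKwargsSketch(ImagesKwargsSketch, total=False):
    image_grid_pinpoints: Optional[list[list[int]]]


def preprocess(**kwargs: Unpack[LlavaNextKwargsSketch]) -> None:
    # A type checker now treats `image_grid_pinpoints` as a known keyword parameter here.
    print(sorted(kwargs))


preprocess(do_resize=True, image_grid_pinpoints=[[336, 672], [672, 336]])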
@@ -152,6 +164,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_sizes"] + valid_kwargs = LlavaNextImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/llava_next/image_processing_llava_next_fast.py b/src/transformers/models/llava_next/image_processing_llava_next_fast.py index df20e2b90e83..19d6fb941e7b 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next_fast.py +++ b/src/transformers/models/llava_next/image_processing_llava_next_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, divide_to_patches, group_images_by_shape, reorder_images, @@ -41,17 +40,7 @@ TensorType, auto_docstring, ) - - -class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: Optional[list[list[int]]] +from .image_processing_llava_next import LlavaNextImageProcessorKwargs @auto_docstring @@ -71,13 +60,13 @@ class LlavaNextImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = True do_pad = True image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]] - valid_kwargs = LlavaNextFastImageProcessorKwargs + valid_kwargs = LlavaNextImageProcessorKwargs - def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[LlavaNextImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _resize_for_patching( diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 2574fc443519..04493518a020 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -104,8 +104,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[LlavaNextProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index e858a1784254..015e4cdea6df 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -116,7 +116,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[LlavaNextVideoProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/models/llava_next_video/video_processing_llava_next_video.py b/src/transformers/models/llava_next_video/video_processing_llava_next_video.py 
index 80ed1e5b81b9..c353100cee79 100644 --- a/src/transformers/models/llava_next_video/video_processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/video_processing_llava_next_video.py @@ -15,13 +15,9 @@ """Video processor class for LLaVa-NeXT-Video.""" from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling -from ...processing_utils import Unpack, VideosKwargs from ...video_processing_utils import BaseVideoProcessor -class LlavaNextVideoFastVideoProcessorInitKwargs(VideosKwargs): ... - - class LlavaNextVideoVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -35,11 +31,6 @@ class LlavaNextVideoVideoProcessor(BaseVideoProcessor): do_normalize = True do_convert_rgb = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models - valid_kwargs = LlavaNextVideoFastVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - - def __init__(self, **kwargs: Unpack[LlavaNextVideoFastVideoProcessorInitKwargs]): - super().__init__(**kwargs) __all__ = ["LlavaNextVideoVideoProcessor"] diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py index b679ac269747..119df9550a2a 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision.py @@ -47,6 +47,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -57,6 +58,17 @@ from PIL import Image +class LlavaOnevisionImageProcessorKwargs(ImagesKwargs): + r""" + image_grid_pinpoints (`list[list[int]]`, *optional*): + A list of possible resolutions to use for processing high resolution images. The best resolution is selected + based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` + method. + """ + + image_grid_pinpoints: Optional[list[list[int]]] + + # Copied from transformers.models.llava_next.image_processing_llava_next.divide_to_patches def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]: """ @@ -146,6 +158,7 @@ class LlavaOnevisionImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_sizes", "batch_num_images"] + valid_kwargs = LlavaOnevisionImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py index 11872cb67bf3..b309583461ce 100644 --- a/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +++ b/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py @@ -27,7 +27,6 @@ from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, divide_to_patches, group_images_by_shape, reorder_images, @@ -43,17 +42,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. 
The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. - """ - - image_grid_pinpoints: Optional[list[list[int]]] +from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs @auto_docstring @@ -71,14 +60,14 @@ class LlavaOnevisionImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = True do_pad = True image_grid_pinpoints = [[384, 384], [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [768, 384], [768, 768], [768, 1152], [768, 1536], [768, 1920], [768, 2304], [1152, 384], [1152, 768], [1152, 1152], [1152, 1536], [1152, 1920], [1152, 2304], [1536, 384], [1536, 768], [1536, 1152], [1536, 1536], [1536, 1920], [1536, 2304], [1920, 384], [1920, 768], [1920, 1152], [1920, 1536], [1920, 1920], [1920, 2304], [2304, 384], [2304, 768], [2304, 1152], [2304, 1536], [2304, 1920], [2304, 2304]] # fmt: skip - valid_kwargs = LlavaOnevisionFastImageProcessorKwargs + valid_kwargs = LlavaOnevisionImageProcessorKwargs model_input_names = ["pixel_values", "image_sizes", "batch_num_images"] - def __init__(self, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]) -> BatchFeature: if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)): # if the first element is a list, we assume that all elements are lists batch_num_images = [len(x) for x in images] diff --git a/src/transformers/models/llava_onevision/modular_llava_onevision.py b/src/transformers/models/llava_onevision/modular_llava_onevision.py index b4f64dee8e04..890fcdd7ecaa 100644 --- a/src/transformers/models/llava_onevision/modular_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modular_llava_onevision.py @@ -35,7 +35,7 @@ from ...cache_utils import Cache from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images +from ...image_processing_utils_fast import group_images_by_shape, reorder_images from ...image_utils import ( OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, @@ -53,22 +53,12 @@ can_return_tuple, logging, ) +from .image_processing_llava_onevision import LlavaOnevisionImageProcessorKwargs logger = logging.get_logger(__name__) -class LlavaOnevisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - image_grid_pinpoints (`list[list[int]]`, *optional*): - A list of possible resolutions to use for processing high resolution images. The best resolution is selected - based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess` - method. 
- """ - - image_grid_pinpoints: Optional[list[list[int]]] - - class LlavaOnevisionImageProcessorFast(LlavaNextImageProcessorFast): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -129,7 +119,7 @@ def pad_to_square( return padded_images @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaOnevisionImageProcessorKwargs]) -> BatchFeature: if isinstance(images, (tuple, list)) and isinstance(images[0], (tuple, list)): # if the first element is a list, we assume that all elements are lists batch_num_images = [len(x) for x in images] diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index 9fadc6af3067..ff8eae5dd87a 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -42,7 +42,6 @@ class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False): "return_mm_token_type_ids": False, }, "image_kwargs": {}, - "videos_kwargs": {}, } @@ -114,7 +113,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[LlavaOnevisionProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/models/llava_onevision/video_processing_llava_onevision.py b/src/transformers/models/llava_onevision/video_processing_llava_onevision.py index ddae0fcd3b6f..b201085f5ee6 100644 --- a/src/transformers/models/llava_onevision/video_processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/video_processing_llava_onevision.py @@ -15,13 +15,9 @@ """Video processor class for LLaVa-Onevision.""" from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling -from ...processing_utils import Unpack, VideosKwargs from ...video_processing_utils import BaseVideoProcessor -class LlavaOnevisionFastVideoProcessorInitKwargs(VideosKwargs): ... - - class LlavaOnevisionVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -36,11 +32,6 @@ class LlavaOnevisionVideoProcessor(BaseVideoProcessor): do_normalize = True do_convert_rgb = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models - valid_kwargs = LlavaOnevisionFastVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - - def __init__(self, **kwargs: Unpack[LlavaOnevisionFastVideoProcessorInitKwargs]): - super().__init__(**kwargs) __all__ = ["LlavaOnevisionVideoProcessor"] diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 752a3221d17f..d9d580955fbd 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -41,6 +41,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -60,6 +61,25 @@ from torch import nn +class Mask2FormerImageProcessorKwargs(ImagesKwargs): + r""" + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. 
If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). + The background label will be replaced by `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. + """ + + size_divisor: Optional[int] + ignore_index: Optional[int] + do_reduce_labels: Optional[bool] + num_labels: Optional[int] + + # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -440,6 +460,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = Mask2FormerImageProcessorKwargs @filter_out_non_signature_kwargs(extra=["max_size", *INIT_SERVICE_KWARGS]) def __init__( diff --git a/src/transformers/models/mask2former/image_processing_mask2former_fast.py b/src/transformers/models/mask2former/image_processing_mask2former_fast.py index 58dbb09d6319..6b8cd184581b 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former_fast.py +++ b/src/transformers/models/mask2former/image_processing_mask2former_fast.py @@ -28,7 +28,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -45,6 +44,7 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from .image_processing_mask2former import ( + Mask2FormerImageProcessorKwargs, compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, @@ -55,28 +55,6 @@ logger = logging.get_logger(__name__) -class Mask2FormerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - size_divisor (`int`, *optional*, defaults to 32): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. 
- """ - - size_divisor: Optional[int] - ignore_index: Optional[int] - do_reduce_labels: Optional[bool] - num_labels: Optional[int] - - def convert_segmentation_map_to_binary_masks_fast( segmentation_map: "torch.Tensor", instance_id_to_semantic_id: Optional[dict[int, int]] = None, @@ -127,9 +105,9 @@ class Mask2FormerImageProcessorFast(BaseImageProcessorFast): model_input_names = ["pixel_values", "pixel_mask"] size_divisor = 32 do_reduce_labels = False - valid_kwargs = Mask2FormerFastImageProcessorKwargs + valid_kwargs = Mask2FormerImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Mask2FormerFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[Mask2FormerImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -259,7 +237,7 @@ def preprocess( images: ImageInput, segmentation_maps: Optional[ImageInput] = None, instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, - **kwargs: Unpack[Mask2FormerFastImageProcessorKwargs], + **kwargs: Unpack[Mask2FormerImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -282,7 +260,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[Mask2FormerFastImageProcessorKwargs], + **kwargs: Unpack[Mask2FormerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py b/src/transformers/models/maskformer/image_processing_maskformer.py index c306093e116d..8353856fb868 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -42,6 +42,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -66,6 +67,25 @@ from torch import nn +class MaskFormerImageProcessorKwargs(ImagesKwargs): + r""" + ignore_index (`int`, *optional*): + Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels + denoted with 0 (background) will be replaced with `ignore_index`. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). + The background label will be replaced by `ignore_index`. + num_labels (`int`, *optional*): + The number of labels in the segmentation map. 
+ """ + + size_divisor: Optional[int] + ignore_index: Optional[int] + do_reduce_labels: Optional[bool] + num_labels: Optional[int] + + # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -446,6 +466,7 @@ class MaskFormerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = MaskFormerImageProcessorKwargs @filter_out_non_signature_kwargs(extra=["max_size", *INIT_SERVICE_KWARGS]) def __init__( diff --git a/src/transformers/models/maskformer/image_processing_maskformer_fast.py b/src/transformers/models/maskformer/image_processing_maskformer_fast.py index 9e15486cfa35..d174b4ada0a9 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer_fast.py +++ b/src/transformers/models/maskformer/image_processing_maskformer_fast.py @@ -25,7 +25,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -46,6 +45,7 @@ logging, ) from .image_processing_maskformer import ( + MaskFormerImageProcessorKwargs, compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, @@ -95,28 +95,6 @@ def convert_segmentation_map_to_binary_masks_fast( return binary_masks.float(), labels.long() -class MaskFormerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - size_divisor (`int`, *optional*, defaults to 32): - Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in - Swin Transformer. - ignore_index (`int`, *optional*): - Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels - denoted with 0 (background) will be replaced with `ignore_index`. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). - The background label will be replaced by `ignore_index`. - num_labels (`int`, *optional*): - The number of labels in the segmentation map. 
- """ - - size_divisor: Optional[int] - ignore_index: Optional[int] - do_reduce_labels: Optional[bool] - num_labels: Optional[int] - - @auto_docstring class MaskFormerImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -132,9 +110,9 @@ class MaskFormerImageProcessorFast(BaseImageProcessorFast): model_input_names = ["pixel_values", "pixel_mask"] size_divisor = 32 do_reduce_labels = False - valid_kwargs = MaskFormerFastImageProcessorKwargs + valid_kwargs = MaskFormerImageProcessorKwargs - def __init__(self, **kwargs: Unpack[MaskFormerFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[MaskFormerImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -264,7 +242,7 @@ def preprocess( images: ImageInput, segmentation_maps: Optional[ImageInput] = None, instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, - **kwargs: Unpack[MaskFormerFastImageProcessorKwargs], + **kwargs: Unpack[MaskFormerImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -287,7 +265,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[MaskFormerFastImageProcessorKwargs], + **kwargs: Unpack[MaskFormerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/mllama/image_processing_mllama.py b/src/transformers/models/mllama/image_processing_mllama.py index 7578ac698db7..cd79f7de3121 100644 --- a/src/transformers/models/mllama/image_processing_mllama.py +++ b/src/transformers/models/mllama/image_processing_mllama.py @@ -38,6 +38,7 @@ to_numpy_array, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging @@ -49,6 +50,15 @@ logger = logging.get_logger(__name__) +class MllamaImageProcessorKwargs(ImagesKwargs): + """ + max_image_tiles (`int`, *optional*): + The maximum number of tiles allowed. 
+ """ + + max_image_tiles: Optional[int] + + @lru_cache(maxsize=10) def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]: """ @@ -567,6 +577,7 @@ class MllamaImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "num_tiles", "aspect_ratio_ids", "aspect_ratio_mask"] + valid_kwargs = MllamaImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index 5420a2deeeb1..3955006a4f9e 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -21,17 +21,11 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -class MllamaImagesKwargs(ImagesKwargs, total=False): - max_image_tiles: Optional[int] - - class MllamaProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: MllamaImagesKwargs - _defaults = { "image_kwargs": { "max_image_tiles": 4, @@ -225,8 +219,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio=None, - videos=None, **kwargs: Unpack[MllamaProcessorKwargs], ) -> BatchFeature: """ @@ -267,10 +259,8 @@ def __call__( **kwargs, ) - text_kwargs = output_kwargs["text_kwargs"] - text_kwargs["return_tensors"] = None + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) images_kwargs = output_kwargs["images_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] data = {} if text is not None: @@ -280,8 +270,7 @@ def __call__( raise ValueError("Invalid input text. 
Please provide a string, or a list of strings") n_images_in_text = [t.count(self.image_token) for t in text] text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text] - _ = text_kwargs.pop("padding_side", None) # hack until padding-side is an accepted kwarg by tokenizers - encoding = self.tokenizer(text, **text_kwargs) + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) self._check_special_mm_tokens(text, encoding, modalities=["image"]) n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]] data.update(encoding) @@ -334,10 +323,7 @@ def __call__( ) data["cross_attention_mask"] = cross_attention_mask - return_tensors = common_kwargs.pop("return_tensors", None) - batch_feature = BatchFeature(data=data, tensor_type=return_tensors) - - return batch_feature + return BatchFeature(data=data, tensor_type=return_tensors) def post_process_image_text_to_text( self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs diff --git a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py index e716553a6d10..dc10170734ec 100644 --- a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +++ b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py @@ -16,8 +16,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - Unpack, ) from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling from ...utils import auto_docstring @@ -36,8 +34,5 @@ class MobileNetV1ImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True - def __init__(self, **kwargs: Unpack[DefaultFastImageProcessorKwargs]) -> None: - super().__init__(**kwargs) - __all__ = ["MobileNetV1ImageProcessorFast"] diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py index 186dc3cf5772..e8dfe992544a 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, is_torch_tensor, logging @@ -50,6 +51,17 @@ logger = logging.get_logger(__name__) +class MobileNetV2ImageProcessorKwargs(ImagesKwargs): + """ + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. 
+ """ + + do_reduce_labels: Optional[bool] + + @requires(backends=("vision",)) class MobileNetV2ImageProcessor(BaseImageProcessor): r""" @@ -96,6 +108,7 @@ class MobileNetV2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = MobileNetV2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py index 6c40fbf3f9b8..2c8329a034c1 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -40,17 +39,7 @@ TensorType, auto_docstring, ) - - -class MobileNetV2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_reduce_labels: Optional[bool] +from .image_processing_mobilenet_v2 import MobileNetV2ImageProcessorKwargs @auto_docstring @@ -66,9 +55,9 @@ class MobileNetV2ImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_reduce_labels = False - valid_kwargs = MobileNetV2FastImageProcessorKwargs + valid_kwargs = MobileNetV2ImageProcessorKwargs - def __init__(self, **kwargs: Unpack[MobileNetV2FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[MobileNetV2ImageProcessorKwargs]): super().__init__(**kwargs) # Copied from transformers.models.beit.image_processing_beit_fast.BeitImageProcessorFast.reduce_label @@ -87,7 +76,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[MobileNetV2FastImageProcessorKwargs], + **kwargs: Unpack[MobileNetV2ImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -102,7 +91,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[MobileNetV2FastImageProcessorKwargs], + **kwargs: Unpack[MobileNetV2ImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 0ea7a0706cc4..576ef9f449dc 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -31,6 +31,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -52,6 +53,20 @@ logger = logging.get_logger(__name__) +class MobileVitImageProcessorKwargs(ImagesKwargs): + """ + do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`): + Whether to flip the color channels from RGB to BGR or vice versa. 
+ do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + """ + + do_flip_channel_order: Optional[bool] + do_reduce_labels: Optional[bool] + + @requires(backends=("vision",)) class MobileViTImageProcessor(BaseImageProcessor): r""" @@ -91,6 +106,7 @@ class MobileViTImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = MobileVitImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit_fast.py b/src/transformers/models/mobilevit/image_processing_mobilevit_fast.py index fab16ecfdc87..81d745c2b54d 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit_fast.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -38,20 +37,7 @@ TensorType, auto_docstring, ) - - -class MobileVitFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`): - Whether to flip the color channels from RGB to BGR or vice versa. - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. - """ - - do_flip_channel_order: Optional[bool] - do_reduce_labels: Optional[bool] +from .image_processing_mobilevit import MobileVitImageProcessorKwargs @auto_docstring @@ -67,9 +53,9 @@ class MobileViTImageProcessorFast(BaseImageProcessorFast): do_convert_rgb = None do_flip_channel_order = True do_reduce_labels = False - valid_kwargs = MobileVitFastImageProcessorKwargs + valid_kwargs = MobileVitImageProcessorKwargs - def __init__(self, **kwargs: Unpack[MobileVitFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[MobileVitImageProcessorKwargs]): super().__init__(**kwargs) # Copied from transformers.models.beit.image_processing_beit_fast.BeitImageProcessorFast.reduce_label @@ -88,7 +74,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[MobileVitFastImageProcessorKwargs], + **kwargs: Unpack[MobileVitImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -103,7 +89,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[MobileVitFastImageProcessorKwargs], + **kwargs: Unpack[MobileVitImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. 
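Before the Nougat changes below, an illustrative usage sketch: caller-facing behavior is unchanged by these renames, because the public entry point is still the processor call itself. The checkpoint is the public `apple/mobilevit-small`; running the sketch assumes torch, torchvision, and Hub access are available.

# Usage sketch, not part of the patch.
from PIL import Image

from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("apple/mobilevit-small", use_fast=True)
image = Image.new("RGB", (256, 256))  # stand-in input for the sketch

# Shared kwargs (e.g. `do_resize`) and model-specific ones (e.g. `do_flip_channel_order`)
# are both declared on MobileVitImageProcessorKwargs after this patch.
inputs = processor(image, do_resize=True, do_flip_channel_order=True, return_tensors="pt")
print(inputs["pixel_values"].shape)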
diff --git a/src/transformers/models/nougat/image_processing_nougat.py b/src/transformers/models/nougat/image_processing_nougat.py index b4ec184773d4..0a5c445645e0 100644 --- a/src/transformers/models/nougat/image_processing_nougat.py +++ b/src/transformers/models/nougat/image_processing_nougat.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging from ...utils.import_utils import is_vision_available @@ -51,6 +52,21 @@ import PIL +class NougatImageProcessorKwargs(ImagesKwargs): + r""" + do_crop_margin (`bool`, *optional*, defaults to `True`): + Whether to crop the image margins. + do_thumbnail (`bool`, *optional*, defaults to `True`): + Whether to resize the image using thumbnail method. + do_align_long_axis (`bool`, *optional*, defaults to `False`): + Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. + """ + + do_crop_margin: Optional[bool] + do_thumbnail: Optional[bool] + do_align_long_axis: Optional[bool] + + class NougatImageProcessor(BaseImageProcessor): r""" Constructs a Nougat image processor. @@ -87,6 +103,7 @@ class NougatImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = NougatImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/nougat/image_processing_nougat_fast.py b/src/transformers/models/nougat/image_processing_nougat_fast.py index 15cee9051082..b059688d0046 100644 --- a/src/transformers/models/nougat/image_processing_nougat_fast.py +++ b/src/transformers/models/nougat/image_processing_nougat_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -42,22 +41,7 @@ TensorType, auto_docstring, ) - - -class NougatFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - do_crop_margin (`bool`, *optional*, defaults to `True`): - Whether to crop the image margins. - do_thumbnail (`bool`, *optional*, defaults to `True`): - Whether to resize the image using thumbnail method. - do_align_long_axis (`bool`, *optional*, defaults to `False`): - Whether to align the long axis of the image with the long axis of `size` by rotating by 90 degrees. 
- """ - - do_crop_margin: Optional[bool] - do_thumbnail: Optional[bool] - do_align_long_axis: Optional[bool] +from .image_processing_nougat import NougatImageProcessorKwargs @auto_docstring @@ -73,13 +57,13 @@ class NougatImageProcessorFast(BaseImageProcessorFast): do_pad: bool = True do_rescale = True do_crop_margin: bool = True - valid_kwargs = NougatFastImageProcessorKwargs + valid_kwargs = NougatImageProcessorKwargs - def __init__(self, **kwargs: Unpack[NougatFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[NougatImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[NougatFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[NougatImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def python_find_non_zero( diff --git a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py index 0c4cfd40eb62..842fe5d9bddf 100644 --- a/src/transformers/models/omdet_turbo/processing_omdet_turbo.py +++ b/src/transformers/models/omdet_turbo/processing_omdet_turbo.py @@ -65,7 +65,6 @@ class OmDetTurboProcessorKwargs(ProcessingKwargs, total=False): "verbose": True, "task": None, }, - "images_kwargs": {}, } @@ -227,8 +226,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[list[str], list[list[str]]]] = None, - audio=None, - videos=None, **kwargs: Unpack[OmDetTurboProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index 756480d81e5a..86ce8abf084e 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -44,6 +44,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -63,6 +64,30 @@ from torch import nn +class OneFormerImageProcessorKwargs(ImagesKwargs): + r""" + repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`): + Path to a local directory or Hugging Face Hub repository containing model metadata. + class_info_file (`str`, *optional*): + Path to the JSON file within the repository that contains class metadata. + num_text (`int`, *optional*): + Number of text queries for the text encoder, used as task-guiding prompts. + num_labels (`int`, *optional*): + Number of semantic classes for segmentation, determining the output layer's size. + ignore_index (`int`, *optional*): + Label to ignore in segmentation maps, often used for padding. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether to decrement all label values by 1, mapping the background class to `ignore_index`. 
+ """ + + repo_path: Optional[str] + class_info_file: Optional[str] + num_text: Optional[int] + num_labels: Optional[int] + ignore_index: Optional[int] + do_reduce_labels: Optional[bool] + + # Copied from transformers.models.detr.image_processing_detr.max_across_indices def max_across_indices(values: Iterable[Any]) -> list[Any]: """ @@ -423,6 +448,7 @@ class OneFormerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask", "task_inputs"] + valid_kwargs = OneFormerImageProcessorKwargs @filter_out_non_signature_kwargs(extra=["max_size", "metadata", *INIT_SERVICE_KWARGS]) def __init__( diff --git a/src/transformers/models/oneformer/image_processing_oneformer_fast.py b/src/transformers/models/oneformer/image_processing_oneformer_fast.py index 4a20a04e70f2..a14b0015b498 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer_fast.py +++ b/src/transformers/models/oneformer/image_processing_oneformer_fast.py @@ -23,7 +23,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, get_max_height_width, group_images_by_shape, reorder_images, @@ -42,7 +41,7 @@ auto_docstring, logging, ) -from .image_processing_oneformer import load_metadata, prepare_metadata +from .image_processing_oneformer import OneFormerImageProcessorKwargs, load_metadata, prepare_metadata logger = logging.get_logger(__name__) @@ -300,30 +299,6 @@ def get_oneformer_resize_output_image_size( return (new_long, new_short) if width <= height else (new_short, new_long) -class OneFormerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - repo_path (`str`, *optional*, defaults to `shi-labs/oneformer_demo`): - Path to a local directory or Hugging Face Hub repository containing model metadata. - class_info_file (`str`, *optional*): - Path to the JSON file within the repository that contains class metadata. - num_text (`int`, *optional*): - Number of text queries for the text encoder, used as task-guiding prompts. - num_labels (`int`, *optional*): - Number of semantic classes for segmentation, determining the output layer's size. - ignore_index (`int`, *optional*): - Label to ignore in segmentation maps, often used for padding. - do_reduce_labels (`bool`, *optional*, defaults to `False`): - Whether to decrement all label values by 1, mapping the background class to `ignore_index`. 
- """ - - repo_path: Optional[str] - class_info_file: Optional[str] - num_text: Optional[int] - num_labels: Optional[int] - ignore_index: Optional[int] - do_reduce_labels: Optional[bool] - - @auto_docstring class OneFormerImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -344,10 +319,10 @@ class OneFormerImageProcessorFast(BaseImageProcessorFast): class_info_file = None num_text = None num_labels = None - valid_kwargs = OneFormerFastImageProcessorKwargs + valid_kwargs = OneFormerImageProcessorKwargs model_input_names = ["pixel_values", "pixel_mask", "task_inputs"] - def __init__(self, **kwargs: Unpack[OneFormerFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[OneFormerImageProcessorKwargs]): super().__init__(**kwargs) if self.class_info_file: self.metadata = prepare_metadata(load_metadata(self.repo_path, self.class_info_file)) @@ -359,7 +334,7 @@ def preprocess( task_inputs: Optional[list[str]] = None, segmentation_maps: Optional[ImageInput] = None, instance_id_to_semantic_id: Optional[Union[list[dict[int, int]], dict[int, int]]] = None, - **kwargs: Unpack[OneFormerFastImageProcessorKwargs], + **kwargs: Unpack[OneFormerImageProcessorKwargs], ) -> BatchFeature: r""" task_inputs (`list[str]`, *optional*): @@ -386,7 +361,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[OneFormerFastImageProcessorKwargs], + **kwargs: Unpack[OneFormerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/ovis2/image_processing_ovis2.py b/src/transformers/models/ovis2/image_processing_ovis2.py index 633a7fdee46c..5d0c2e2097d3 100644 --- a/src/transformers/models/ovis2/image_processing_ovis2.py +++ b/src/transformers/models/ovis2/image_processing_ovis2.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -43,6 +44,29 @@ logger = logging.get_logger(__name__) +class Ovis2ImageProcessorKwargs(ImagesKwargs): + """ + crop_to_patches (`bool`, *optional*, defaults to `False`): + Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the + `preprocess` method. + min_patches (`int`, *optional*, defaults to 1): + The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. + max_patches (`int`, *optional*, defaults to 12): + The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is + set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. + use_covering_area_grid (`bool`, *optional*, defaults to `True`): + Whether to use the covering area grid to determine the number of patches. Only has an effect if + `crop_to_patches` is set to `True`. Can be overridden by the `use_covering_area_grid` parameter in the + `preprocess` method. 
+ """ + + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + use_covering_area_grid: Optional[bool] + + # Similar to image_processing_mllama.get_all_supported_aspect_ratios @lru_cache(maxsize=10) def get_all_supported_aspect_ratios(min_image_tiles: int, max_image_tiles: int) -> list[tuple[int, int]]: @@ -224,6 +248,7 @@ class Ovis2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = Ovis2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/ovis2/image_processing_ovis2_fast.py b/src/transformers/models/ovis2/image_processing_ovis2_fast.py index 04b79299e9e1..ea618e073526 100644 --- a/src/transformers/models/ovis2/image_processing_ovis2_fast.py +++ b/src/transformers/models/ovis2/image_processing_ovis2_fast.py @@ -21,7 +21,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -37,31 +36,7 @@ TensorType, auto_docstring, ) -from .image_processing_ovis2 import get_min_tile_covering_grid, get_optimal_tiled_canvas - - -class Ovis2ImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - crop_to_patches (`bool`, *optional*, defaults to `False`): - Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the - `preprocess` method. - min_patches (`int`, *optional*, defaults to 1): - The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method. - max_patches (`int`, *optional*, defaults to 12): - The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is - set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method. - use_covering_area_grid (`bool`, *optional*, defaults to `True`): - Whether to use the covering area grid to determine the number of patches. Only has an effect if - `crop_to_patches` is set to `True`. Can be overridden by the `use_covering_area_grid` parameter in the - `preprocess` method. 
- """ - - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] - use_covering_area_grid: Optional[bool] +from .image_processing_ovis2 import Ovis2ImageProcessorKwargs, get_min_tile_covering_grid, get_optimal_tiled_canvas @auto_docstring diff --git a/src/transformers/models/owlv2/image_processing_owlv2_fast.py b/src/transformers/models/owlv2/image_processing_owlv2_fast.py index 417fc800ea88..25022f4f6c8c 100644 --- a/src/transformers/models/owlv2/image_processing_owlv2_fast.py +++ b/src/transformers/models/owlv2/image_processing_owlv2_fast.py @@ -25,17 +25,9 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import center_to_corners_format, group_images_by_shape, reorder_images -from ...image_utils import ( - OPENAI_CLIP_MEAN, - OPENAI_CLIP_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - SizeDict, -) -from ...processing_utils import Unpack +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension, PILImageResampling, SizeDict from ...utils import TensorType, auto_docstring from .image_processing_owlv2 import _scale_boxes, box_iou @@ -44,9 +36,6 @@ from .modeling_owlv2 import Owlv2ObjectDetectionOutput -class Owlv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... - - @auto_docstring class Owlv2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -63,7 +52,6 @@ class Owlv2ImageProcessorFast(BaseImageProcessorFast): model_input_names = ["pixel_values"] rescale_factor = 1 / 255 do_pad = True - valid_kwargs = Owlv2FastImageProcessorKwargs def post_process(self, outputs, target_sizes): """ @@ -240,13 +228,6 @@ def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_thresh return results - def __init__(self, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): - super().__init__(**kwargs) - - @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): - return super().preprocess(images, **kwargs) - def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor": """ Pad an image with zeros to the given size. diff --git a/src/transformers/models/owlv2/modular_owlv2.py b/src/transformers/models/owlv2/modular_owlv2.py index 66acd2088399..c58db1efd46e 100644 --- a/src/transformers/models/owlv2/modular_owlv2.py +++ b/src/transformers/models/owlv2/modular_owlv2.py @@ -21,20 +21,16 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import ( - BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, ) from ...image_transforms import group_images_by_shape, reorder_images from ...image_utils import ( OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension, - ImageInput, PILImageResampling, SizeDict, ) -from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, @@ -42,9 +38,6 @@ from ..owlvit.image_processing_owlvit_fast import OwlViTImageProcessorFast -class Owlv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... 
- - @auto_docstring class Owlv2ImageProcessorFast(OwlViTImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -56,17 +49,9 @@ class Owlv2ImageProcessorFast(OwlViTImageProcessorFast): do_rescale = True do_normalize = True do_pad = True - valid_kwargs = Owlv2FastImageProcessorKwargs crop_size = None do_center_crop = None - def __init__(self, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): - BaseImageProcessorFast.__init__(self, **kwargs) - - @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): - return BaseImageProcessorFast.preprocess(self, images, **kwargs) - def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor": """ Pad an image with zeros to the given size. diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index c22f9b045ae9..65f111e2ca79 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -47,7 +47,6 @@ class Owlv2ProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "padding": "max_length", }, - "images_kwargs": {}, "common_kwargs": { "return_tensors": "np", }, @@ -79,8 +78,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Owlv2ProcessorKwargs], ) -> BatchFeature: """ @@ -123,7 +120,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index e4feef67da9d..e7fb401d9a76 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -47,7 +47,6 @@ class OwlViTProcessorKwargs(ProcessingKwargs, total=False): "text_kwargs": { "padding": "max_length", }, - "images_kwargs": {}, "common_kwargs": { "return_tensors": "np", }, @@ -89,8 +88,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[OwlViTProcessorKwargs], ) -> BatchFeature: """ @@ -133,7 +130,7 @@ def __call__( **kwargs, ) query_images = output_kwargs["images_kwargs"].pop("query_images", None) - return_tensors = output_kwargs["common_kwargs"]["return_tensors"] + return_tensors = output_kwargs["text_kwargs"]["return_tensors"] if text is None and query_images is None and images is None: raise ValueError( diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index c4c618a4d958..7fa636ab796b 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -23,7 +23,6 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, is_valid_image from ...processing_utils import ( - ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, @@ -44,13 +43,8 @@ class PaliGemmaTextKwargs(TextKwargs): suffix: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], 
list[PreTokenizedInput]]] -class PaliGemmaImagesKwargs(ImagesKwargs): - do_convert_rgb: Optional[bool] - - class PaliGemmaProcessorKwargs(ProcessingKwargs, total=False): text_kwargs: PaliGemmaTextKwargs - images_kwargs: PaliGemmaImagesKwargs _defaults = { "text_kwargs": { "padding": False, @@ -150,8 +144,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[PaliGemmaProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py index c26132a48439..c6491b4bc703 100644 --- a/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py +++ b/src/transformers/models/perception_lm/image_processing_perception_lm_fast.py @@ -25,7 +25,6 @@ ) from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, get_image_size, group_images_by_shape, reorder_images, @@ -36,11 +35,14 @@ ChannelDimension, PILImageResampling, ) -from ...processing_utils import Unpack -from ...utils import TensorType, auto_docstring +from ...processing_utils import ImagesKwargs, Unpack +from ...utils import ( + TensorType, + auto_docstring, +) -class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class PerceptionLMImageProcessorKwargs(ImagesKwargs): r""" vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`): Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for @@ -51,9 +53,9 @@ class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): Maximum number of tiles an image can be split into based on its aspect ratio. 
""" - vision_input_type: str = "thumb+tile" - tile_size: int = 448 - max_num_tiles: int = 36 + vision_input_type: Optional[str] + tile_size: Optional[int] + max_num_tiles: Optional[int] @auto_docstring @@ -66,14 +68,17 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True + vision_input_type = "thumb+tail" + tile_size = 448 + max_num_tiles = 36 size = {"width": 448, "height": 448} # for backward compatibility in tests - valid_kwargs = PerceptionLMFastImageProcessorKwargs + valid_kwargs = PerceptionLMImageProcessorKwargs - def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[PerceptionLMImageProcessorKwargs]) -> None: super().__init__(**kwargs) @auto_docstring - def preprocess(self, images, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images, **kwargs: Unpack[PerceptionLMImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) @staticmethod @@ -267,7 +272,7 @@ def _preprocess( max_num_tiles: int, return_tensors: Optional[Union[str, TensorType]], disable_grouping: bool, - **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs], + **kwargs: Unpack[PerceptionLMImageProcessorKwargs], ) -> BatchFeature: # Group images by size for batched transformation grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) diff --git a/src/transformers/models/perception_lm/processing_perception_lm.py b/src/transformers/models/perception_lm/processing_perception_lm.py index 35f0fef6c4ca..e57418ef92f7 100644 --- a/src/transformers/models/perception_lm/processing_perception_lm.py +++ b/src/transformers/models/perception_lm/processing_perception_lm.py @@ -89,7 +89,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[PerceptionLMProcessorKwargs], ) -> BatchFeature: diff --git a/src/transformers/models/perception_lm/video_processing_perception_lm.py b/src/transformers/models/perception_lm/video_processing_perception_lm.py index 1023aa7c589d..9b5c5d0b67a0 100644 --- a/src/transformers/models/perception_lm/video_processing_perception_lm.py +++ b/src/transformers/models/perception_lm/video_processing_perception_lm.py @@ -14,13 +14,9 @@ """Video processor class for PerceptionLM.""" from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling -from ...processing_utils import Unpack, VideosKwargs from ...video_processing_utils import BaseVideoProcessor -class PerceptionLMFastVideoProcessorInitKwargs(VideosKwargs): ... 
- - class PerceptionLMVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = IMAGENET_STANDARD_MEAN @@ -31,11 +27,6 @@ class PerceptionLMVideoProcessor(BaseVideoProcessor): do_rescale = True do_normalize = True do_convert_rgb = True - valid_kwargs = PerceptionLMFastVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - - def __init__(self, **kwargs: Unpack[PerceptionLMFastVideoProcessorInitKwargs]): - super().__init__(**kwargs) __all__ = ["PerceptionLMVideoProcessor"] diff --git a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py index 4bd9928daa94..5c750fae953e 100644 --- a/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +++ b/src/transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py @@ -21,10 +21,10 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, BatchFeature, - DefaultFastImageProcessorKwargs, Unpack, ) from ...image_utils import ImageInput, PILImageResampling, SizeDict +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, auto_docstring, @@ -35,7 +35,7 @@ logger = logging.get_logger(__name__) -class Phi4MultimodalFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Phi4MultimodalImageProcessorKwargs(ImagesKwargs): r""" patch_size (`int`, *optional*): The size of the patch. @@ -59,10 +59,10 @@ class Phi4MultimodalImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True - valid_kwargs = Phi4MultimodalFastImageProcessorKwargs + valid_kwargs = Phi4MultimodalImageProcessorKwargs model_input_names = ["image_pixel_values", "image_sizes", "image_attention_mask"] - def __init__(self, **kwargs: Unpack[Phi4MultimodalFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Phi4MultimodalImageProcessorKwargs]): super().__init__(**kwargs) def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size): @@ -158,7 +158,7 @@ def pad_mask_to_max_num_crops(self, masks, max_crops=5): def preprocess( self, images: ImageInput, - **kwargs: Unpack[Phi4MultimodalFastImageProcessorKwargs], + **kwargs: Unpack[Phi4MultimodalImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py index 316f2021461c..e0c630369029 100644 --- a/src/transformers/models/pix2struct/image_processing_pix2struct.py +++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py @@ -32,6 +32,7 @@ to_numpy_array, valid_images, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_torch_available, is_vision_available, logging from ...utils.import_utils import requires_backends @@ -48,6 +49,18 @@ DEFAULT_FONT_PATH = "ybelkada/fonts" +class Pix2StructImageProcessorKwargs(ImagesKwargs): + """ + max_patches (`int`, *optional*): + Maximum number of patches to extract. + header_text (`Union[list[str], str]`, *optional*): + Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`. 
+ """ + + max_patches: Optional[int] + header_text: Optional[Union[list[str], str]] + + # adapted from: https://discuss.pytorch.org/t/tf-image-extract-patches-in-pytorch/171409/2 def torch_extract_patches(image_tensor, patch_height, patch_width): """ @@ -208,6 +221,7 @@ class Pix2StructImageProcessor(BaseImageProcessor): """ model_input_names = ["flattened_patches", "attention_mask"] + valid_kwargs = Pix2StructImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py index f21dd5d7a002..fba2fe93ef19 100644 --- a/src/transformers/models/pix2struct/processing_pix2struct.py +++ b/src/transformers/models/pix2struct/processing_pix2struct.py @@ -16,21 +16,15 @@ Processor class for Pix2Struct. """ -from typing import Optional, Union +from typing import Union from ...feature_extraction_utils import BatchFeature -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import logging -class Pix2StructImagesKwargs(ImagesKwargs, total=False): - max_patches: Optional[int] - header_text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] - - class Pix2StructProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Pix2StructImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -79,8 +73,6 @@ def __call__( self, images=None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[Pix2StructProcessorKwargs], ) -> Union[BatchEncoding, BatchFeature]: """ diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 2be667e5dc6e..f5df895e66a4 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -38,6 +38,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging from ...utils.import_utils import requires_backends @@ -49,6 +50,15 @@ import PIL +class PixtralImageProcessorKwargs(ImagesKwargs): + """ + patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): + Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. + """ + + patch_size: Optional[dict[str, int]] + + # Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white. 
def convert_to_rgb(image: ImageInput) -> ImageInput: """ @@ -171,6 +181,7 @@ class PixtralImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_sizes"] + valid_kwargs = PixtralImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/pixtral/image_processing_pixtral_fast.py b/src/transformers/models/pixtral/image_processing_pixtral_fast.py index b31f910e4817..4a877d633048 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral_fast.py +++ b/src/transformers/models/pixtral/image_processing_pixtral_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -33,21 +32,12 @@ auto_docstring, logging, ) -from .image_processing_pixtral import get_resize_output_image_size +from .image_processing_pixtral import PixtralImageProcessorKwargs, get_resize_output_image_size logger = logging.get_logger(__name__) -class PixtralFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - patch_size (`dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): - Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. - """ - - patch_size: Optional[dict[str, int]] - - @auto_docstring class PixtralImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -60,15 +50,15 @@ class PixtralImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_convert_rgb = True - valid_kwargs = PixtralFastImageProcessorKwargs + valid_kwargs = PixtralImageProcessorKwargs model_input_names = ["pixel_values", "image_sizes"] - def __init__(self, **kwargs: Unpack[PixtralFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[PixtralImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[PixtralFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[PixtralImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index bf4eb9307c72..5bb9fd780328 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -45,7 +45,6 @@ class PixtralProcessorKwargs(ProcessingKwargs, total=False): "padding": False, "return_mm_token_type_ids": False, }, - "images_kwargs": {}, "common_kwargs": { "return_tensors": "pt", }, @@ -120,8 +119,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[PixtralProcessorKwargs], ) -> BatchFeature: """ @@ -164,7 +161,8 @@ def __call__( patch_size = self.patch_size * self.spatial_merge_size if images is not None: - image_inputs = self.image_processor(images, patch_size=patch_size, **output_kwargs["images_kwargs"]) + output_kwargs["images_kwargs"]["patch_size"] = patch_size + image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) else: image_inputs = {} diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index 
ce3cd398745c..7d03f8281285 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -47,6 +48,15 @@ logger = logging.get_logger(__name__) +class PoolFormerImageProcessorKwargs(ImagesKwargs): + r""" + crop_pct (`float`, *optional*, defaults to `self.crop_pct`): + Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. + """ + + crop_pct: Optional[float] + + class PoolFormerImageProcessor(BaseImageProcessor): r""" Constructs a PoolFormer image processor. @@ -99,6 +109,7 @@ class PoolFormerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = PoolFormerImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/poolformer/image_processing_poolformer_fast.py b/src/transformers/models/poolformer/image_processing_poolformer_fast.py index 62d5f276859f..594d076a924c 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer_fast.py +++ b/src/transformers/models/poolformer/image_processing_poolformer_fast.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature from ...image_transforms import ( ChannelDimension, get_resize_output_image_size, @@ -40,16 +40,7 @@ TensorType, auto_docstring, ) - - -class PoolFormerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - crop_pct (`float`, *optional*, defaults to `self.crop_pct`): - Percentage of the image to crop. Only has an effect if `do_resize` is set to `True`. 
- """ - - crop_pct: Optional[float] +from .image_processing_poolformer import PoolFormerImageProcessorKwargs @auto_docstring @@ -65,13 +56,13 @@ class PoolFormerImageProcessorFast(BaseImageProcessorFast): do_center_crop = True do_rescale = True do_normalize = True - valid_kwargs = PoolFormerFastImageProcessorKwargs + valid_kwargs = PoolFormerImageProcessorKwargs - def __init__(self, **kwargs: Unpack[PoolFormerFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[PoolFormerImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[PoolFormerFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[PoolFormerImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py index 7224aeef8612..b8220a30fa42 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py @@ -38,6 +38,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -53,6 +54,22 @@ logger = logging.get_logger(__name__) +class PromptDepthAnythingImageProcessorKwargs(ImagesKwargs): + r""" + keep_aspect_ratio (`bool`, *optional*): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. + ensure_multiple_of (`int`, *optional*): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. + prompt_scale_to_meter (`float`, *optional*): + Scale factor to convert the prompt depth to meters. 
+ """ + + keep_aspect_ratio: Optional[bool] + ensure_multiple_of: Optional[int] + size_divisor: Optional[int] + prompt_scale_to_meter: Optional[float] + + def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): x = round(val / multiple) * multiple @@ -136,6 +153,7 @@ class PromptDepthAnythingImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "prompt_depth"] + valid_kwargs = PromptDepthAnythingImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py index 06d6ed156443..552d921700bc 100644 --- a/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +++ b/src/transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py @@ -28,7 +28,6 @@ from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -45,6 +44,7 @@ auto_docstring, requires_backends, ) +from .image_processing_prompt_depth_anything import PromptDepthAnythingImageProcessorKwargs def _constrain_to_multiple_of(val, multiple, min_val=0, max_val=None): @@ -89,27 +89,6 @@ def _get_resize_output_image_size( return (new_height, new_width) -class PromptDepthAnythingFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - keep_aspect_ratio (`bool`, *optional*): - If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. - ensure_multiple_of (`int`, *optional*): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. - do_pad (`bool`, *optional*): - Whether to apply center padding. - size_divisor (`int`, *optional*): - If `do_pad` is `True`, pads the image dimensions to be divisible by this value. - prompt_scale_to_meter (`float`, *optional*): - Scale factor to convert the prompt depth to meters. 
- """ - - keep_aspect_ratio: Optional[bool] - ensure_multiple_of: Optional[int] - do_pad: Optional[bool] - size_divisor: Optional[int] - prompt_scale_to_meter: Optional[float] - - @auto_docstring class PromptDepthAnythingImageProcessorFast(BaseImageProcessorFast): model_input_names = ["pixel_values", "prompt_depth"] @@ -126,9 +105,9 @@ class PromptDepthAnythingImageProcessorFast(BaseImageProcessorFast): do_pad = False size_divisor = None prompt_scale_to_meter = 0.001 - valid_kwargs = PromptDepthAnythingFastImageProcessorKwargs + valid_kwargs = PromptDepthAnythingImageProcessorKwargs - def __init__(self, **kwargs: Unpack[PromptDepthAnythingFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[PromptDepthAnythingImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring @@ -136,7 +115,7 @@ def preprocess( self, images: ImageInput, prompt_depth: Optional[ImageInput] = None, - **kwargs: Unpack[PromptDepthAnythingFastImageProcessorKwargs], + **kwargs: Unpack[PromptDepthAnythingImageProcessorKwargs], ) -> BatchFeature: r""" prompt_depth (`ImageInput`, *optional*): @@ -213,7 +192,7 @@ def _preprocess_image_like_inputs( device: Optional[Union[str, "torch.device"]] = None, prompt_scale_to_meter: Optional[float] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs: Unpack[PromptDepthAnythingFastImageProcessorKwargs], + **kwargs: Unpack[PromptDepthAnythingImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs, including the main images and optional prompt depth. diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py index 5fcbb0c535f9..95f687e1414a 100644 --- a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py @@ -25,34 +25,29 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput from ...video_utils import VideoInput +# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni +# and does not use them in video processor class class Qwen2_5_OmniVideosKwargs(VideosKwargs): - fps: Optional[list[Union[int, float]]] - use_audio_in_video: Optional[bool] - seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - -class Qwen2_5_OmniImagesKwargs(ImagesKwargs): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] + min_frames: Optional[int] + max_frames: Optional[int] + use_audio_in_video: Optional[bool] + seconds_per_chunk: Optional[float] + position_id_per_seconds: Optional[int] class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False): videos_kwargs: Qwen2_5_OmniVideosKwargs - images_kwargs: Qwen2_5_OmniImagesKwargs + _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index 2a2ee775b7be..3d3f325e86f6 100644 --- 
a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -40,7 +40,7 @@ VisionAttention, VisionRotaryEmbedding, ) -from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLImagesKwargs, Qwen2VLProcessor +from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor from ...activations import ACT2FN from ...cache_utils import Cache @@ -49,7 +49,7 @@ from ...image_utils import ImageInput from ...modeling_flash_attention_utils import is_flash_attn_available from ...modeling_layers import GradientCheckpointingLayer -from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack, VideosKwargs +from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torchdynamo_compiling, logging from ...video_utils import VideoInput @@ -839,17 +839,7 @@ def prepare_inputs_for_generation( return model_inputs -class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] - - -class Qwen2_5_VLImagesKwargs(Qwen2VLImagesKwargs): - pass - - class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen2_5_VLImagesKwargs - videos_kwargs: Qwen2_5_VLVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index 0b2fc3dbfc38..8d249fb2d51c 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -30,26 +30,12 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...video_utils import VideoInput -class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[list[float], float] - - -class Qwen2_5_VLImagesKwargs(ImagesKwargs): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen2_5_VLImagesKwargs - videos_kwargs: Qwen2_5_VLVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py index 36a58d68730b..fe218bd05b9d 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py @@ -44,6 +44,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging from ...video_utils import VideoInput, make_batched_videos @@ -51,6 +52,27 @@ logger = logging.get_logger(__name__) +class Qwen2VLImageProcessorKwargs(ImagesKwargs): + r""" + min_pixels (`int`, *optional*, defaults to `56 * 56`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): + The max pixels of the image to resize the image. 
+ patch_size (`int`, *optional*, defaults to 14): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to 2): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to 2): + The merge size of the vision encoder to llm encoder. + """ + + min_pixels: Optional[int] + max_pixels: Optional[int] + patch_size: Optional[int] + temporal_patch_size: Optional[int] + merge_size: Optional[int] + + def smart_resize( height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280 ): @@ -116,6 +138,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"] + valid_kwargs = Qwen2VLImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py index ec9878da3222..bcbcaa6a8ad4 100644 --- a/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +++ b/src/transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py @@ -27,7 +27,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -46,33 +45,12 @@ logging, ) from ...video_utils import VideoInput, make_batched_videos -from .image_processing_qwen2_vl import smart_resize +from .image_processing_qwen2_vl import Qwen2VLImageProcessorKwargs, smart_resize logger = logging.get_logger(__name__) -class Qwen2VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spatial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. 
- """ - - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - @auto_docstring class Qwen2VLImageProcessorFast(BaseImageProcessorFast): do_resize = True @@ -88,10 +66,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast): merge_size = 2 min_pixels = None max_pixels = None - valid_kwargs = Qwen2VLFastImageProcessorKwargs + valid_kwargs = Qwen2VLImageProcessorKwargs model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"] - def __init__(self, **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Qwen2VLImageProcessorKwargs]): size = kwargs.pop("size", None) min_pixels = kwargs.pop("min_pixels", None) max_pixels = kwargs.pop("max_pixels", None) @@ -136,7 +114,7 @@ def preprocess( self, images: ImageInput, videos: Optional[VideoInput] = None, - **kwargs: Unpack[Qwen2VLFastImageProcessorKwargs], + **kwargs: Unpack[Qwen2VLImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, videos, **kwargs) @@ -147,7 +125,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[DefaultFastImageProcessorKwargs], + **kwargs: Unpack[Qwen2VLImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py index b237cb6079fb..f630d039edbd 100644 --- a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py @@ -27,7 +27,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging from ...video_utils import VideoInput @@ -36,16 +36,7 @@ logger = logging.get_logger(__name__) -class Qwen2VLImagesKwargs(ImagesKwargs): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Qwen2VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen2VLImagesKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py index 86041fc3de16..a118f7d2260b 100644 --- a/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py @@ -20,41 +20,35 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import re -from typing import Optional, Union +from typing import Optional import numpy as np from ...audio_utils import AudioInput from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, VideosKwargs +from ...processing_utils import ProcessingKwargs, ProcessorMixin, VideosKwargs from ...tokenization_utils_base import TextInput from ...video_utils import VideoInput, make_batched_videos +# Redefine kwargs for videos because Qwen-Omni uses some kwargs for processing omni +# and does not use them in video processor class class Qwen3OmniMoeVideosKwargs(VideosKwargs): - fps: Optional[list[Union[int, float]]] - use_audio_in_video: Optional[bool] - seconds_per_chunk: Optional[float] - position_id_per_seconds: Optional[int] - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - -class Qwen3OmniMoeImagesKwargs(ImagesKwargs): min_pixels: Optional[int] max_pixels: Optional[int] patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] + min_frames: Optional[int] + max_frames: Optional[int] + use_audio_in_video: Optional[bool] + seconds_per_chunk: Optional[float] + position_id_per_seconds: Optional[int] class Qwen3OmniMoeProcessorKwargs(ProcessingKwargs, total=False): videos_kwargs: Qwen3OmniMoeVideosKwargs - images_kwargs: Qwen3OmniMoeImagesKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 9c479c6bdd23..ff5346fb94b6 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -31,7 +31,7 @@ from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update, rope_config_validation from ...modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...processing_utils import ProcessingKwargs, Unpack, VideosKwargs +from ...processing_utils import ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, is_torchdynamo_compiling, logging from ...utils.generic import check_model_inputs @@ -50,7 +50,7 @@ VisionAttention, VisionRotaryEmbedding, ) -from ..qwen2_vl.processing_qwen2_vl import Qwen2VLImagesKwargs, Qwen2VLProcessor +from ..qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor from ..qwen3.modeling_qwen3 import ( Qwen3Attention, Qwen3DecoderLayer, @@ -1252,17 +1252,7 @@ def prepare_inputs_for_generation( return model_inputs -class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False): - pass - - -class Qwen3VLImagesKwargs(Qwen2VLImagesKwargs): - pass - - class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen3VLImagesKwargs - videos_kwargs: Qwen3VLVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py index 03c709cb0b3f..49199f0c3ecc 100644 --- a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py @@ -19,13 +19,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Union +from typing import Union import numpy as np from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput -from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs +from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging from ...video_utils import VideoInput @@ -34,21 +34,7 @@ logger = logging.get_logger(__name__) -class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False): - pass - - -class Qwen3VLImagesKwargs(ImagesKwargs): - min_pixels: Optional[int] - max_pixels: Optional[int] - patch_size: Optional[int] - temporal_patch_size: Optional[int] - merge_size: Optional[int] - - class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Qwen3VLImagesKwargs - videos_kwargs: Qwen3VLVideosProcessorKwargs _defaults = { "text_kwargs": { "padding": False, diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index cf657867a9f8..ec65d0e7731d 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -48,6 +48,7 @@ validate_annotations, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( filter_out_non_signature_kwargs, is_torch_available, @@ -66,6 +67,29 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,) +class RTDetrImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. 
+ """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]: """ @@ -406,6 +430,7 @@ class RTDetrImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = RTDetrImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py index 9aae271deacc..44946eeed9e3 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -13,7 +13,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -34,24 +33,7 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, requires_backends from ...utils.import_utils import requires -from .image_processing_rt_detr import get_size_with_aspect_ratio - - -class RTDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. - """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] +from .image_processing_rt_detr import RTDetrImageProcessorKwargs, get_size_with_aspect_ratio SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -130,10 +112,10 @@ class RTDetrImageProcessorFast(BaseImageProcessorFast): size = {"height": 640, "width": 640} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = RTDetrFastImageProcessorKwargs + valid_kwargs = RTDetrImageProcessorKwargs do_convert_annotations = True - def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[RTDetrImageProcessorKwargs]) -> None: # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations") do_normalize = kwargs.get("do_normalize") @@ -356,26 +338,9 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[RTDetrFastImageProcessorKwargs], + **kwargs: Unpack[RTDetrImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. 
If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/rt_detr/modular_rt_detr.py b/src/transformers/models/rt_detr/modular_rt_detr.py index 61bd055144f0..0f72fdd52845 100644 --- a/src/transformers/models/rt_detr/modular_rt_detr.py +++ b/src/transformers/models/rt_detr/modular_rt_detr.py @@ -4,7 +4,7 @@ import torch from torchvision.transforms.v2 import functional as F -from transformers.models.detr.image_processing_detr_fast import DetrFastImageProcessorKwargs, DetrImageProcessorFast +from transformers.models.detr.image_processing_detr_fast import DetrImageProcessorFast from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict, get_max_height_width @@ -26,6 +26,7 @@ logging, requires_backends, ) +from .image_processing_rt_detr import RTDetrImageProcessorKwargs logger = logging.get_logger(__name__) @@ -92,10 +93,6 @@ def prepare_coco_detection_annotation( return new_target -class RTDetrFastImageProcessorKwargs(DetrFastImageProcessorKwargs): - pass - - class RTDetrImageProcessorFast(DetrImageProcessorFast): resample = PILImageResampling.BILINEAR image_mean = IMAGENET_DEFAULT_MEAN @@ -109,9 +106,9 @@ class RTDetrImageProcessorFast(DetrImageProcessorFast): size = {"height": 640, "width": 640} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = RTDetrFastImageProcessorKwargs + valid_kwargs = RTDetrImageProcessorKwargs - def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[RTDetrImageProcessorKwargs]) -> None: # Backwards compatibility do_convert_annotations = kwargs.get("do_convert_annotations") do_normalize = kwargs.get("do_normalize") @@ -123,11 +120,9 @@ def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None: def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[RTDetrFastImageProcessorKwargs], + **kwargs: Unpack[RTDetrImageProcessorKwargs], ) -> BatchFeature: - return BaseImageProcessorFast.preprocess(self, images, annotations, masks_path, **kwargs) + return BaseImageProcessorFast.preprocess(self, images, **kwargs) def prepare_annotation( self, diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index 91d3867484ad..e9da260a6e9c 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -37,6 +37,7 @@ 
valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -57,6 +58,19 @@ logger = logging.get_logger(__name__) +class SamImageProcessorKwargs(ImagesKwargs): + r""" + mask_size (`dict[str, int]`, *optional*): + The size `{"longest_edge": int}` to resize the segmentation maps to. + mask_pad_size (`dict[str, int]`, *optional*): + The size `{"height": int, "width": int}` to pad the segmentation maps to. Must be larger than any segmentation + map size provided for preprocessing. + """ + + mask_size: Optional[dict[str, int]] + mask_pad_size: Optional[dict[str, int]] + + class SamImageProcessor(BaseImageProcessor): r""" Constructs a SAM image processor. @@ -107,6 +121,7 @@ class SamImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = SamImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/sam/image_processing_sam_fast.py b/src/transformers/models/sam/image_processing_sam_fast.py index 7cb5f7b2bfc2..e77b69ee1e2b 100644 --- a/src/transformers/models/sam/image_processing_sam_fast.py +++ b/src/transformers/models/sam/image_processing_sam_fast.py @@ -26,10 +26,7 @@ from torchvision.transforms.v2 import functional as F_t from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, -) +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -41,19 +38,7 @@ ) from ...processing_utils import Unpack from ...utils import auto_docstring - - -class SamFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - mask_size (`dict[str, int]`, *optional*): - The size `{"longest_edge": int}` to resize the segmentation maps to. - mask_pad_size (`dict[str, int]`, *optional*): - The size `{"height": int, "width": int}` to pad the segmentation maps to. Must be larger than any segmentation - map size provided for preprocessing. - """ - - mask_size: Optional[dict[str, int]] - mask_pad_size: Optional[dict[str, int]] +from .image_processing_sam import SamImageProcessorKwargs @auto_docstring @@ -68,13 +53,13 @@ class SamImageProcessorFast(BaseImageProcessorFast): do_normalize = True do_convert_rgb = True - valid_kwargs = SamFastImageProcessorKwargs + valid_kwargs = SamImageProcessorKwargs do_pad = True pad_size = {"height": 1024, "width": 1024} mask_pad_size = {"height": 256, "width": 256} - def __init__(self, **kwargs: Unpack[SamFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[SamImageProcessorKwargs]): super().__init__(**kwargs) def _get_preprocess_shape(self, old_shape: tuple[int, int], longest_edge: int): @@ -172,7 +157,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[SamFastImageProcessorKwargs], + **kwargs: Unpack[SamImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -187,7 +172,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[SamFastImageProcessorKwargs], + **kwargs: Unpack[SamImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. 
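Note on the SAM changes above: `mask_size` and `mask_pad_size` now live on a shared `SamImageProcessorKwargs` imported by both the slow and fast processors, instead of a fast-only kwargs class. A minimal usage sketch, illustrative only; the random arrays stand in for real images and segmentation maps, and the call assumes this branch is installed:

```python
import numpy as np

from transformers import SamImageProcessor

# Dummy inputs: a random RGB image and an all-background segmentation map.
image = np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8)
segmentation_map = np.zeros((512, 512), dtype=np.uint8)

processor = SamImageProcessor()
inputs = processor(
    images=image,
    segmentation_maps=segmentation_map,
    # Both keys are declared on SamImageProcessorKwargs, so they are handled
    # like any other image kwarg rather than as fast-processor-only extras.
    mask_size={"longest_edge": 256},
    mask_pad_size={"height": 256, "width": 256},
    return_tensors="np",
)
print(sorted(inputs.keys()))
```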
diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index c9ccbc93a413..bc82daf2034d 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -23,9 +23,8 @@ from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin -from ...tokenization_utils_base import AudioInput, BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_torch_available -from ...video_utils import VideoInput if is_torch_available(): @@ -38,6 +37,8 @@ class SamImagesKwargs(ImagesKwargs): input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] point_pad_value: Optional[int] + mask_size: Optional[dict[str, int]] + mask_pad_size: Optional[dict[str, int]] class SamProcessorKwargs(ProcessingKwargs, total=False): @@ -73,8 +74,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio: Optional[AudioInput] = None, - video: Optional[VideoInput] = None, **kwargs, ) -> BatchEncoding: """ @@ -114,7 +113,7 @@ def __call__( input_points=input_points, input_labels=input_labels, input_boxes=input_boxes, - return_tensors=output_kwargs["common_kwargs"].get("return_tensors"), + return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), point_pad_value=point_pad_value, ) diff --git a/src/transformers/models/sam2/image_processing_sam2_fast.py b/src/transformers/models/sam2/image_processing_sam2_fast.py index 30e99980f4d1..5ae472f53638 100644 --- a/src/transformers/models/sam2/image_processing_sam2_fast.py +++ b/src/transformers/models/sam2/image_processing_sam2_fast.py @@ -29,7 +29,7 @@ from torchvision.ops.boxes import batched_nms from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -39,11 +39,11 @@ SizeDict, pil_torch_interpolation_mapping, ) -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import TensorType, auto_docstring -class Sam2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Sam2FastImageProcessorKwargs(ImagesKwargs): r""" mask_size (`dict[str, int]`, *optional*): The size `{"height": int, "width": int}` to resize the segmentation maps to. 
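The `Sam2FastImageProcessorKwargs(ImagesKwargs)` change above is the pattern this diff applies everywhere: model-specific kwargs TypedDicts now extend the shared `ImagesKwargs` rather than `DefaultFastImageProcessorKwargs`. A self-contained sketch of the pattern, using hypothetical names that are not part of the library:

```python
from typing import Optional

from typing_extensions import TypedDict, Unpack


class SharedImagesKwargs(TypedDict, total=False):
    # Stand-in for the shared ImagesKwargs keys.
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]
    return_tensors: Optional[str]


class MyModelImageProcessorKwargs(SharedImagesKwargs, total=False):
    # Model-specific extras are simply layered on top of the shared keys.
    mask_size: Optional[dict[str, int]]


def preprocess(images: list, **kwargs: Unpack[MyModelImageProcessorKwargs]) -> dict:
    # A single Unpack annotation exposes both the shared and the model-specific
    # keys to type checkers and to docstring tooling.
    return {"num_images": len(images), **kwargs}


print(preprocess([object()], do_resize=True, mask_size={"height": 256, "width": 256}))
```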
diff --git a/src/transformers/models/sam2/modular_sam2.py b/src/transformers/models/sam2/modular_sam2.py index daab10855512..40414566267f 100644 --- a/src/transformers/models/sam2/modular_sam2.py +++ b/src/transformers/models/sam2/modular_sam2.py @@ -24,7 +24,7 @@ from ...activations import ACT2FN from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -36,7 +36,7 @@ ) from ...modeling_layers import GradientCheckpointingLayer from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import Unpack +from ...processing_utils import ImagesKwargs, Unpack from ...utils import ( ModelOutput, TensorType, @@ -70,7 +70,7 @@ logger = logging.get_logger(__name__) -class Sam2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): +class Sam2FastImageProcessorKwargs(ImagesKwargs): r""" mask_size (`dict[str, int]`, *optional*): The size `{"height": int, "width": int}` to resize the segmentation maps to. @@ -118,6 +118,19 @@ def _preprocess( ) -> "torch.Tensor": return BaseImageProcessorFast._preprocess(self, images, return_tensors=return_tensors, **kwargs).pixel_values + @auto_docstring + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + **kwargs: Unpack[Sam2FastImageProcessorKwargs], + ) -> BatchFeature: + r""" + segmentation_maps (`ImageInput`, *optional*): + The segmentation maps to preprocess. + """ + return super().preprocess(images, segmentation_maps, **kwargs) + def _preprocess_image_like_inputs( self, images: ImageInput, diff --git a/src/transformers/models/sam_hq/processing_samhq.py b/src/transformers/models/sam_hq/processing_samhq.py index 49681c7c6a26..902e68832836 100644 --- a/src/transformers/models/sam_hq/processing_samhq.py +++ b/src/transformers/models/sam_hq/processing_samhq.py @@ -23,9 +23,8 @@ from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack -from ...tokenization_utils_base import AudioInput, BatchEncoding, PreTokenizedInput, TextInput +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import is_torch_available -from ...video_utils import VideoInput if is_torch_available(): @@ -38,6 +37,8 @@ class SamHQImagesKwargs(ImagesKwargs): input_labels: Optional[list[list[int]]] input_boxes: Optional[list[list[list[float]]]] point_pad_value: Optional[int] + mask_size: Optional[dict[str, int]] + mask_pad_size: Optional[dict[str, int]] class SamHQProcessorKwargs(ProcessingKwargs, total=False): @@ -78,8 +79,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, - audio: Optional[AudioInput] = None, - video: Optional[VideoInput] = None, **kwargs: Unpack[SamHQProcessorKwargs], ) -> BatchEncoding: """ @@ -118,7 +117,7 @@ def __call__( input_points=input_points, input_labels=input_labels, input_boxes=input_boxes, - return_tensors=output_kwargs["common_kwargs"].get("return_tensors"), + return_tensors=output_kwargs["images_kwargs"].get("return_tensors"), point_pad_value=output_kwargs["images_kwargs"].get("point_pad_value"), ) diff --git a/src/transformers/models/segformer/image_processing_segformer.py 
b/src/transformers/models/segformer/image_processing_segformer.py index 0894c352de8b..ce9ace8115a4 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -54,6 +55,17 @@ logger = logging.get_logger(__name__) +class SegformerImageProcessorKwargs(ImagesKwargs): + r""" + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + """ + + do_reduce_labels: Optional[bool] + + @requires(backends=("vision",)) class SegformerImageProcessor(BaseImageProcessor): r""" @@ -92,6 +104,7 @@ class SegformerImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = SegformerImageProcessorKwargs @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS) def __init__( diff --git a/src/transformers/models/segformer/image_processing_segformer_fast.py b/src/transformers/models/segformer/image_processing_segformer_fast.py index 11dfa3c42ab1..d3dc35e609de 100644 --- a/src/transformers/models/segformer/image_processing_segformer_fast.py +++ b/src/transformers/models/segformer/image_processing_segformer_fast.py @@ -25,12 +25,7 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - group_images_by_shape, - reorder_images, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images from ...image_utils import ( IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, @@ -42,17 +37,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class SegformerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): - Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 - is used for background, and background itself is not included in all classes of a dataset (e.g. - ADE20k). The background label will be replaced by 255. 
- """ - - do_reduce_labels: Optional[bool] +from .image_processing_segformer import SegformerImageProcessorKwargs @auto_docstring @@ -68,10 +53,10 @@ class SegformerImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True do_reduce_labels = False - valid_kwargs = SegformerFastImageProcessorKwargs + valid_kwargs = SegformerImageProcessorKwargs rescale_factor = 1 / 255 - def __init__(self, **kwargs: Unpack[SegformerFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[SegformerImageProcessorKwargs]): super().__init__(**kwargs) def reduce_label(self, labels: list["torch.Tensor"]): @@ -89,7 +74,7 @@ def preprocess( self, images: ImageInput, segmentation_maps: Optional[ImageInput] = None, - **kwargs: Unpack[SegformerFastImageProcessorKwargs], + **kwargs: Unpack[SegformerImageProcessorKwargs], ) -> BatchFeature: r""" segmentation_maps (`ImageInput`, *optional*): @@ -104,7 +89,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[SegformerFastImageProcessorKwargs], + **kwargs: Unpack[SegformerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/segformer/modular_segformer.py b/src/transformers/models/segformer/modular_segformer.py index 831d046fd9a7..6bbbe9ecd4fd 100644 --- a/src/transformers/models/segformer/modular_segformer.py +++ b/src/transformers/models/segformer/modular_segformer.py @@ -19,7 +19,7 @@ import torch from torchvision.transforms.v2 import functional as F -from transformers.models.beit.image_processing_beit_fast import BeitFastImageProcessorKwargs, BeitImageProcessorFast +from transformers.models.beit.image_processing_beit_fast import BeitImageProcessorFast from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( @@ -38,10 +38,7 @@ from ...utils import ( TensorType, ) - - -class SegformerFastImageProcessorKwargs(BeitFastImageProcessorKwargs): - pass +from .image_processing_segformer import SegformerImageProcessorKwargs class SegformerImageProcessorFast(BeitImageProcessorFast): @@ -64,7 +61,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[SegformerFastImageProcessorKwargs], + **kwargs: Unpack[SegformerImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/shieldgemma2/processing_shieldgemma2.py b/src/transformers/models/shieldgemma2/processing_shieldgemma2.py index 4341d087361e..8c221e826167 100644 --- a/src/transformers/models/shieldgemma2/processing_shieldgemma2.py +++ b/src/transformers/models/shieldgemma2/processing_shieldgemma2.py @@ -87,8 +87,6 @@ def __call__( self, images: Optional[ImageInput] = None, text=None, - videos=None, - audio=None, **kwargs: Unpack[ShieldGemma2ProcessorKwargs], ) -> BatchFeature: """Generates a batch of inputs from the provided images. @@ -120,8 +118,6 @@ def __call__( `(len(images) * len(policies), )`, and the order within the batch will be img1_policy1, ... img1_policyN, ... imgM_policyN. 
""" - del text, videos, audio - if not images: raise ValueError("ShieldGemma 2 needs images to classify") elif not isinstance(images, Sequence): diff --git a/src/transformers/models/siglip2/image_processing_siglip2.py b/src/transformers/models/siglip2/image_processing_siglip2.py index 74a166c440b5..caff1bce0bc9 100644 --- a/src/transformers/models/siglip2/image_processing_siglip2.py +++ b/src/transformers/models/siglip2/image_processing_siglip2.py @@ -37,6 +37,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -47,6 +48,19 @@ from PIL import Image +class Siglip2ImageProcessorKwargs(ImagesKwargs): + """ + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch the image will be split to. + max_num_patches (`int`, *optional*, defaults to 256): + The image will be resized to have at most this number of patches, + and then padded in "patch" dimension to match this number exactly. + """ + + patch_size: Optional[int] + max_num_patches: Optional[int] + + @lru_cache(maxsize=256) def get_image_size_for_max_num_patches( image_height: int, image_width: int, patch_size: int, max_num_patches: int, eps: float = 1e-5 @@ -159,6 +173,7 @@ class Siglip2ImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"] + valid_kwargs = Siglip2ImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/siglip2/image_processing_siglip2_fast.py b/src/transformers/models/siglip2/image_processing_siglip2_fast.py index 45261fab2cd0..347ec372d410 100644 --- a/src/transformers/models/siglip2/image_processing_siglip2_fast.py +++ b/src/transformers/models/siglip2/image_processing_siglip2_fast.py @@ -20,22 +20,15 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - SizeDict, -) -from ...image_utils import ( - ImageInput, - PILImageResampling, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_utils import ImageInput, PILImageResampling from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, logging, ) -from .image_processing_siglip2 import get_image_size_for_max_num_patches +from .image_processing_siglip2 import Siglip2ImageProcessorKwargs, get_image_size_for_max_num_patches logger = logging.get_logger(__name__) @@ -71,19 +64,6 @@ def pad_along_first_dim( return tensor, mask -class Siglip2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch the image will be split to. - max_num_patches (`int`, *optional*, defaults to 256): - The image will be resized to have at most this number of patches, - and then padded in "patch" dimension to match this number exactly. 
- """ - - patch_size: Optional[int] - max_num_patches: Optional[int] - - @auto_docstring class Siglip2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR @@ -94,10 +74,10 @@ class Siglip2ImageProcessorFast(BaseImageProcessorFast): do_normalize = True patch_size = 16 max_num_patches = 256 - valid_kwargs = Siglip2FastImageProcessorKwargs + valid_kwargs = Siglip2ImageProcessorKwargs unused_kwargs = ["size", "do_center_crop", "crop_size"] - def __init__(self, **kwargs: Unpack[Siglip2FastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Siglip2ImageProcessorKwargs]): super().__init__(**kwargs) def _validate_preprocess_kwargs(self, **kwargs) -> tuple: @@ -106,7 +86,7 @@ def _validate_preprocess_kwargs(self, **kwargs) -> tuple: return super()._validate_preprocess_kwargs(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[Siglip2FastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Siglip2ImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/siglip2/processing_siglip2.py b/src/transformers/models/siglip2/processing_siglip2.py index 8e177b237b10..b16650303da4 100644 --- a/src/transformers/models/siglip2/processing_siglip2.py +++ b/src/transformers/models/siglip2/processing_siglip2.py @@ -16,19 +16,10 @@ Image/Text processor class for SigLIP2. """ -from typing import Optional - -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin - - -class Siglip2ImagesKwargs(ImagesKwargs, total=False): - max_num_patches: Optional[int] - patch_size: Optional[int] +from ...processing_utils import ProcessingKwargs, ProcessorMixin class Siglip2ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Siglip2ImagesKwargs - _defaults = { "text_kwargs": { "padding": "max_length", diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm.py b/src/transformers/models/smolvlm/image_processing_smolvlm.py index 8a8ee5d4aa14..e231c1ec6b07 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm.py @@ -41,6 +41,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -50,6 +51,24 @@ logger = logging.get_logger(__name__) + + +class SmolVLMImageProcessorKwargs(ImagesKwargs): + """ + do_image_splitting (`bool`, *optional*, defaults to `True`): + Whether to split the image into sub-images concatenated with the original image. They are split into patches + such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. + max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): + Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". + return_row_col_info (`bool`, *optional*, defaults to `False`): + Whether to return the row and column information of the images. 
+ """ + + do_image_splitting: Optional[bool] + max_image_size: Optional[dict[str, int]] + return_row_col_info: Optional[bool] + + MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum @@ -288,6 +307,7 @@ class SmolVLMImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_attention_mask"] + valid_kwargs = SmolVLMImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py index 4e24bc279543..5bff89823f32 100644 --- a/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py +++ b/src/transformers/models/smolvlm/image_processing_smolvlm_fast.py @@ -25,13 +25,7 @@ import torch from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import ( - BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, - SizeDict, - group_images_by_shape, - reorder_images, -) +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict, group_images_by_shape, reorder_images from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, @@ -41,6 +35,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, is_torchvision_available, logging +from .image_processing_smolvlm import SmolVLMImageProcessorKwargs if is_torchvision_available(): @@ -49,23 +44,6 @@ logger = logging.get_logger(__name__) - -class SmolVLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - do_image_splitting (`bool`, *optional*, defaults to `True`): - Whether to split the image into sub-images concatenated with the original image. They are split into patches - such that each patch has a size of `max_image_size["height"]` x `max_image_size["width"]`. - max_image_size (`Dict`, *optional*, defaults to `{"longest_edge": 364}`): - Maximum resolution of the patches of images accepted by the model. This is a dictionary containing the key "longest_edge". - return_row_col_info (`bool`, *optional*, defaults to `False`): - Whether to return the row and column information of the images. 
- """ - - do_image_splitting: Optional[bool] - max_image_size: Optional[dict[str, int]] - return_row_col_info: Optional[bool] - - MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum @@ -189,7 +167,7 @@ class SmolVLMImageProcessorFast(BaseImageProcessorFast): do_image_splitting = True do_pad = True return_row_col_info = False - valid_kwargs = SmolVLMFastImageProcessorKwargs + valid_kwargs = SmolVLMImageProcessorKwargs def _prepare_images_structure(self, images: ImageInput, expected_ndims: int = 3) -> ImageInput: """ @@ -357,7 +335,7 @@ def pad( return image, pixel_mask @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[SmolVLMFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[SmolVLMImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def _preprocess( diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 42dcecce6a3b..86d07e238f1b 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -21,7 +21,7 @@ from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, make_nested_list_of_images -from ...processing_utils import AllKwargsForChatTemplate, ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack +from ...processing_utils import AllKwargsForChatTemplate, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, TextInput from ...utils import is_num2words_available, is_vision_available, logging from ...video_utils import VideoInput @@ -103,14 +103,7 @@ def get_image_prompt_string( ) -class SmolVLMImagesKwargs(ImagesKwargs, total=False): - return_row_col_info: Optional[bool] - max_image_size: Optional[dict[str, int]] - - class SmolVLMProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: SmolVLMImagesKwargs - _defaults = { "text_kwargs": { "add_special_tokens": True, @@ -248,7 +241,6 @@ def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None, - audio=None, videos: Optional[VideoInput] = None, **kwargs: Unpack[SmolVLMProcessorKwargs], ) -> BatchEncoding: diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index 522a344b09b5..ce73dfb4a82e 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -92,7 +92,6 @@ def get_resize_output_image_size( class SmolVLMVideoProcessorInitKwargs(VideosKwargs): max_image_size: Optional[dict[str, int]] - do_pad: Optional[bool] class SmolVLMVideoProcessor(BaseVideoProcessor): diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index dc2c6ab22419..633d9b0b16b9 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -30,6 +30,7 @@ to_numpy_array, valid_images, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, logging, requires_backends @@ -45,6 +46,15 @@ logger = logging.get_logger(__name__) +class SuperPointImageProcessorKwargs(ImagesKwargs): + r""" + do_grayscale (`bool`, *optional*, defaults to `True`): + 
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. + """ + + do_grayscale: Optional[bool] = True + + def is_grayscale( image: np.ndarray, input_data_format: Optional[Union[str, ChannelDimension]] = None, @@ -120,6 +130,7 @@ class SuperPointImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = SuperPointImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/superpoint/image_processing_superpoint_fast.py b/src/transformers/models/superpoint/image_processing_superpoint_fast.py index 54f95fa75af6..3750441fc9f0 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint_fast.py +++ b/src/transformers/models/superpoint/image_processing_superpoint_fast.py @@ -21,7 +21,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -34,6 +33,7 @@ TensorType, auto_docstring, ) +from .image_processing_superpoint import SuperPointImageProcessorKwargs if TYPE_CHECKING: @@ -53,15 +53,6 @@ def is_grayscale( ) -class SuperPointFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_grayscale (`bool`, *optional*, defaults to `True`): - Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method. - """ - - do_grayscale: Optional[bool] = True - - def convert_to_grayscale( image: "torch.Tensor", ) -> "torch.Tensor": @@ -90,9 +81,9 @@ class SuperPointImageProcessorFast(BaseImageProcessorFast): do_rescale = True rescale_factor = 1 / 255 do_normalize = None - valid_kwargs = SuperPointFastImageProcessorKwargs + valid_kwargs = SuperPointImageProcessorKwargs - def __init__(self, **kwargs: Unpack[SuperPointFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[SuperPointImageProcessorKwargs]): super().__init__(**kwargs) def _preprocess( diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index b15e7a9d8f86..018a1bf0f4df 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -30,6 +30,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging from ...utils.deprecation import deprecate_kwarg @@ -37,6 +38,10 @@ logger = logging.get_logger(__name__) +class Swin2SRImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + class Swin2SRImageProcessor(BaseImageProcessor): r""" Constructs a Swin2SR image processor. 
@@ -51,6 +56,7 @@ class Swin2SRImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = Swin2SRImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py b/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py index 82c9d733d367..bee3da36c9b6 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature, ChannelDimension, get_image_size from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -34,30 +33,21 @@ logging, ) from ...utils.deprecation import deprecate_kwarg +from .image_processing_swin2sr import Swin2SRImageProcessorKwargs logger = logging.get_logger(__name__) -class Swin2SRFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - size_divisor (`int`, *optional*, defaults to `8`): - The size of the sliding window for the local attention. It will be used to pad the image - to the size divisible by `size_divisor` - """ - - size_divisor: Optional[int] - - @auto_docstring class Swin2SRImageProcessorFast(BaseImageProcessorFast): do_rescale = True rescale_factor = 1 / 255 do_pad = True size_divisor = 8 - valid_kwargs = Swin2SRFastImageProcessorKwargs + valid_kwargs = Swin2SRImageProcessorKwargs - def __init__(self, **kwargs: Unpack[Swin2SRFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[Swin2SRImageProcessorKwargs]): pad_size = kwargs.pop("pad_size", None) kwargs.setdefault("size_divisor", pad_size) super().__init__(**kwargs) @@ -76,7 +66,7 @@ def pad_size(self, value): ) self.size_divisor = value - def preprocess(self, images: ImageInput, **kwargs: Unpack[Swin2SRFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[Swin2SRImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) @deprecate_kwarg("size", version="v5", new_name="size_divisor") diff --git a/src/transformers/models/textnet/image_processing_textnet.py b/src/transformers/models/textnet/image_processing_textnet.py index 578dabd3cb71..1a4d68522205 100644 --- a/src/transformers/models/textnet/image_processing_textnet.py +++ b/src/transformers/models/textnet/image_processing_textnet.py @@ -39,6 +39,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, is_vision_available, logging @@ -48,6 +49,10 @@ import PIL +class TextNetImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + class TextNetImageProcessor(BaseImageProcessor): r""" Constructs a TextNet image processor. 
@@ -90,6 +95,7 @@ class TextNetImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = TextNetImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/textnet/image_processing_textnet_fast.py b/src/transformers/models/textnet/image_processing_textnet_fast.py index baa6276736f7..eba6e14e64bc 100644 --- a/src/transformers/models/textnet/image_processing_textnet_fast.py +++ b/src/transformers/models/textnet/image_processing_textnet_fast.py @@ -20,7 +20,7 @@ from torchvision.transforms.v2 import functional as F from ...image_processing_utils import BatchFeature -from ...image_processing_utils_fast import BaseImageProcessorFast, DefaultFastImageProcessorKwargs +from ...image_processing_utils_fast import BaseImageProcessorFast from ...image_transforms import ( get_resize_output_image_size, group_images_by_shape, @@ -39,15 +39,7 @@ TensorType, auto_docstring, ) - - -class TextNetFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - size_divisor (`int`, *optional*, defaults to 32): - Ensures height and width are rounded to a multiple of this value after resizing. - """ - - size_divisor: Optional[int] +from .image_processing_textnet import TextNetImageProcessorKwargs @auto_docstring @@ -64,13 +56,13 @@ class TextNetImageProcessorFast(BaseImageProcessorFast): do_normalize = True do_convert_rgb = True size_divisor = 32 - valid_kwargs = TextNetFastImageProcessorKwargs + valid_kwargs = TextNetImageProcessorKwargs - def __init__(self, **kwargs: Unpack[TextNetFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[TextNetImageProcessorKwargs]) -> None: super().__init__(**kwargs) @auto_docstring - def preprocess(self, images: ImageInput, **kwargs: Unpack[TextNetFastImageProcessorKwargs]) -> BatchFeature: + def preprocess(self, images: ImageInput, **kwargs: Unpack[TextNetImageProcessorKwargs]) -> BatchFeature: return super().preprocess(images, **kwargs) def resize( diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py index 049ed96c1749..81c7db2850b3 100644 --- a/src/transformers/models/trocr/processing_trocr.py +++ b/src/transformers/models/trocr/processing_trocr.py @@ -69,8 +69,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[TrOCRProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py index 2c4b066b6225..67c1ffe4fae8 100644 --- a/src/transformers/models/tvp/image_processing_tvp.py +++ b/src/transformers/models/tvp/image_processing_tvp.py @@ -39,6 +39,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging @@ -49,6 +50,21 @@ logger = logging.get_logger(__name__) +class TvpImageProcessorKwargs(ImagesKwargs): + r""" + do_flip_channel_order (`bool`, *optional*): + Whether to flip the channel order of the image from RGB to BGR. + constant_values (`float` or `List[float]`, *optional*): + Value used to fill the padding area when `pad_mode` is `'constant'`. + pad_mode (`str`, *optional*): + Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`. 
+ """ + + do_flip_channel_order: Optional[bool] + constant_values: Optional[Union[float, list[float]]] + pad_mode: Optional[str] + + # Copied from transformers.models.vivit.image_processing_vivit.make_batched def make_batched(videos) -> list[list[ImageInput]]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): @@ -133,6 +149,7 @@ class TvpImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = TvpImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/tvp/image_processing_tvp_fast.py b/src/transformers/models/tvp/image_processing_tvp_fast.py index 5d74e6efb71f..7cd550d75194 100644 --- a/src/transformers/models/tvp/image_processing_tvp_fast.py +++ b/src/transformers/models/tvp/image_processing_tvp_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -36,21 +35,7 @@ ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring - - -class TvpFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - do_flip_channel_order (`bool`, *optional*): - Whether to flip the channel order of the image from RGB to BGR. - constant_values (`float` or `List[float]`, *optional*): - Value used to fill the padding area when `pad_mode` is `'constant'`. - pad_mode (`str`, *optional*): - Padding mode to use — `'constant'`, `'edge'`, `'reflect'`, or `'symmetric'`. - """ - - do_flip_channel_order: Optional[bool] - constant_values: Optional[Union[float, list[float]]] - pad_mode: Optional[str] +from .image_processing_tvp import TvpImageProcessorKwargs @auto_docstring @@ -71,16 +56,16 @@ class TvpImageProcessorFast(BaseImageProcessorFast): pad_mode = "constant" do_normalize = True do_flip_channel_order = True - valid_kwargs = TvpFastImageProcessorKwargs + valid_kwargs = TvpImageProcessorKwargs - def __init__(self, **kwargs: Unpack[TvpFastImageProcessorKwargs]): + def __init__(self, **kwargs: Unpack[TvpImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring def preprocess( self, videos: Union[ImageInput, list[ImageInput], list[list[ImageInput]]], - **kwargs: Unpack[TvpFastImageProcessorKwargs], + **kwargs: Unpack[TvpImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(videos, **kwargs) diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 94b1565c9a22..1be71aea63e2 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -48,7 +48,6 @@ class UdopProcessorKwargs(ProcessingKwargs, total=False): "return_length": False, "verbose": True, }, - "images_kwargs": {}, } @@ -85,8 +84,6 @@ def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, - audio=None, - videos=None, **kwargs: Unpack[UdopProcessorKwargs], ) -> BatchFeature: """ diff --git a/src/transformers/models/video_llava/video_processing_video_llava.py b/src/transformers/models/video_llava/video_processing_video_llava.py index 1e5deb543654..371a419d4a36 100644 --- a/src/transformers/models/video_llava/video_processing_video_llava.py +++ b/src/transformers/models/video_llava/video_processing_video_llava.py @@ -15,13 +15,9 @@ """Video processor class for Video-LLaVA.""" from ...image_utils import 
OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling -from ...processing_utils import Unpack, VideosKwargs from ...video_processing_utils import BaseVideoProcessor -class VideoLlavaFastVideoProcessorInitKwargs(VideosKwargs): ... - - class VideoLlavaVideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BICUBIC image_mean = OPENAI_CLIP_MEAN @@ -35,11 +31,6 @@ class VideoLlavaVideoProcessor(BaseVideoProcessor): do_normalize = True do_convert_rgb = True do_sample_frames = False # Set to False for BC, recommended to set `True` in new models - valid_kwargs = VideoLlavaFastVideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - - def __init__(self, **kwargs: Unpack[VideoLlavaFastVideoProcessorInitKwargs]): - super().__init__(**kwargs) __all__ = ["VideoLlavaVideoProcessor"] diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index c7013e660332..bb29e1d1ee30 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -35,6 +35,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging from ...utils.import_utils import requires @@ -46,6 +47,10 @@ logger = logging.get_logger(__name__) +class ViltImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + def max_across_indices(values: Iterable[Any]) -> list[Any]: """ Return the maximum value across all indices of an iterable of values. @@ -162,6 +167,7 @@ class ViltImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = ViltImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/vilt/image_processing_vilt_fast.py b/src/transformers/models/vilt/image_processing_vilt_fast.py index 6926b655ce45..11537f74656d 100644 --- a/src/transformers/models/vilt/image_processing_vilt_fast.py +++ b/src/transformers/models/vilt/image_processing_vilt_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, get_max_height_width, group_images_by_shape, reorder_images, @@ -32,6 +31,7 @@ TensorType, auto_docstring, ) +from .image_processing_vilt import ViltImageProcessorKwargs # Set maximum size based on the typical aspect ratio of the COCO dataset @@ -39,19 +39,6 @@ MAX_SHORTER_EDGE = 800 -class ViltFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - Args: - size_divisor (`int`, *optional*, defaults to 32): - The size to make the height and width divisible by. - rescale_factor (`float`, *optional*, defaults to 1/255): - The factor to rescale the image by. 
- """ - - size_divisor: Optional[int] - rescale_factor: Optional[float] - - @auto_docstring class ViltImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BICUBIC @@ -65,7 +52,7 @@ class ViltImageProcessorFast(BaseImageProcessorFast): do_pad = True default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = ViltFastImageProcessorKwargs + valid_kwargs = ViltImageProcessorKwargs def _preprocess( self, diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index f4f9fc9a746d..5b5126ad4a85 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -17,17 +17,11 @@ """ import warnings -from typing import Optional -from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin - - -class ViltImagesKwargs(ImagesKwargs): - size_divisor: Optional[int] +from ...processing_utils import ProcessingKwargs, ProcessorMixin class ViltProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: ViltImagesKwargs _defaults = { "text_kwargs": { "add_special_tokens": True, diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte.py b/src/transformers/models/vitmatte/image_processing_vitmatte.py index 87b6d2662ef4..95933c053ce5 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte.py @@ -33,6 +33,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import TensorType, filter_out_non_signature_kwargs, logging from ...utils.deprecation import deprecate_kwarg @@ -40,6 +41,10 @@ logger = logging.get_logger(__name__) +class VitMatteImageProcessorKwargs(ImagesKwargs): + size_divisor: Optional[int] + + class VitMatteImageProcessor(BaseImageProcessor): r""" Constructs a ViTMatte image processor. @@ -68,6 +73,7 @@ class VitMatteImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = VitMatteImageProcessorKwargs def __init__( self, @@ -107,7 +113,7 @@ def size_divisibility(self, value): def pad_image( self, image: np.ndarray, - size_divisibility: int = 32, + size_divisor: int = 32, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: @@ -115,7 +121,7 @@ def pad_image( Args: image (`np.ndarray`): Image to pad. - size_divisibility (`int`, *optional*, defaults to 32): + size_divisor (`int`, *optional*, defaults to 32): The width and height of the image will be padded to be divisible by this number. data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): The channel dimension format for the output image. 
Can be one of: @@ -134,8 +140,8 @@ def pad_image( height, width = get_image_size(image, input_data_format) - pad_height = 0 if height % size_divisibility == 0 else size_divisibility - height % size_divisibility - pad_width = 0 if width % size_divisibility == 0 else size_divisibility - width % size_divisibility + pad_height = 0 if height % size_divisor == 0 else size_divisor - height % size_divisor + pad_width = 0 if width % size_divisor == 0 else size_divisor - width % size_divisor if pad_width + pad_height > 0: padding = ((0, pad_height), (0, pad_width)) image = pad(image, padding=padding, data_format=data_format, input_data_format=input_data_format) @@ -265,7 +271,7 @@ def preprocess( if do_pad: images = [ - self.pad_image(image, size_divisibility=size_divisor, input_data_format=input_data_format) + self.pad_image(image, size_divisor=size_divisor, input_data_format=input_data_format) for image in images ] diff --git a/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py b/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py index c5a7256a612b..dd09b987090d 100644 --- a/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py +++ b/src/transformers/models/vitmatte/image_processing_vitmatte_fast.py @@ -22,7 +22,6 @@ from ...image_processing_utils import BatchFeature from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -40,20 +39,12 @@ filter_out_non_signature_kwargs, logging, ) +from .image_processing_vitmatte import VitMatteImageProcessorKwargs logger = logging.get_logger(__name__) -class VitMatteFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - size_divisor (`int`, *optional*, defaults to 32): - The width and height of the image will be padded to be divisible by this number. - """ - - size_divisor: Optional[int] - - @auto_docstring class VitMatteImageProcessorFast(BaseImageProcessorFast): do_rescale: bool = True @@ -63,9 +54,9 @@ class VitMatteImageProcessorFast(BaseImageProcessorFast): image_std: Optional[Union[float, list[float]]] = IMAGENET_STANDARD_STD do_pad: bool = True size_divisor: int = 32 - valid_kwargs = VitMatteFastImageProcessorKwargs + valid_kwargs = VitMatteImageProcessorKwargs - def __init__(self, **kwargs: Unpack[VitMatteFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[VitMatteImageProcessorKwargs]) -> None: size_divisibility = kwargs.pop("size_divisibility", None) kwargs.setdefault("size_divisor", size_divisibility) super().__init__(**kwargs) @@ -87,21 +78,21 @@ def size_divisibility(self, value): def _pad_image( self, images: torch.Tensor, - size_divisibility: int = 32, + size_divisor: int = 32, ) -> torch.Tensor: """ - Pads an image or batched images constantly so that width and height are divisible by size_divisibility + Pads an image or batched images constantly so that width and height are divisible by size_divisor Args: image (`torch.Tensor`): Image to pad. - size_divisibility (`int`, *optional*, defaults to 32): + size_divisor (`int`, *optional*, defaults to 32): The width and height of the image will be padded to be divisible by this number. 
""" height, width = get_image_size(images, channel_dim=ChannelDimension.FIRST) - pad_height = 0 if height % size_divisibility == 0 else size_divisibility - height % size_divisibility - pad_width = 0 if width % size_divisibility == 0 else size_divisibility - width % size_divisibility + pad_height = 0 if height % size_divisor == 0 else size_divisor - height % size_divisor + pad_width = 0 if width % size_divisor == 0 else size_divisor - width % size_divisor if pad_width + pad_height > 0: padding = (0, 0, pad_width, pad_height) @@ -114,7 +105,7 @@ def preprocess( self, images: list["torch.Tensor"], trimaps: list["torch.Tensor"], - **kwargs: Unpack[VitMatteFastImageProcessorKwargs], + **kwargs: Unpack[VitMatteImageProcessorKwargs], ) -> BatchFeature: r""" trimaps (`list[torch.Tensor]`): @@ -129,7 +120,7 @@ def _preprocess_image_like_inputs( do_convert_rgb: bool, input_data_format: ChannelDimension, device: Optional[Union[str, "torch.device"]] = None, - **kwargs: Unpack[VitMatteFastImageProcessorKwargs], + **kwargs: Unpack[VitMatteImageProcessorKwargs], ) -> BatchFeature: """ Preprocess image-like inputs. diff --git a/src/transformers/models/vjepa2/video_processing_vjepa2.py b/src/transformers/models/vjepa2/video_processing_vjepa2.py index 3a5f5509ba6b..ad806ba4cc3d 100644 --- a/src/transformers/models/vjepa2/video_processing_vjepa2.py +++ b/src/transformers/models/vjepa2/video_processing_vjepa2.py @@ -19,9 +19,6 @@ from ...video_processing_utils import BaseVideoProcessor -class VJEPA2VideoProcessorInitKwargs(VideosKwargs): ... - - class VJEPA2VideoProcessor(BaseVideoProcessor): resample = PILImageResampling.BILINEAR image_mean = IMAGENET_DEFAULT_MEAN @@ -32,10 +29,8 @@ class VJEPA2VideoProcessor(BaseVideoProcessor): do_rescale = True do_center_crop = True do_normalize = True - valid_kwargs = VJEPA2VideoProcessorInitKwargs - model_input_names = ["pixel_values_videos"] - def __init__(self, **kwargs: Unpack[VJEPA2VideoProcessorInitKwargs]): + def __init__(self, **kwargs: Unpack[VideosKwargs]): crop_size = kwargs.get("crop_size", 256) if not isinstance(crop_size, int): if not isinstance(crop_size, dict) or "height" not in crop_size: diff --git a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py index 1b812ba60a4b..124835e35338 100644 --- a/src/transformers/models/voxtral/processing_voxtral.py +++ b/src/transformers/models/voxtral/processing_voxtral.py @@ -198,9 +198,8 @@ def apply_chat_template( ) text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] + return_tensors = text_kwargs.get("return_tensors", None) - return_tensors = common_kwargs.pop("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") @@ -270,16 +269,10 @@ def __call__( f"{self.audio_token} is present in the provided text which is not supported by VoxtralProcessor. Please use the `apply_chat_template` method instead." 
) - output_kwargs = self._merge_kwargs( - VoxtralProcessorKwargs, - **kwargs, - ) - text_kwargs = output_kwargs["text_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] - - out = self.tokenizer(text, **text_kwargs) + output_kwargs = self._merge_kwargs(VoxtralProcessorKwargs, **kwargs) + out = self.tokenizer(text, **output_kwargs["text_kwargs"]) - return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None)) + return BatchFeature(data=out, tensor_type=output_kwargs["text_kwargs"].get("return_tensors", None)) # TODO: @eustlb, this should be moved to mistral_common + testing def apply_transcription_request( @@ -327,7 +320,6 @@ def apply_transcription_request( ) text_kwargs = output_kwargs["text_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - common_kwargs = output_kwargs["common_kwargs"] is_str = isinstance(audio, str) is_list_of_str = all(isinstance(el, str) for el in audio) @@ -344,15 +336,14 @@ def apply_transcription_request( ) sampling_rate = audio_kwargs["sampling_rate"] - return_dict = common_kwargs.pop("return_dict", False) - tokenize = common_kwargs.pop("tokenize", False) # make sure to remove from text_kwargs and audio_kwargs - for k in ("return_dict", "tokenize"): - text_kwargs.pop(k, None) - audio_kwargs.pop(k, None) + return_dict = text_kwargs.pop("return_dict", False) + tokenize = text_kwargs.pop("tokenize", False) + _ = audio_kwargs.pop("return_dict", False) + _ = audio_kwargs.pop("tokenize", False) - return_tensors = common_kwargs.pop("return_tensors", None) + return_tensors = text_kwargs.pop("return_tensors", None) if return_tensors != "pt": raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.") diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index 1dc382d6f68a..ee8e3abd195d 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -76,8 +76,6 @@ def __call__( self, audio: Optional[AudioInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - images=None, - videos=None, **kwargs: Unpack[Wav2Vec2ProcessorKwargs], ): """ @@ -112,7 +110,6 @@ def __call__( audio, **output_kwargs["audio_kwargs"], **output_kwargs["text_kwargs"], - **output_kwargs["common_kwargs"], ) if audio is not None: diff --git a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py index ead53edb101a..fc95fc04c754 100644 --- a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py @@ -73,8 +73,6 @@ def __call__( self, audio: Optional[AudioInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, - images=None, - videos=None, **kwargs: Unpack[Wav2Vec2BertProcessorKwargs], ): """ diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index 50da604db8d3..21aac76adac8 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -51,6 +51,7 @@ validate_kwargs, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, is_scipy_available, @@ -80,6 +81,29 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) +class 
YolosImageProcessorKwargs(ImagesKwargs): + r""" + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + return_segmentation_masks (`bool`, *optional*, defaults to `False`): + Whether to return segmentation masks. + annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + """ + + format: Optional[Union[str, AnnotationFormat]] + do_convert_annotations: Optional[bool] + return_segmentation_masks: Optional[bool] + annotations: Optional[Union[AnnotationType, list[AnnotationType]]] + masks_path: Optional[Union[str, pathlib.Path]] + + # Copied from transformers.models.detr.image_processing_detr.get_max_height_width def get_max_height_width( images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None @@ -744,6 +768,7 @@ class YolosImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values", "pixel_mask"] + valid_kwargs = YolosImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/yolos/image_processing_yolos_fast.py b/src/transformers/models/yolos/image_processing_yolos_fast.py index 59bb3868e75e..fc1f1852862f 100644 --- a/src/transformers/models/yolos/image_processing_yolos_fast.py +++ b/src/transformers/models/yolos/image_processing_yolos_fast.py @@ -14,7 +14,6 @@ from ...image_processing_utils import BatchFeature, get_size_dict from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, SizeDict, get_image_size_for_max_height_width, get_max_height_width, @@ -35,28 +34,11 @@ from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring, logging from ...utils.import_utils import requires +from .image_processing_yolos import YolosImageProcessorKwargs logger = logging.get_logger(__name__) - -class YolosFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - r""" - format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): - Data format of the annotations. One of "coco_detection" or "coco_panoptic". - do_convert_annotations (`bool`, *optional*, defaults to `True`): - Controls whether to convert the annotations to the format expected by the YOLOS model. Converts the - bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. - Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. - return_segmentation_masks (`bool`, *optional*, defaults to `False`): - Whether to return segmentation masks. 
- """ - - format: Optional[Union[str, AnnotationFormat]] - do_convert_annotations: Optional[bool] - return_segmentation_masks: Optional[bool] - - SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) @@ -320,9 +302,9 @@ class YolosImageProcessorFast(BaseImageProcessorFast): size = {"shortest_edge": 800, "longest_edge": 1333} default_to_square = False model_input_names = ["pixel_values", "pixel_mask"] - valid_kwargs = YolosFastImageProcessorKwargs + valid_kwargs = YolosImageProcessorKwargs - def __init__(self, **kwargs: Unpack[YolosFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[YolosImageProcessorKwargs]) -> None: if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") @@ -584,25 +566,8 @@ def pad( def preprocess( self, images: ImageInput, - annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None, - masks_path: Optional[Union[str, pathlib.Path]] = None, - **kwargs: Unpack[YolosFastImageProcessorKwargs], + **kwargs: Unpack[YolosImageProcessorKwargs], ) -> BatchFeature: - r""" - annotations (`AnnotationType` or `list[AnnotationType]`, *optional*): - List of annotations associated with the image or batch of images. If annotation is for object - detection, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a - dictionary. An image can have no annotations, in which case the list should be empty. - If annotation is for segmentation, the annotations should be a dictionary with the following keys: - - "image_id" (`int`): The image id. - - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary. - An image can have no segments, in which case the list should be empty. - - "file_name" (`str`): The file name of the image. - masks_path (`str` or `pathlib.Path`, *optional*): - Path to the directory containing the segmentation masks. - """ if "pad_and_return_pixel_mask" in kwargs: kwargs["do_pad"] = kwargs.pop("pad_and_return_pixel_mask") logger.warning_once( @@ -617,7 +582,7 @@ def preprocess( ) kwargs["size"] = kwargs.pop("max_size") - return super().preprocess(images, annotations, masks_path, **kwargs) + return super().preprocess(images, **kwargs) def _preprocess( self, diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth.py b/src/transformers/models/zoedepth/image_processing_zoedepth.py index 1ef2b8a59ec1..e8ad44dd76c3 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth.py @@ -40,6 +40,7 @@ valid_images, validate_preprocess_arguments, ) +from ...processing_utils import ImagesKwargs from ...utils import ( TensorType, filter_out_non_signature_kwargs, @@ -61,6 +62,25 @@ logger = logging.get_logger(__name__) +class ZoeDepthImageProcessorKwargs(ImagesKwargs): + """ + keep_aspect_ratio (`bool`, *optional*, defaults to `True`): + If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it + for both dimensions. This ensures that the image is scaled down as little as possible while still fitting + within the desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a + size that is a multiple of this value by flooring the height and width to the nearest multiple of this value. 
+ Can be overridden by `keep_aspect_ratio` in `preprocess`. + ensure_multiple_of (`int`, *optional*, defaults to 32): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring + the height and width to the nearest multiple of this value. + Works both with and without `keep_aspect_ratio` being set to `True`. + Can be overridden by `ensure_multiple_of` in `preprocess`. + """ + + keep_aspect_ratio: Optional[bool] + ensure_multiple_of: Optional[int] + + def get_resize_output_image_size( input_image: np.ndarray, output_size: Union[int, Iterable[int]], @@ -145,6 +165,7 @@ class ZoeDepthImageProcessor(BaseImageProcessor): """ model_input_names = ["pixel_values"] + valid_kwargs = ZoeDepthImageProcessorKwargs def __init__( self, diff --git a/src/transformers/models/zoedepth/image_processing_zoedepth_fast.py b/src/transformers/models/zoedepth/image_processing_zoedepth_fast.py index 045dbfdacd4d..852ee161aff1 100644 --- a/src/transformers/models/zoedepth/image_processing_zoedepth_fast.py +++ b/src/transformers/models/zoedepth/image_processing_zoedepth_fast.py @@ -28,7 +28,6 @@ ) from ...image_processing_utils_fast import ( BaseImageProcessorFast, - DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) @@ -48,32 +47,13 @@ logging, requires_backends, ) -from .image_processing_zoedepth import get_resize_output_image_size +from .image_processing_zoedepth import ZoeDepthImageProcessorKwargs, get_resize_output_image_size from .modeling_zoedepth import ZoeDepthDepthEstimatorOutput logger = logging.get_logger(__name__) -class ZoeDepthFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): - """ - keep_aspect_ratio (`bool`, *optional*, defaults to `True`): - If `True`, the image is resized by choosing the smaller of the height and width scaling factors and using it - for both dimensions. This ensures that the image is scaled down as little as possible while still fitting - within the desired output size. In case `ensure_multiple_of` is also set, the image is further resized to a - size that is a multiple of this value by flooring the height and width to the nearest multiple of this value. - Can be overridden by `keep_aspect_ratio` in `preprocess`. - ensure_multiple_of (`int`, *optional*, defaults to 32): - If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Works by flooring - the height and width to the nearest multiple of this value. - Works both with and without `keep_aspect_ratio` being set to `True`. - Can be overridden by `ensure_multiple_of` in `preprocess`. 
- """ - - keep_aspect_ratio: Optional[bool] - ensure_multiple_of: Optional[int] - - @auto_docstring class ZoeDepthImageProcessorFast(BaseImageProcessorFast): do_pad = True @@ -86,16 +66,16 @@ class ZoeDepthImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR keep_aspect_ratio = True ensure_multiple_of = 1 / 32 - valid_kwargs = ZoeDepthFastImageProcessorKwargs + valid_kwargs = ZoeDepthImageProcessorKwargs - def __init__(self, **kwargs: Unpack[ZoeDepthFastImageProcessorKwargs]) -> None: + def __init__(self, **kwargs: Unpack[ZoeDepthImageProcessorKwargs]) -> None: super().__init__(**kwargs) @auto_docstring def preprocess( self, images: ImageInput, - **kwargs: Unpack[ZoeDepthFastImageProcessorKwargs], + **kwargs: Unpack[ZoeDepthImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 952bc65ce706..e7786d1ba61d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -72,6 +72,8 @@ if is_torch_available(): + import torch + from .modeling_utils import PreTrainedAudioTokenizerBase @@ -156,6 +158,7 @@ class TextKwargs(TypedDict, total=False): verbose: Optional[bool] padding_side: Optional[str] return_mm_token_type_ids: Optional[bool] + return_tensors: Optional[Union[str, TensorType]] class ImagesKwargs(TypedDict, total=False): @@ -164,6 +167,8 @@ class ImagesKwargs(TypedDict, total=False): class methods and docstrings. Attributes: + do_convert_rgb (`bool`, *optional*): + Whether to convert the image to RGB format. do_resize (`bool`, *optional*): Whether to resize the image. size (`dict[str, int]`, *optional*): @@ -183,7 +188,7 @@ class methods and docstrings. image_std (`float` or `list[float]`, *optional*): Standard deviation to use if normalizing the image. do_pad (`bool`, *optional*): - Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + Whether to pad the images in the batch. pad_size (`dict[str, int]`, *optional*): The size `{"height": int, "width" int}` to pad the images to. do_center_crop (`bool`, *optional*): @@ -192,10 +197,13 @@ class methods and docstrings. The channel dimension format for the output image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. - device (`str`, *optional*): + device (`Union[str, torch.device]`, *optional*): The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. + disable_grouping (`bool`, *optional*): + Whether to group images by shapes when processing or not, only relevant for fast image processing. """ + do_convert_rgb: Optional[bool] do_resize: Optional[bool] size: Optional[dict[str, int]] crop_size: Optional[dict[str, int]] @@ -210,7 +218,9 @@ class methods and docstrings. do_center_crop: Optional[bool] data_format: Optional[ChannelDimension] input_data_format: Optional[Union[str, ChannelDimension]] - device: Optional[str] + device: Optional[Union[str, "torch.device"]] + disable_grouping: Optional[bool] + return_tensors: Optional[Union[str, TensorType]] class VideosKwargs(TypedDict, total=False): @@ -240,6 +250,8 @@ class VideosKwargs(TypedDict, total=False): Standard deviation to use if normalizing the video. do_center_crop (`bool`, *optional*): Whether to center crop the video. + do_pad (`bool`, *optional*): + Whether to pad the images in the batch.
@@ -240,6 +250,8 @@ class VideosKwargs(TypedDict, total=False):
             Standard deviation to use if normalizing the video.
         do_center_crop (`bool`, *optional*):
             Whether to center crop the video.
+        do_pad (`bool`, *optional*):
+            Whether to pad the videos in the batch.
         do_sample_frames (`bool`, *optional*):
             Whether to sample frames from the video before processing or to process the whole video.
         video_metadata (`Union[VideoMetadata, dict]`, *optional*):
@@ -254,6 +266,8 @@ class VideosKwargs(TypedDict, total=False):
             The channel dimension format for the output video.
         input_data_format (`ChannelDimension` or `str`, *optional*):
             The channel dimension format for the input video.
+        device (`Union[str, torch.device]`, *optional*):
+            The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast video processing.
         return_metadata (`ChannelDimension` or `str`, *optional*):
             Whether to return video metadata or not.
     """
@@ -269,15 +283,17 @@ class VideosKwargs(TypedDict, total=False):
     image_mean: Optional[Union[float, list[float]]]
     image_std: Optional[Union[float, list[float]]]
     do_center_crop: Optional[bool]
+    do_pad: Optional[bool]
     crop_size: Optional[dict[str, int]]
     data_format: Optional[ChannelDimension]
     input_data_format: Optional[Union[str, ChannelDimension]]
-    device: Optional[str]
+    device: Optional[Union[str, "torch.device"]]
     do_sample_frames: Optional[bool]
     video_metadata: Optional[Union[VideoMetadata, dict]]
     fps: Optional[Union[int, float]]
     num_frames: Optional[int]
     return_metadata: Optional[bool]
+    return_tensors: Optional[Union[str, TensorType]]


 class AudioKwargs(TypedDict, total=False):
@@ -317,9 +333,6 @@ class AudioKwargs(TypedDict, total=False):
     truncation: Optional[bool]
     pad_to_multiple_of: Optional[int]
     return_attention_mask: Optional[bool]
-
-
-class CommonKwargs(TypedDict, total=False):
     return_tensors: Optional[Union[str, TensorType]]


@@ -364,9 +377,6 @@ class CustomProcessorKwargs(ProcessingKwargs, total=False):

     _defaults = {}

-    common_kwargs: CommonKwargs = {
-        **CommonKwargs.__annotations__,
-    }
     text_kwargs: TextKwargs = {
         **TextKwargs.__annotations__,
     }
@@ -1245,7 +1255,6 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
             "images_kwargs": {},
             "audio_kwargs": {},
             "videos_kwargs": {},
-            "common_kwargs": {},
         }

         default_kwargs = {
@@ -1253,7 +1262,13 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
             "images_kwargs": {},
             "audio_kwargs": {},
             "videos_kwargs": {},
-            "common_kwargs": {},
+        }
+
+        map_preprocessor_kwargs = {
+            "text_kwargs": "tokenizer",
+            "images_kwargs": "image_processor",
+            "audio_kwargs": "feature_extractor",
+            "videos_kwargs": "video_processor",
         }

         possible_modality_keywords = {"text", "audio", "videos", "images"}
@@ -1262,8 +1277,22 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
         # get defaults from set model processor kwargs if they exist
         for modality in default_kwargs:
             default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
+            # Some preprocessors define a set of accepted "valid_kwargs" (currently only vision).
+            # In those cases, we don't declare a `ModalityKwargs` attribute in the TypedDict.
+            # Instead, we dynamically obtain the kwargs from the preprocessor and merge them
+            # with the general kwargs set. This ensures consistency between preprocessor and
+            # processor classes, and helps prevent accidental mismatches.
+            modality_valid_kwargs = set(ModelProcessorKwargs.__annotations__[modality].__annotations__)
+            if modality in map_preprocessor_kwargs:
+                preprocessor = getattr(self, map_preprocessor_kwargs[modality], None)
+                preprocessor_valid_kwargs = (
+                    getattr(preprocessor, "valid_kwargs", None) if preprocessor is not None else None
+                )
+                modality_valid_kwargs.update(
+                    set(preprocessor_valid_kwargs.__annotations__ if preprocessor_valid_kwargs is not None else [])
+                )
             # update defaults with arguments from tokenizer init
-            for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__:
+            for modality_key in modality_valid_kwargs:
                 # init with tokenizer init kwargs if necessary
                 if tokenizer_init_kwargs is not None and modality_key in tokenizer_init_kwargs:
                     value = (
@@ -1279,7 +1308,16 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
         # update modality kwargs with passed kwargs
         non_modality_kwargs = set(kwargs) - set(output_kwargs)
         for modality, output_kwarg in output_kwargs.items():
-            for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__:
+            modality_valid_kwargs = set(ModelProcessorKwargs.__annotations__[modality].__annotations__)
+            if modality in map_preprocessor_kwargs:
+                preprocessor = getattr(self, map_preprocessor_kwargs[modality], None)
+                preprocessor_valid_kwargs = (
+                    getattr(preprocessor, "valid_kwargs", None) if preprocessor is not None else None
+                )
+                modality_valid_kwargs.update(
+                    set(preprocessor_valid_kwargs.__annotations__ if preprocessor_valid_kwargs is not None else [])
+                )
+            for modality_key in modality_valid_kwargs:
                 # check if we received a structured kwarg dict or not to handle it correctly
                 if modality in kwargs:
                     kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
@@ -1311,17 +1349,18 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg
         else:
             # kwargs is a flat dictionary
             for key, kwarg in kwargs.items():
-                if key not in used_keys:
-                    if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__:
-                        output_kwargs["common_kwargs"][key] = kwarg
-                    elif key not in possible_modality_keywords:
-                        logger.warning_once(
-                            f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
-                        )
+                if key not in used_keys and key not in possible_modality_keywords:
+                    logger.warning_once(
+                        f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored."
+                    )
+
+        # For `common_kwargs` just update all modality-specific kwargs with same key/values
+        common_kwargs = kwargs.get("common_kwargs", {})
+        common_kwargs.update(ModelProcessorKwargs._defaults.get("common_kwargs", {}))
+        if common_kwargs:
+            for kwarg in output_kwargs.values():
+                kwarg.update(common_kwargs)

-        # all modality-specific kwargs are updated with common kwargs
-        for kwarg in output_kwargs.values():
-            kwarg.update(output_kwargs["common_kwargs"])
         return output_kwargs

     @classmethod
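# A small standalone sketch (hypothetical toy classes, not the transformers implementation) of the
# merging rule above: the processor collects the keys of its image processor's `valid_kwargs`
# TypedDict and treats them as valid `images_kwargs`, keeping preprocessor and processor in sync.
from typing import Optional, TypedDict


class ToyImagesKwargs(TypedDict, total=False):  # stands in for ImagesKwargs
    do_resize: Optional[bool]
    size: Optional[dict[str, int]]


class ToyImageProcessor:  # stands in for an image processor exposing `valid_kwargs`
    valid_kwargs = ToyImagesKwargs


def collect_valid_keys(processor_annotations: dict, preprocessor) -> set:
    # Start from the keys declared on the processor's own kwargs TypedDict ...
    valid = set(processor_annotations)
    # ... and merge in whatever the attached preprocessor declares as `valid_kwargs`.
    preprocessor_kwargs = getattr(preprocessor, "valid_kwargs", None)
    if preprocessor_kwargs is not None:
        valid.update(preprocessor_kwargs.__annotations__)
    return valid


print(collect_valid_keys({"do_convert_rgb": Optional[bool]}, ToyImageProcessor()))
# contains 'do_convert_rgb', 'do_resize' and 'size'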
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index ac72c6617d7b..cd3b9a18b1c5 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -913,7 +913,7 @@ def add_special_tokens(
                 Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                 assign the index of the `unk_token` to them).
-            replace_additional_special_tokens (`bool`, *optional*,, defaults to `True`):
+            replace_additional_special_tokens (`bool`, *optional*, defaults to `True`):
                 If `True`, the existing list of additional special tokens will be replaced by the list provided in
                 `special_tokens_dict`. Otherwise, `self._special_tokens_map["additional_special_tokens"]` is just extended. In the former
                 case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
diff --git a/src/transformers/utils/auto_docstring.py b/src/transformers/utils/auto_docstring.py
index 15882c6b63cc..135f20bf4cf9 100644
--- a/src/transformers/utils/auto_docstring.py
+++ b/src/transformers/utils/auto_docstring.py
@@ -102,6 +102,13 @@ class ImageProcessorArgs:
         "shape": None,
     }

+    size_divisor = {
+        "description": """
+        The size by which both the height and width must be divisible.
+        """,
+        "shape": None,
+    }
+
     default_to_square = {
         "description": """
         Whether to default to a square image when resizing, if size is an int.
diff --git a/tests/models/got_ocr2/test_image_processing_got_ocr2.py b/tests/models/got_ocr2/test_image_processing_got_ocr2.py
index 4228ffe4dcba..a9420a671094 100644
--- a/tests/models/got_ocr2/test_image_processing_got_ocr2.py
+++ b/tests/models/got_ocr2/test_image_processing_got_ocr2.py
@@ -44,7 +44,6 @@ def __init__(
         do_resize=True,
         size=None,
         do_normalize=True,
-        do_pad=False,
         image_mean=[0.48145466, 0.4578275, 0.40821073],
         image_std=[0.26862954, 0.26130258, 0.27577711],
         do_convert_rgb=True,
@@ -62,7 +61,6 @@ def __init__(
         self.do_normalize = do_normalize
         self.image_mean = image_mean
         self.image_std = image_std
-        self.do_pad = do_pad
         self.do_convert_rgb = do_convert_rgb

     def prepare_image_processor_dict(self):
@@ -73,7 +71,6 @@ def prepare_image_processor_dict(self):
             "image_mean": self.image_mean,
             "image_std": self.image_std,
             "do_convert_rgb": self.do_convert_rgb,
-            "do_pad": self.do_pad,
         }

     def expected_output_image_shape(self, images):
diff --git a/tests/utils/test_add_new_model_like.py b/tests/utils/test_add_new_model_like.py
index 5ba84bab5501..9796d67cc5f6 100644
--- a/tests/utils/test_add_new_model_like.py
+++ b/tests/utils/test_add_new_model_like.py
@@ -473,8 +473,8 @@ def test_phi4_with_all_processors(self):
         )
         from ..phi4_multimodal.feature_extraction_phi4_multimodal import Phi4MultimodalFeatureExtractor
         from ..phi4_multimodal.image_processing_phi4_multimodal_fast import (
-            Phi4MultimodalFastImageProcessorKwargs,
             Phi4MultimodalImageProcessorFast,
+            Phi4MultimodalImageProcessorKwargs,
         )
         from ..phi4_multimodal.modeling_phi4_multimodal import (
             Phi4MultimodalAttention,
@@ -643,7 +643,7 @@ class MyTest2ForCausalLM(Phi4MultimodalForCausalLM):
             pass


-        class MyTest2FastImageProcessorKwargs(Phi4MultimodalFastImageProcessorKwargs):
+        class MyTest2ImageProcessorKwargs(Phi4MultimodalImageProcessorKwargs):
             pass


diff --git a/utils/check_modular_conversion.py b/utils/check_modular_conversion.py
index 41650aedd0e2..a566025d2e1c 100644
--- a/utils/check_modular_conversion.py
+++ b/utils/check_modular_conversion.py
@@ -30,8 +30,8 @@ def process_file(
     file_type="modeling_",
     show_diff=True,
 ):
-    file_name_prefix = file_type.split("*")[0]
-    file_name_suffix = file_type.split("*")[-1] if "*" in file_type else ""
+    file_name_prefix = file_type.split(".*")[0]
+    file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
     file_path = modular_file_path.replace("modular_", f"{file_name_prefix}_").replace(".py", f"{file_name_suffix}.py")
     # Read the actual modeling file
     with open(file_path, "r", encoding="utf-8") as modeling_file:
open(file_path, "r", encoding="utf-8") as modeling_file: diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index e2a19aa611b9..18c3a729368b 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -499,6 +499,7 @@ def augmented_dependencies_for_class_node( "configuration", "tokenization", "processing", + "image_processing.*_fast", "image_processing", "video_processing", "feature_extraction", @@ -538,7 +539,7 @@ def visit_ImportFrom(self, node): to be added (because it will be part of the imports)""" import_module = self.python_module.code_for_node(node.module) import_statement = "." * len(node.relative) + import_module - if re.search(rf"^\.({self.match_patterns})_.*", import_statement): + if re.search(rf"^\.({self.match_patterns}).*", import_statement): for imported_object in node.names: # If an alias is present, we record it and not the original name if imported_object.evaluated_alias is not None: @@ -1056,10 +1057,11 @@ def replace_class_node( "Tokenizer": "tokenization", "Processor": "processing", "ImageProcessor": "image_processing", - "ImageProcessorFast": "image_processing*_fast", # "*" indicates where to insert the model name before the "_fast" suffix + "ImageProcessorFast": "image_processing.*_fast", # "*" indicates where to insert the model name before the "_fast" suffix "VideoProcessor": "video_processing", "VideoProcessorInitKwargs": "video_processing", - "FastImageProcessorKwargs": "image_processing*_fast", + "FastImageProcessorKwargs": "image_processing.*_fast", + "ImageProcessorKwargs": "image_processing", "FeatureExtractor": "feature_extraction", "ProcessorKwargs": "processing", "VideosKwargs": "processing", @@ -1208,7 +1210,7 @@ def visit_ImportFrom(self, node: cst.ImportFrom) -> None: if m.matches(node.module, m.Attribute()): for imported_ in node.names: _import = re.search( - rf"(?:transformers\.models\.)|(?:\.\.\.models\.)|(?:\.\.)\w+\.({self.match_patterns})_.*", + rf"(?:transformers\.models\.)|(?:\.\.\.models\.)|(?:\.\.)\w+\.({self.match_patterns}).*", import_statement, ) if _import: @@ -1257,7 +1259,7 @@ def visit_SimpleStatementLine(self, node): import_module = self.python_module.code_for_node(node.body[0].module) import_statement = "." 
             if not (
-                re.search(rf"(?:transformers\.models\.)|(?:\.\.)\w+\.({self.match_patterns})_.*", import_statement)
+                re.search(rf"(?:transformers\.models\.)|(?:\.\.)\w+\.({self.match_patterns}).*", import_statement)
                 and not any(import_to_skip in import_statement for import_to_skip in IMPORTS_TO_SKIP_IN_MODULAR)
             ):
                 self.imports.append(node)
@@ -1320,7 +1322,7 @@ def leave_Module(self, node):
         # Note that we may visit several of the same file types, thus we save them per file type, not file
         self.imported_objects_per_file = defaultdict(set)
         for file, mapper in self.visited_modules.items():
-            file_type = re.search(rf"^transformers\.models\.\w+\.({self.match_patterns})_.*", file).group(1)
+            file_type = re.search(rf"^transformers\.models\.\w+\.({self.match_patterns})", file).group(1)
             self.imported_objects_per_file[file_type].update(mapper.objects_imported_from_modeling)

     def merge_model_specific_imports(self, visited_modules):
@@ -1716,8 +1718,8 @@ def convert_modular_file(modular_file: str) -> dict[str, str]:
 def save_modeling_files(modular_file: str, converted_files: dict[str, str]):
     """Save all the `converted_files` from the `modular_file`."""
     for file_type in converted_files:
-        file_name_prefix = file_type.split("*")[0]
-        file_name_suffix = file_type.split("*")[-1] if "*" in file_type else ""
+        file_name_prefix = file_type.split(".*")[0]
+        file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
         new_file_name = modular_file.replace("modular_", f"{file_name_prefix}_").replace(
             ".py", f"{file_name_suffix}.py"
         )
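# A small standalone sketch (not part of the diff) of the file-type naming convention used above:
# the ".*" marker in a file type such as "image_processing.*_fast" is where the model name is
# inserted, and the prefix/suffix split mirrors the logic in process_file/save_modeling_files.
def expand_file_type(modular_file: str, file_type: str) -> str:
    # Split the file type around the ".*" placeholder, exactly like the utilities above.
    file_name_prefix = file_type.split(".*")[0]
    file_name_suffix = file_type.split(".*")[-1] if ".*" in file_type else ""
    return modular_file.replace("modular_", f"{file_name_prefix}_").replace(".py", f"{file_name_suffix}.py")


print(expand_file_type("modular_zoedepth.py", "image_processing.*_fast"))
# image_processing_zoedepth_fast.py
print(expand_file_type("modular_zoedepth.py", "image_processing"))
# image_processing_zoedepth.py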