
Commit eb28242

Add MLlama fast image processor (#41391)
* Merge conflict
* add fast processor
* add fast processor
* make style
* add new convert rgb
* use nested group by shape in mllama fast, add support for multiple inputs in group by shape
* refactor after review

Co-authored-by: Vincent <[email protected]>
1 parent 65cb8fa commit eb28242

File tree: 10 files changed, +715 −276 lines changed


docs/source/en/model_doc/mllama.md

Lines changed: 5 additions & 1 deletion
```diff
@@ -67,7 +67,7 @@ processor = AutoProcessor.from_pretrained(model_id)
 messages = [
     [
         {
-        "role": "user",
+            "role": "user",
             "content": [
                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
                 {"type": "text", "text": "What does the image show?"}
@@ -113,6 +113,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
 
 [[autodoc]] MllamaImageProcessor
 
+## MllamaImageProcessorFast
+
+[[autodoc]] MllamaImageProcessorFast
+
 ## MllamaForConditionalGeneration
 
 [[autodoc]] MllamaForConditionalGeneration
```
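With the docs entry in place, the fast processor is reachable through the auto classes like any other fast image processor. A minimal sketch — the checkpoint name is only illustrative, and the listed output keys follow the existing slow Mllama processor:

```python
from PIL import Image
from transformers import AutoImageProcessor

# use_fast=True selects the torch-backed MllamaImageProcessorFast once it is
# registered in the auto mapping (see image_processing_auto.py below).
processor = AutoImageProcessor.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct", use_fast=True
)

image = Image.new("RGB", (448, 336), color="white")
outputs = processor(images=image, return_tensors="pt")
# Expected keys (per the slow processor): pixel_values, aspect_ratio_ids,
# aspect_ratio_mask, num_tiles
print({k: getattr(v, "shape", v) for k, v in outputs.items()})
```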

src/transformers/image_processing_utils_fast.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -221,19 +221,19 @@ def is_fast(self) -> bool:
 
     def pad(
         self,
-        images: "torch.Tensor",
+        images: list["torch.Tensor"],
         pad_size: SizeDict = None,
         fill_value: Optional[int] = 0,
         padding_mode: Optional[str] = "constant",
         return_mask: bool = False,
         disable_grouping: Optional[bool] = False,
         **kwargs,
-    ) -> "torch.Tensor":
+    ) -> Union[tuple["torch.Tensor", "torch.Tensor"], "torch.Tensor"]:
         """
         Pads images to `(pad_size["height"], pad_size["width"])` or to the largest size in the batch.
 
         Args:
-            images (`torch.Tensor`):
+            images (`list[torch.Tensor]`):
                 Images to pad.
             pad_size (`SizeDict`, *optional*):
                 Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
@@ -248,7 +248,7 @@ def pad(
                 Whether to disable grouping of images by size.
 
         Returns:
-            `torch.Tensor`: The resized image.
+            `Union[tuple[torch.Tensor, torch.Tensor], torch.Tensor]`: The padded images and pixel masks if `return_mask` is `True`.
         """
         if pad_size is not None:
             if not (pad_size.height and pad_size.width):
```
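The widened annotations describe the padding contract rather than change behaviour: a list of `(C, H, W)` tensors is padded to a common size, and `return_mask=True` additionally yields a pixel mask of valid pixels. A rough, self-contained sketch of that contract (not the library method itself):

```python
import torch
import torch.nn.functional as F

def pad_to_largest(images: list[torch.Tensor], fill_value: int = 0, return_mask: bool = False):
    """Illustrative stand-in for the `pad` contract: pad each (C, H, W) image
    to the largest height/width in the batch, optionally returning a pixel mask."""
    max_h = max(img.shape[-2] for img in images)
    max_w = max(img.shape[-1] for img in images)
    padded, masks = [], []
    for img in images:
        pad_h, pad_w = max_h - img.shape[-2], max_w - img.shape[-1]
        # F.pad takes (left, right, top, bottom) for the last two dimensions
        padded.append(F.pad(img, (0, pad_w, 0, pad_h), value=fill_value))
        mask = torch.zeros(max_h, max_w, dtype=torch.int64)
        mask[: img.shape[-2], : img.shape[-1]] = 1
        masks.append(mask)
    out = torch.stack(padded)
    return (out, torch.stack(masks)) if return_mask else out

imgs = [torch.rand(3, 224, 196), torch.rand(3, 200, 224)]
pixel_values, pixel_mask = pad_to_largest(imgs, return_mask=True)
print(pixel_values.shape, pixel_mask.shape)  # torch.Size([2, 3, 224, 224]) torch.Size([2, 224, 224])
```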

src/transformers/image_transforms.py

Lines changed: 83 additions & 18 deletions
```diff
@@ -797,25 +797,61 @@ def flip_channel_order(
     return image
 
 
+def split_to_tiles(images: "torch.Tensor", num_tiles_height: int, num_tiles_width: int) -> "torch.Tensor":
+    # Split image into number of required tiles (width x height)
+    batch_size, num_channels, height, width = images.size()
+    images = images.view(
+        batch_size,
+        num_channels,
+        num_tiles_height,
+        height // num_tiles_height,
+        num_tiles_width,
+        width // num_tiles_width,
+    )
+    # Permute dimensions to reorder the axes
+    image = images.permute(0, 2, 4, 1, 3, 5).contiguous()
+    # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2)
+    image = image.view(
+        batch_size,
+        num_tiles_width * num_tiles_height,
+        num_channels,
+        height // num_tiles_height,
+        width // num_tiles_width,
+    )
+    return image
+
+
 def _cast_tensor_to_float(x):
     if x.is_floating_point():
         return x
     return x.float()
 
 
-def _group_images_by_shape(nested_images, is_nested: bool = False):
-    """Helper function to flatten a single level of nested image structures and group by shape."""
+def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = False):
+    """Helper function to flatten a single level of nested image and batch structures and group by shape."""
     grouped_images = defaultdict(list)
     grouped_images_index = {}
-    nested_images = [nested_images] if not is_nested else nested_images
-    for i, sublist in enumerate(nested_images):
-        for j, image in enumerate(sublist):
+    paired_grouped_values = [defaultdict(list) for _ in paired_inputs]
+
+    # Normalize inputs to consistent nested structure
+    normalized_images = [nested_images] if not is_nested else nested_images
+    normalized_paired = []
+    for paired_input in paired_inputs:
+        normalized_paired.append([paired_input] if not is_nested else paired_input)
+
+    # Process each image and group by shape
+    for i, (sublist, *paired_sublists) in enumerate(zip(normalized_images, *normalized_paired)):
+        for j, (image, *paired_values) in enumerate(zip(sublist, *paired_sublists)):
             key = (i, j) if is_nested else j
             shape = image.shape[1:]
+
+            # Add to grouped structures
             grouped_images[shape].append(image)
+            for paired_index, paired_value in enumerate(paired_values):
+                paired_grouped_values[paired_index][shape].append(paired_value)
             grouped_images_index[key] = (shape, len(grouped_images[shape]) - 1)
 
-    return grouped_images, grouped_images_index
+    return grouped_images, *paired_grouped_values, grouped_images_index
 
 
 def _reconstruct_nested_structure(indices, processed_images):
```
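`split_to_tiles` is hoisted here unchanged from the Llama4 fast processor so the new Mllama fast processor can reuse it. A quick shape check of the reshape/permute it performs:

```python
import torch
from transformers.image_transforms import split_to_tiles

images = torch.rand(2, 3, 448, 448)  # (batch, channels, height, width)
tiles = split_to_tiles(images, num_tiles_height=2, num_tiles_width=2)
print(tiles.shape)  # torch.Size([2, 4, 3, 224, 224]) — a 2x2 grid of 224x224 tiles per image
```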
```diff
@@ -844,13 +880,35 @@ def _reconstruct_nested_structure(indices, processed_images):
     return result
 
 
+def _disable_grouping_output_nested(images, *paired_inputs):
+    """Build the disable_grouping output tuple for a single-level nested structure."""
+    outer_range = range(len(images))
+    inner_ranges = [range(len(images[i])) for i in outer_range]
+
+    # Precompute all (i, j) pairs
+    ij_pairs = [(i, j) for i in outer_range for j in inner_ranges[i]]
+
+    images_dict = {(i, j): images[i][j].unsqueeze(0) for (i, j) in ij_pairs}
+    paired_dicts = [{(i, j): paired_list[i][j].unsqueeze(0) for (i, j) in ij_pairs} for paired_list in paired_inputs]
+    index_map = {(i, j): ((i, j), 0) for (i, j) in ij_pairs}
+    return images_dict, *paired_dicts, index_map
+
+
+def _disable_grouping_output_flat(images, *paired_inputs):
+    """Build the disable_grouping output tuple for a flat list structure."""
+    idx_range = range(len(images))
+    images_dict = {i: images[i].unsqueeze(0) for i in idx_range}
+    paired_dicts = [{i: paired_list[i].unsqueeze(0) for i in idx_range} for paired_list in paired_inputs]
+    index_map = {i: (i, 0) for i in idx_range}
+    return images_dict, *paired_dicts, index_map
+
+
 def group_images_by_shape(
     images: Union[list["torch.Tensor"], "torch.Tensor"],
-    disable_grouping: bool,
+    *paired_inputs,
+    disable_grouping: Optional[bool],
     is_nested: bool = False,
-) -> tuple[
-    dict[tuple[int, int], list["torch.Tensor"]], dict[Union[int, tuple[int, int]], tuple[tuple[int, int], int]]
-]:
+) -> tuple[dict, ...]:
     """
     Groups images by shape.
     Returns a dictionary with the shape as key and a list of images with that shape as value,
@@ -862,15 +920,22 @@ def group_images_by_shape(
     Args:
         images (Union[list["torch.Tensor"], "torch.Tensor"]):
             A list of images or a single tensor
+        *paired_inputs (Any):
+            Zero or more lists that mirror the structure of `images` (flat list, or list of lists when
+            `is_nested=True`). Each element is paired 1:1 with the corresponding image so it can be grouped by the
+            same shape key. These paired values are grouped alongside `images` but are not stacked in the output, so
+            they do not need to be tensors.
         disable_grouping (bool):
             Whether to disable grouping. If None, will be set to True if the images are on CPU, and False otherwise.
             This choice is based on empirical observations, as detailed here: https://github.com/huggingface/transformers/pull/38157
         is_nested (bool, *optional*, defaults to False):
             Whether the images are nested.
 
     Returns:
-        tuple[dict[tuple[int, int], list["torch.Tensor"]], dict[Union[int, tuple[int, int]], tuple[tuple[int, int], int]]]:
-            - A dictionary with shape as key and list of images with that shape as value
+        tuple[dict, ...]:
+            - A dictionary with shape as key and list/batch of images with that shape as value
+            - Zero or more dictionaries (one per argument in `*paired_inputs`) grouped consistently with `images`; these carry
+              the corresponding per-item values and are not stacked
             - A dictionary mapping original indices to (shape, index) tuples
     """
     # If disable grouping is not explicitly provided, we favor disabling it if the images are on CPU, and enabling it otherwise.
@@ -880,19 +945,19 @@ def group_images_by_shape(
 
     if disable_grouping:
         if is_nested:
-            return {(i, j): images[i][j].unsqueeze(0) for i in range(len(images)) for j in range(len(images[i]))}, {
-                (i, j): ((i, j), 0) for i in range(len(images)) for j in range(len(images[i]))
-            }
+            return _disable_grouping_output_nested(images, *paired_inputs)
         else:
-            return {i: images[i].unsqueeze(0) for i in range(len(images))}, {i: (i, 0) for i in range(len(images))}
+            return _disable_grouping_output_flat(images, *paired_inputs)
 
     # Handle single level nested structure
-    grouped_images, grouped_images_index = _group_images_by_shape(images, is_nested)
+    grouped_images, *paired_grouped_values, grouped_images_index = _group_images_by_shape(
+        images, *paired_inputs, is_nested=is_nested
+    )
 
     # Stack images with the same shape
     grouped_images = {shape: torch.stack(images_list, dim=0) for shape, images_list in grouped_images.items()}
 
-    return grouped_images, grouped_images_index
+    return grouped_images, *paired_grouped_values, grouped_images_index
 
 
 def reorder_images(
```
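The `*paired_inputs` extension lets callers group per-image metadata (for Mllama, aspect ratios) under the same shape keys as the pixel data and restore the original order afterwards. A small sketch of the call pattern; the aspect-ratio values are made up for illustration:

```python
import torch
from transformers.image_transforms import group_images_by_shape, reorder_images

images = [torch.rand(3, 224, 224), torch.rand(3, 448, 448), torch.rand(3, 224, 224)]
aspect_ratios = [(1, 1), (2, 2), (1, 1)]  # one entry per image; grouped but never stacked

grouped_images, grouped_ratios, grouped_index = group_images_by_shape(
    images, aspect_ratios, disable_grouping=False
)
# Each shape key maps to a stacked batch of images plus the matching list of ratios.
for shape, batch in grouped_images.items():
    print(shape, batch.shape, grouped_ratios[shape])

# Processed batches can be scattered back to the original flat order afterwards.
processed = {shape: batch * 2 for shape, batch in grouped_images.items()}
restored = reorder_images(processed, grouped_index)
print(len(restored))  # 3, in the original order
```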

src/transformers/models/auto/image_processing_auto.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -134,7 +134,7 @@
         ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
         ("mistral3", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
         ("mlcd", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
-        ("mllama", ("MllamaImageProcessor", None)),
+        ("mllama", ("MllamaImageProcessor", "MllamaImageProcessorFast")),
         ("mm-grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")),
         ("mobilenet_v1", ("MobileNetV1ImageProcessor", "MobileNetV1ImageProcessorFast")),
         ("mobilenet_v2", ("MobileNetV2ImageProcessor", "MobileNetV2ImageProcessorFast")),
```

src/transformers/models/idefics2/image_processing_idefics2.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -43,7 +43,6 @@
 
 
 if is_vision_available():
-    import PIL
     from PIL import Image
 
 
@@ -142,7 +141,7 @@ def convert_to_rgb(image: ImageInput) -> ImageInput:
         image (Image):
             The image to convert.
     """
-    if not isinstance(image, PIL.Image.Image):
+    if not isinstance(image, Image.Image):
         return image
 
     # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
```
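The context comment refers to the usual alpha-compositing workaround: paste the image onto a white background before dropping the alpha channel, instead of calling `convert("RGB")` directly. A sketch of that approach (what helpers of this kind typically do, not a copy of the changed file):

```python
from PIL import Image

def convert_to_rgb(image: Image.Image) -> Image.Image:
    """Composite an image with an alpha channel onto a white background, then drop alpha."""
    if image.mode == "RGB":
        return image
    image_rgba = image.convert("RGBA")
    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
    # alpha_composite blends transparent regions against the white background
    composited = Image.alpha_composite(background, image_rgba)
    return composited.convert("RGB")
```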

src/transformers/models/llama4/image_processing_llama4_fast.py

Lines changed: 1 addition & 24 deletions
```diff
@@ -28,6 +28,7 @@
     group_images_by_shape,
     reorder_images,
 )
+from ...image_transforms import split_to_tiles
 from ...image_utils import ImageInput, PILImageResampling, SizeDict
 from ...processing_utils import ImagesKwargs, Unpack
 from ...utils import (
@@ -92,30 +93,6 @@ def get_max_res_without_distortion(
     return new_height, new_width
 
 
-def split_to_tiles(images: torch.Tensor, num_tiles_height: int, num_tiles_width: int) -> torch.Tensor:
-    # Split image into number of required tiles (width x height)
-    batch_size, num_channels, height, width = images.size()
-    images = images.view(
-        batch_size,
-        num_channels,
-        num_tiles_height,
-        height // num_tiles_height,
-        num_tiles_width,
-        width // num_tiles_width,
-    )
-    # Permute dimensions to reorder the axes
-    image = images.permute(0, 2, 4, 1, 3, 5).contiguous()
-    # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2)
-    image = image.view(
-        batch_size,
-        num_tiles_width * num_tiles_height,
-        num_channels,
-        height // num_tiles_height,
-        width // num_tiles_width,
-    )
-    return image
-
-
 @lru_cache(maxsize=1)
 def find_supported_resolutions(max_num_chunks: int, patch_size: SizeDict) -> torch.Tensor:
     """
```

src/transformers/models/mllama/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -20,6 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_mllama import *
     from .image_processing_mllama import *
+    from .image_processing_mllama_fast import *
     from .modeling_mllama import *
     from .processing_mllama import *
 else:
```

src/transformers/models/mllama/image_processing_mllama.py

Lines changed: 2 additions & 27 deletions
```diff
@@ -43,7 +43,6 @@
 
 
 if is_vision_available():
-    import PIL
     from PIL import Image
 
 
@@ -407,30 +406,6 @@ def pack_images(
     return stacked_images, all_num_tiles
 
 
-def pack_aspect_ratios(aspect_ratios: list[list[tuple[int, int]]], pad_value: int = 1) -> np.ndarray:
-    """
-    Stack a list of aspect ratios into a numpy array.
-
-    Args:
-        aspect_ratios (`list[list[tuple[int, int]]]`):
-            A list of aspect ratios.
-        pad_value (`int`, *optional*, defaults to 1):
-            The value to pad the aspect ratios with.
-
-    Returns:
-        `np.ndarray`:
-            The aspect ratios stacked into a numpy array with shape (batch_size, max_num_images, 2).
-    """
-    batch_size = len(aspect_ratios)
-    max_num_images = max(len(row) for row in aspect_ratios)
-
-    aspect_ratios_stacked = np.full((batch_size, max_num_images, 2), pad_value, dtype=np.int64)
-    for i, row in enumerate(aspect_ratios):
-        if len(row) > 0:
-            aspect_ratios_stacked[i, : len(row)] = np.array(row)
-    return aspect_ratios_stacked
-
-
 def convert_aspect_ratios_to_ids(aspect_ratios: list[list[tuple[int, int]]], max_image_tiles: int) -> np.ndarray:
     """
     Convert aspect ratio tuples to unique ids.
@@ -511,7 +486,7 @@ def convert_to_rgb(image: ImageInput) -> ImageInput:
         image (Image):
             The image to convert.
     """
-    if not isinstance(image, PIL.Image.Image):
+    if not isinstance(image, Image.Image):
         return image
 
     # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
@@ -718,7 +693,7 @@ def preprocess(
         # iterate over images in a batch sample
         for image in images:
             # default PIL images to channels_last
-            if input_data_format is None and isinstance(image, PIL.Image.Image):
+            if input_data_format is None and isinstance(image, Image.Image):
                 input_data_format = ChannelDimension.LAST
 
             # convert to numpy array for processing
```
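`pack_aspect_ratios` is removed because nothing calls it anymore; aspect ratios flow through `convert_aspect_ratios_to_ids` instead. Purely as an illustration of the id scheme, each per-image tile grid is looked up in a table of supported grids, 1-based, with 0 used as padding — the supported-grid table and its ordering below are hypothetical, not the real module's:

```python
import numpy as np

def aspect_ratios_to_ids(aspect_ratios, supported, pad_value=0):
    """Map each (tiles_h, tiles_w) pair to its 1-based index in `supported`;
    rows are padded with `pad_value` up to the largest number of images."""
    batch_size = len(aspect_ratios)
    max_num_images = max(len(row) for row in aspect_ratios)
    ids = np.full((batch_size, max_num_images), pad_value, dtype=np.int64)
    for i, row in enumerate(aspect_ratios):
        for j, ratio in enumerate(row):
            ids[i, j] = supported.index(tuple(ratio)) + 1
    return ids

supported = [(1, 1), (1, 2), (2, 1), (1, 3), (3, 1), (1, 4), (2, 2), (4, 1)]  # illustrative order only
print(aspect_ratios_to_ids([[(1, 1), (2, 2)], [(1, 2)]], supported))
# [[1 7]
#  [2 0]]
```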
