Commit 60c1454

apply updates smolVLM (still needs workaround for chat template)
1 parent a31fa21

7 files changed: +325 −329 lines changed

src/transformers/models/smolvlm/processing_smolvlm.py

Lines changed: 57 additions & 156 deletions
@@ -16,26 +16,29 @@
 Processor class for SmolVLM.
 """
 
-import copy
 from datetime import timedelta
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
-import numpy as np
-
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput, make_nested_list_of_images
 from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import BatchEncoding, TextInput
 from ...utils import is_num2words_available, is_vision_available, logging
-from ...video_utils import VideoInput, load_video, make_batched_videos
+from ...video_utils import VideoInput
 
 
 if is_vision_available():
     from .video_processing_smolvlm import (
         DEFAULT_MEDIA_OUTTRO,
         DEFAULT_VIDEO_INTRO,
         FRAME_TIMESTAMP_MESSAGE,
-        smolvlm_sample_indices_fn,
+    )
+
+if is_vision_available():
+    from .video_processing_smolvlm import (
+        DEFAULT_MEDIA_OUTTRO,
+        DEFAULT_VIDEO_INTRO,
+        FRAME_TIMESTAMP_MESSAGE,
     )
 
 if TYPE_CHECKING:
@@ -141,9 +144,7 @@ class SmolVLMProcessor(ProcessorMixin):
     attributes = ["image_processor", "tokenizer", "video_processor"]
     valid_kwargs = ["image_seq_len", "chat_template"]
     image_processor_class = "SmolVLMImageProcessor"
-    video_processor_class = (
-        "SmolVLMImageProcessor"  # TODO: raushan should be VideoProcessor when LANCZOS resizing is settled
-    )
+    video_processor_class = "SmolVLMVideoProcessor"  # NOTE: uses different interpolation than slow processors
     tokenizer_class = "AutoTokenizer"
 
     def __init__(
@@ -161,17 +162,7 @@ def __init__(
         self.end_of_utterance_token = getattr(tokenizer, "end_of_utterance_token", "<end_of_utterance>")
         self.global_image_token = getattr(tokenizer, "global_image_token", "<global-img>")
         self.image_seq_len = image_seq_len
-
-        self.video_size = video_processor.video_sampling["video_size"]
-        self.image_size = image_processor.size
-
-        self.do_image_splitting = image_processor.do_image_splitting
-        self.do_video_splitting = video_processor.video_sampling.get("do_image_splitting", False)
-
-        self.default_max_frames = video_processor.video_sampling["max_frames"]
-        self.default_fps = video_processor.video_sampling["fps"]
-        # Matches one or more occurrences of <row_x_col_y> tags (where x and y are digits, optionally surrounded by newline characters
-        # self._regex_to_remove_extra_special_tokens = re.compile(r"(<row_\d+_col_\d+>\n?)+")
+        self.video_token = getattr(tokenizer, "video_token", "<video>")
 
         if not num2words:
             raise ImportError(
@@ -180,14 +171,12 @@ def __init__(
 
         super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template, **kwargs)
 
-    def process_vision(self, text, images, output_kwargs, do_image_splitting=False, image_processor_size=None):
+    def process_vision(self, text, images, output_kwargs):
         if text is not None:
             n_images_in_text = [sample.count(self.image_token) for sample in text]
 
         n_images_in_images = [len(sublist) for sublist in images]
-        image_inputs = self.image_processor(
-            images, do_image_splitting=do_image_splitting, size=image_processor_size, **output_kwargs["images_kwargs"]
-        )
+        image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
 
         if text is None:
             return None, image_inputs
@@ -226,6 +215,50 @@ def process_vision(self, text, images, output_kwargs, do_image_splitting=False,
 
         return prompt_strings, image_inputs
 
+    def process_video(self, text, videos, output_kwargs):
+        if text is not None:
+            n_videos_in_text = [sample.count(self.video_token) for sample in text]
+
+        n_videos_in_videos = [len(sublist) for sublist in videos]
+        video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
+
+        num_frames = video_inputs["pixel_values"].shape[1]
+        batch_timestamps = iter(video_inputs.pop("timestamps"))
+        batch_durations = iter(video_inputs.pop("durations"))
+
+        if text is None:
+            return None, video_inputs
+
+        if n_videos_in_videos != n_videos_in_text:
+            raise ValueError(
+                f"The number of videos in the text {n_videos_in_text} and videos {n_videos_in_videos} should be the same."
+            )
+
+        prompt_strings = []
+        for sample in text:
+            while self.video_token in sample:
+                timestamps = next(batch_timestamps)
+                duration = next(batch_durations)
+                duration_td = timedelta(seconds=int(duration))
+                image_prompt_strings = DEFAULT_VIDEO_INTRO.format(
+                    frame_count=num2words(num_frames), video_duration=str(duration_td)
+                )
+                for timestamp in timestamps:
+                    image_prompt_string = _prompt_single_image(
+                        self.image_seq_len,
+                        image_token=self.image_token,
+                        fake_token_around_image=self.fake_image_token,
+                        global_image_token=self.global_image_token,
+                    )
+                    timestamp = f"{timestamp[0]:02d}:{timestamp[1]:02d}"
+                    image_prompt_string = FRAME_TIMESTAMP_MESSAGE.format(timestamp=timestamp) + image_prompt_string
+                    image_prompt_strings += image_prompt_string
+
+                image_prompt_strings += DEFAULT_MEDIA_OUTTRO
+                sample = sample.replace(self.video_token, image_prompt_strings, 1)
+            prompt_strings.append(sample)
+        return prompt_strings, video_inputs
+
     def __call__(
         self,
         images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None,
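The added process_video mirrors process_vision: it counts video tokens per prompt, runs the video processor, then replaces each token with a video intro, one timestamped image prompt per sampled frame, and an outro. A minimal usage sketch of the new path, assuming a SmolVLM2 checkpoint id, a dummy 4D frame array, and the nesting implied by the n_videos_in_videos check above (all illustrative, not part of this commit):

import numpy as np
from transformers import AutoProcessor

# Checkpoint id is an assumption; any processor shipping this SmolVLM update should work.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")

# One dummy video: 16 RGB frames of 384x384 as a single 4D array.
video = np.zeros((16, 384, 384, 3), dtype=np.uint8)

# One list of videos per prompt, matching the per-sample count in process_video().
text = [f"User: {processor.video_token} What happens in this clip?\nAssistant:"]
inputs = processor(text=text, videos=[[video]], return_tensors="pt")

# process_video() popped "timestamps"/"durations" and expanded the prompt, so the
# output carries only token ids, attention masks, and pixel values.
print(inputs["input_ids"].shape, inputs["pixel_values"].shape)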
@@ -310,18 +343,13 @@ def __call__(
                 text,
                 images,
                 output_kwargs,
-                do_image_splitting=self.do_image_splitting,
-                image_processor_size=self.image_size,
             )
             inputs.update(vision_inputs)
         elif videos is not None:
-            videos = make_batched_videos(videos)
-            text, vision_inputs = self.process_vision(
+            text, vision_inputs = self.process_video(
                 text,
                 videos,
                 output_kwargs,
-                do_image_splitting=self.do_image_splitting,
-                image_processor_size=self.video_size,
             )
             inputs.update(vision_inputs)
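On the image side, the explicit do_image_splitting and image_processor_size arguments are gone; such options now reach the image processor through output_kwargs["images_kwargs"]. A sketch of the image path under the same checkpoint assumption as the video example above:

import numpy as np
from PIL import Image

# Reuses `processor` from the video sketch above.
image = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))
text = [f"User: {processor.image_token} Describe the image.\nAssistant:"]

# do_image_splitting is forwarded through the merged images kwargs instead of a
# dedicated process_vision() parameter (kwarg name per the SmolVLM image processor).
inputs = processor(images=[[image]], text=text, do_image_splitting=False)
print(len(inputs["input_ids"][0]))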

@@ -334,93 +362,6 @@ def __call__(
 
         return BatchFeature(inputs, tensor_type=return_tensors)
 
-    def _process_messages_for_chat_template(
-        self,
-        conversations: List[List[Dict[str, str]]],
-        batch_images: List[ImageInput],
-        batch_videos: List[VideoInput],
-        batch_video_metadata: List[List[Dict[str, any]]],
-        **chat_template_kwargs,
-    ):
-        """
-        Used within `apply_chat_template` when a model has special way to process conversation history. For example,
-        video models might want to specify in the prompt the duration of video or which frame indices at which timestamps
-        were sampled. This information cannot be accessed before the video is loaded.
-        For most models it is a no-op, must be overridden by model processors which require special processing.
-        Args:
-            conversation (`List[Dict, str, str]`):
-                The conversation to process. Always comes in batched format.
-            batch_images (`List[List[ImageInput]]`):
-                Batch of images that were loaded from url/path defined in the conversation. The images
-                are ordered in the same way as in the conversation. Comes in nested list format, one list of `PIL` images
-                per batch.
-            batch_videos (`List[List[ImageInput]]`):
-                Batch of videos that were loaded from url/path defined in the conversation. The videos
-                are ordered in the same way as in the conversation. Comes in nested list format, one list of 4D video arrays
-                per batch.
-            batch_video_metadata (`List[List[Dict[[str, any]]]]`):
-                Batch of metadata returned from loading videos. That includes video fps, duration and total number of framer in original video.
-                Metadata are ordered in the same way as `batch_videos`. Comes in nested list format, one list of 4D video arrays
-                per batch.
-        """
-        # We don't want to modify in-place the messages passed by user
-        # The user might want to add new turn on conv and continue generation
-        conversations = copy.deepcopy(conversations)
-        batch_num_frames, batch_timestamps = [], []
-        for metadata_list, video_list in zip(batch_video_metadata, batch_videos):
-            for metadata, video in zip(metadata_list, video_list):
-                duration_sec = getattr(metadata, "duration")
-                frames_idx = getattr(metadata, "frames_indices")
-                fps = getattr(metadata, "fps")
-
-                timestamps = []
-                for idx, frame_np in zip(frames_idx, video):
-                    sec = idx / fps
-                    mm = int(sec // 60)
-                    ss = int(sec % 60)
-                    timestamps.append(f"{mm:02d}:{ss:02d}")
-                batch_timestamps.append(timestamps)
-                batch_num_frames.append(len(video))
-
-        for conversation in conversations:
-            # For each message, scan content for {"type": "video"}
-            for msg in conversation:
-                if "content" not in msg:
-                    continue
-
-                new_content = []
-                for block in msg["content"]:
-                    if block.get("type") == "video":
-                        curr_timestamps = batch_timestamps.pop(0)
-                        curr_num_frames = batch_num_frames.pop(0)
-
-                        # Build the video intro texts
-                        td = timedelta(seconds=int(duration_sec))
-                        new_content.append(
-                            {
-                                "type": "text",
-                                "text": DEFAULT_VIDEO_INTRO.format(
-                                    frame_count=num2words(curr_num_frames), video_duration=str(td)
-                                ),
-                            }
-                        )
-
-                        # 2) Insert per-frame lines: "Frame from {timestamp}:", then an "image" block
-                        for i, ts in enumerate(curr_timestamps):
-                            new_content.append({"type": "text", "text": FRAME_TIMESTAMP_MESSAGE.format(timestamp=ts)})
-                            new_content.append({"type": "image"})
-
-                        # 3) Optionally add an outro (e.g. "Now answer the question:")
-                        new_content.append({"type": "text", "text": DEFAULT_MEDIA_OUTTRO})
-                        # Do NOT add the original block => we skip it (since we've replaced it)
-                    else:
-                        # keep original block
-                        new_content.append(block)
-
-                # update the content
-                msg["content"] = new_content
-        return conversations
-
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to SmolVLMTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
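The deleted _process_messages_for_chat_template hook is what rewrote each {"type": "video"} block in a conversation into an intro text, per-frame "Frame from MM:SS:" texts paired with image blocks, and an outro; with it gone, that expansion happens in process_video, and the chat template itself still needs the workaround named in the commit message. A condensed, standalone sketch of the transformation the hook performed (the inlined template strings are assumptions, not the exact constants from video_processing_smolvlm):

from datetime import timedelta

# Assumed stand-ins for DEFAULT_VIDEO_INTRO / FRAME_TIMESTAMP_MESSAGE / DEFAULT_MEDIA_OUTTRO.
VIDEO_INTRO = "You are provided the following series of {frame_count} frames from a {video_duration} [H:MM:SS] video.\n"
FRAME_MESSAGE = "\nFrame from {timestamp}:"
MEDIA_OUTTRO = "\n\n"

def expand_video_block(num_frames, duration_sec, timestamps):
    """Rebuild the content blocks the removed hook substituted for one video block."""
    # The original spelled out frame_count with num2words(); plain digits here for brevity.
    blocks = [{
        "type": "text",
        "text": VIDEO_INTRO.format(
            frame_count=num_frames,
            video_duration=str(timedelta(seconds=int(duration_sec))),
        ),
    }]
    for ts in timestamps:  # ts like "00:05", computed as frame_index / fps in the removed code
        blocks.append({"type": "text", "text": FRAME_MESSAGE.format(timestamp=ts)})
        blocks.append({"type": "image"})
    blocks.append({"type": "text", "text": MEDIA_OUTTRO})
    return blocks

# Example: a 3-frame, 12-second clip sampled at 00:00, 00:06, and 00:12.
print(expand_video_block(3, 12, ["00:00", "00:06", "00:12"]))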
@@ -443,45 +384,5 @@ def model_input_names(self):
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(image_processor_input_names + tokenizer_input_names))
 
-    # TODO: raushan, has to be public method under `VideoProcessorBase` when API is added
-    def _load_video_for_model(
-        self,
-        video: Union[str, "VideoInput"],
-        num_frames: Optional[int] = None,
-        fps: Optional[int] = None,
-        backend: str = "opencv",
-        skip_secs: int = 0.0,
-        **kwargs,
-    ) -> np.array:
-        """
-        Loads `video` to a numpy array.
-
-        Args:
-            video (`str` or `VideoInput`):
-                The video to convert to the numpy array format. Can be a link to video or local path.
-            num_frames (`int`, *optional*):
-                Number of frames to sample uniformly. If not passed, the whole video is loaded.
-            fps (`int`, *optional*):
-                Number of frames to sample per second. Should be passed only when `num_frames=None`.
-                If not specified and `num_frames==None`, all frames are sampled.
-            backend (`str`, *optional*, defaults to `"opencv"`):
-                The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".
-
-        Returns:
-            Tuple[`np.array`, Dict]: A tuple containing:
-                - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]).
-                - Metadata dictionary.
-        """
-        max_frames = self.default_max_frames if num_frames is None else num_frames
-        target_fps = self.default_fps if fps is None else fps
-
-        def sample_indices_fn_func(metadata, **fn_kwargs):
-            return smolvlm_sample_indices_fn(
-                metadata, max_frames=max_frames, target_fps=target_fps, skip_secs=skip_secs, **fn_kwargs
-            )
-
-        video, metadata = load_video(video, backend=backend, sample_indices_fn=sample_indices_fn_func)
-        return video, metadata
-
 
 __all__ = ["SmolVLMProcessor"]
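With _load_video_for_model removed, video loading and frame selection move out of the processor and into the SmolVLMVideoProcessor pipeline. For reference, the deleted method only wrapped smolvlm_sample_indices_fn with the processor's max_frames/fps defaults; below is a rough, hypothetical re-creation of that kind of fps-capped uniform sampling. The real function lives in video_processing_smolvlm and may differ, for example in how skip_secs is applied:

import numpy as np

def sample_frame_indices(total_frames, video_fps, target_fps, max_frames, skip_secs=0.0):
    # Optionally skip the first `skip_secs` seconds of the clip.
    start = min(int(skip_secs * video_fps), max(total_frames - 1, 0))
    # Number of frames a `target_fps` sampling rate would yield, capped at `max_frames`.
    duration_sec = (total_frames - start) / video_fps
    num_samples = min(max_frames, max(1, round(duration_sec * target_fps)))
    # Uniformly spaced frame indices over the usable range.
    return np.linspace(start, total_frames - 1, num_samples).round().astype(int)

# Example: a 10 s clip at 30 fps, sampled at 1 fps with at most 64 frames -> 10 indices.
print(sample_frame_indices(total_frames=300, video_fps=30, target_fps=1, max_frames=64))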
