From df1b6c4a86e5e58bc05add6a6e26a1fafcf40bb4 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Tue, 26 Dec 2023 02:07:01 +0530 Subject: [PATCH 01/47] begin animatediff img2video and video2video --- .../animatediff/pipeline_animatediff.py | 421 +++++++++++++++++- 1 file changed, 397 insertions(+), 24 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 0dab722e51a8..804762ef4cd1 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -57,6 +57,134 @@ """ +def lerp( + v0: Union[torch.Tensor, np.ndarray], + v1: Union[torch.Tensor, np.ndarray], + t: Union[float, torch.Tensor, np.ndarray], +) -> Union[torch.Tensor, np.ndarray]: + """ + Linearly interpolate between two vectors/tensors. + + Parameters + ---------- + v0: Union[torch.Tensor, np.ndarray] + First vector/tensor. + v1: Union[torch.Tensor, np.ndarray] + Second vector/tensor. + t: Union[float, torch.Tensor, np.ndarray] + Interpolation factor. If float, must be between 0 and 1. If np.ndarray or + torch.Tensor, must be one dimensional with values between 0 and 1. + + Returns + ------- + Union[torch.Tensor, np.ndarray] + Interpolated vector/tensor between v0 and v1. + """ + inputs_are_torch = False + t_is_float = False + + if isinstance(v0, torch.Tensor): + inputs_are_torch = True + input_device = v0.device + v0 = v0.cpu().numpy() + if isinstance(v1, torch.Tensor): + inputs_are_torch = True + input_device = v1.device + v1 = v1.cpu().numpy() + if isinstance(t, torch.Tensor): + inputs_are_torch = True + input_device = t.device + t = t.cpu().numpy() + elif isinstance(t, float): + t_is_float = True + t = np.array([t]) + + t = t[..., None] + v0 = v0[None, ...] + v1 = v1[None, ...] + v2 = (1 - t) * v0 + t * v1 + + if t_is_float and v0.ndim > 1: + assert v2.shape[0] == 1 + v2 = np.squeeze(v2, axis=0) + if inputs_are_torch: + v2 = torch.from_numpy(v2).to(input_device) + + return v2 + + +def slerp( + v0: Union[torch.Tensor, np.ndarray], + v1: Union[torch.Tensor, np.ndarray], + t: Union[float, torch.Tensor, np.ndarray], + DOT_THRESHOLD=0.9995, +) -> Union[torch.Tensor, np.ndarray]: + """ + Spherical linear interpolation between two vectors/tensors. + + Parameters + ---------- + v0: Union[torch.Tensor, np.ndarray] + First vector/tensor. + v1: Union[torch.Tensor, np.ndarray] + Second vector/tensor. + t: Union[float, np.ndarray] + Interpolation factor. If float, must be between 0 and 1. If np.ndarray, must be one + dimensional with values between 0 and 1. + DOT_THRESHOLD: float + Threshold for when to use linear interpolation instead of spherical interpolation. + + Returns + ------- + Union[torch.Tensor, np.ndarray] + Interpolated vector/tensor between v0 and v1. 
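As background for why both interpolators are provided: for high-dimensional Gaussian noise, linear interpolation pulls intermediate points toward the origin, while spherical interpolation keeps them at roughly the original norm. The standalone sketch below is an editorial illustration (not part of this patch) that only needs NumPy and makes the difference concrete.

```py
# Compare lerp and slerp on Gaussian-like "latents": slerp approximately preserves
# the vector norm along the interpolation path, while lerp shrinks it near the midpoint.
import numpy as np

rng = np.random.default_rng(0)
v0 = rng.standard_normal(4 * 64 * 64).astype(np.float32)
v1 = rng.standard_normal(4 * 64 * 64).astype(np.float32)


def slerp_1d(a, b, t, dot_threshold=0.9995):
    dot = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    if np.abs(dot) > dot_threshold:  # nearly parallel: fall back to plain lerp
        return (1 - t) * a + t * b
    theta = np.arccos(dot)
    return (np.sin((1 - t) * theta) * a + np.sin(t * theta) * b) / np.sin(theta)


for t in (0.0, 0.25, 0.5, 0.75, 1.0):
    lerped = (1 - t) * v0 + t * v1
    slerped = slerp_1d(v0, v1, t)
    print(f"t={t:.2f}  |lerp|={np.linalg.norm(lerped):6.1f}  |slerp|={np.linalg.norm(slerped):6.1f}")
```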
+ """ + inputs_are_torch = False + t_is_float = False + + if isinstance(v0, torch.Tensor): + inputs_are_torch = True + input_device = v0.device + v0 = v0.cpu().numpy() + if isinstance(v1, torch.Tensor): + inputs_are_torch = True + input_device = v1.device + v1 = v1.cpu().numpy() + if isinstance(t, torch.Tensor): + inputs_are_torch = True + input_device = t.device + t = t.cpu().numpy() + elif isinstance(t, float): + t_is_float = True + t = np.array([t], dtype=v0.dtype) + + dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) + if np.abs(dot) > DOT_THRESHOLD: + # v1 and v2 are close to parallel + # Use linear interpolation instead + v2 = lerp(v0, v1, t) + else: + theta_0 = np.arccos(dot) + sin_theta_0 = np.sin(theta_0) + theta_t = theta_0 * t + sin_theta_t = np.sin(theta_t) + s0 = np.sin(theta_0 - theta_t) / sin_theta_0 + s1 = sin_theta_t / sin_theta_0 + s0 = s0[..., None] + s1 = s1[..., None] + v0 = v0[None, ...] + v1 = v1[None, ...] + v2 = s0 * v0 + s1 * v1 + + if t_is_float and v0.ndim > 1: + assert v2.shape[0] == 1 + v2 = np.squeeze(v2, axis=0) + if inputs_are_torch: + v2 = torch.from_numpy(v2).to(input_device) + + return v2 + + def tensor2vid(video: torch.Tensor, processor, output_type="np"): # Based on: # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 @@ -72,6 +200,65 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"): return outputs +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. 
+ """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + @dataclass class AnimateDiffPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray] @@ -506,9 +693,30 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents def prepare_latents( - self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + self, + image, + batch_size, + num_channels_latents, + num_frames, + height, + width, + dtype, + device, + generator, + latents=None, + interpolation_method="slerp", ): shape = ( batch_size, @@ -517,29 +725,80 @@ def prepare_latents( height // self.vae_scale_factor, width // self.vae_scale_factor, ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + if image is None: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + else: - latents = latents.to(device) + image = image.to(device=device, dtype=dtype) - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents + if image.shape[1] == 4: + init_latents = image + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list): + if len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + init_latents = self.vae.config.scaling_factor * init_latents + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + for i in range(num_frames): + alpha = i / num_frames + + if interpolation_method == "repeat": + latents[:, :, i, :, :] = init_latents.detach().clone() + elif interpolation_method == "lerp": + latents[:, :, i, :, :] = lerp(latents[:, :, i, :, :], init_latents, alpha) + elif interpolation_method == "slerp": + latents[:, :, i, :, :] = slerp(latents[:, :, i, :, :], init_latents, alpha) + else: + raise NotImplementedError + + return latents @torch.no_grad() def __call__( self, - prompt: Union[str, List[str]] = None, + prompt: Optional[Union[str, List[str]]] = None, + image: Optional[PipelineImageInput] = None, num_frames: Optional[int] = 16, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_videos_per_prompt: Optional[int] = 1, @@ -555,6 +814,7 @@ def __call__( callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, + interpolation_method: str = "slerp", ): r""" The call function to the pipeline for generation. @@ -661,6 +921,7 @@ def __call__( lora_scale=text_encoder_lora_scale, clip_skip=clip_skip, ) + # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes @@ -675,22 +936,26 @@ def __call__( if do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) + if image is not None: + image = self.image_processor.preprocess(image) + # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) # 5. 
Prepare latent variables num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( - batch_size * num_videos_per_prompt, - num_channels_latents, - num_frames, - height, - width, - prompt_embeds.dtype, - device, - generator, - latents, + image=image, + batch_size=batch_size * num_videos_per_prompt, + num_channels_latents=num_channels_latents, + num_frames=num_frames, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=latents, + interpolation_method=interpolation_method, ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -747,3 +1012,111 @@ def __call__( return (video,) return AnimateDiffPipelineOutput(frames=video) + + @torch.no_grad() + def text_to_video( + self, + prompt: Union[str, List[str]], + num_frames: Optional[int] = 16, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + Refer to the documentation of AnimateDiffPipeline.__call__. 
+ """ + return self.__call__( + prompt=prompt, + num_frames=num_frames, + height=height, + width=width, + num_inference_steps=num_inference_steps, + timesteps=timesteps, + guidance_scale=guidance_scale, + negative_prompt=negative_prompt, + num_videos_per_prompt=num_videos_per_prompt, + eta=eta, + generator=generator, + latents=latents, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ip_adapter_image=ip_adapter_image, + output_type=output_type, + return_dict=return_dict, + callback=callback, + callback_steps=callback_steps, + cross_attention_kwargs=cross_attention_kwargs, + clip_skip=clip_skip, + ) + + @torch.no_grad() + def image_to_video( + self, + image: PipelineImageInput, + prompt: Optional[Union[str, List[str]]] = None, + num_frames: Optional[int] = 16, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + interpolation_method: str = "slerp", + ): + r""" + Refer to the documentation of AnimateDiffPipeline.__call__. + """ + return self.__call__( + image=image, + prompt=prompt, + num_frames=num_frames, + height=height, + width=width, + num_inference_steps=num_inference_steps, + timesteps=timesteps, + guidance_scale=guidance_scale, + negative_prompt=negative_prompt, + num_videos_per_prompt=num_videos_per_prompt, + eta=eta, + generator=generator, + latents=latents, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ip_adapter_image=ip_adapter_image, + output_type=output_type, + return_dict=return_dict, + callback=callback, + callback_steps=callback_steps, + cross_attention_kwargs=cross_attention_kwargs, + clip_skip=clip_skip, + interpolation_method=interpolation_method, + ) From 4be3068d104b641bf25acd543c9d1db40c2495fd Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 27 Dec 2023 03:42:14 +0530 Subject: [PATCH 02/47] revert animatediff to original implementation --- .../animatediff/pipeline_animatediff.py | 433 ++---------------- 1 file changed, 34 insertions(+), 399 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 804762ef4cd1..b0fe790c2222 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -33,7 +33,14 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers +from ...utils import ( + USE_PEFT_BACKEND, + BaseOutput, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline @@ -47,7 +54,7 @@ >>> from diffusers import MotionAdapter, 
AnimateDiffPipeline, DDIMScheduler >>> from diffusers.utils import export_to_gif - >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") + >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter) >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False) >>> output = pipe(prompt="A corgi walking in the park") @@ -57,134 +64,6 @@ """ -def lerp( - v0: Union[torch.Tensor, np.ndarray], - v1: Union[torch.Tensor, np.ndarray], - t: Union[float, torch.Tensor, np.ndarray], -) -> Union[torch.Tensor, np.ndarray]: - """ - Linearly interpolate between two vectors/tensors. - - Parameters - ---------- - v0: Union[torch.Tensor, np.ndarray] - First vector/tensor. - v1: Union[torch.Tensor, np.ndarray] - Second vector/tensor. - t: Union[float, torch.Tensor, np.ndarray] - Interpolation factor. If float, must be between 0 and 1. If np.ndarray or - torch.Tensor, must be one dimensional with values between 0 and 1. - - Returns - ------- - Union[torch.Tensor, np.ndarray] - Interpolated vector/tensor between v0 and v1. - """ - inputs_are_torch = False - t_is_float = False - - if isinstance(v0, torch.Tensor): - inputs_are_torch = True - input_device = v0.device - v0 = v0.cpu().numpy() - if isinstance(v1, torch.Tensor): - inputs_are_torch = True - input_device = v1.device - v1 = v1.cpu().numpy() - if isinstance(t, torch.Tensor): - inputs_are_torch = True - input_device = t.device - t = t.cpu().numpy() - elif isinstance(t, float): - t_is_float = True - t = np.array([t]) - - t = t[..., None] - v0 = v0[None, ...] - v1 = v1[None, ...] - v2 = (1 - t) * v0 + t * v1 - - if t_is_float and v0.ndim > 1: - assert v2.shape[0] == 1 - v2 = np.squeeze(v2, axis=0) - if inputs_are_torch: - v2 = torch.from_numpy(v2).to(input_device) - - return v2 - - -def slerp( - v0: Union[torch.Tensor, np.ndarray], - v1: Union[torch.Tensor, np.ndarray], - t: Union[float, torch.Tensor, np.ndarray], - DOT_THRESHOLD=0.9995, -) -> Union[torch.Tensor, np.ndarray]: - """ - Spherical linear interpolation between two vectors/tensors. - - Parameters - ---------- - v0: Union[torch.Tensor, np.ndarray] - First vector/tensor. - v1: Union[torch.Tensor, np.ndarray] - Second vector/tensor. - t: Union[float, np.ndarray] - Interpolation factor. If float, must be between 0 and 1. If np.ndarray, must be one - dimensional with values between 0 and 1. - DOT_THRESHOLD: float - Threshold for when to use linear interpolation instead of spherical interpolation. - - Returns - ------- - Union[torch.Tensor, np.ndarray] - Interpolated vector/tensor between v0 and v1. 
- """ - inputs_are_torch = False - t_is_float = False - - if isinstance(v0, torch.Tensor): - inputs_are_torch = True - input_device = v0.device - v0 = v0.cpu().numpy() - if isinstance(v1, torch.Tensor): - inputs_are_torch = True - input_device = v1.device - v1 = v1.cpu().numpy() - if isinstance(t, torch.Tensor): - inputs_are_torch = True - input_device = t.device - t = t.cpu().numpy() - elif isinstance(t, float): - t_is_float = True - t = np.array([t], dtype=v0.dtype) - - dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) - if np.abs(dot) > DOT_THRESHOLD: - # v1 and v2 are close to parallel - # Use linear interpolation instead - v2 = lerp(v0, v1, t) - else: - theta_0 = np.arccos(dot) - sin_theta_0 = np.sin(theta_0) - theta_t = theta_0 * t - sin_theta_t = np.sin(theta_t) - s0 = np.sin(theta_0 - theta_t) / sin_theta_0 - s1 = sin_theta_t / sin_theta_0 - s0 = s0[..., None] - s1 = s1[..., None] - v0 = v0[None, ...] - v1 = v1[None, ...] - v2 = s0 * v0 + s1 * v1 - - if t_is_float and v0.ndim > 1: - assert v2.shape[0] == 1 - v2 = np.squeeze(v2, axis=0) - if inputs_are_torch: - v2 = torch.from_numpy(v2).to(input_device) - - return v2 - - def tensor2vid(video: torch.Tensor, processor, output_type="np"): # Based on: # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 @@ -200,65 +79,6 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"): return outputs -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents -def retrieve_latents( - encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" -): - if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": - return encoder_output.latent_dist.sample(generator) - elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": - return encoder_output.latent_dist.mode() - elif hasattr(encoder_output, "latents"): - return encoder_output.latents - else: - raise AttributeError("Could not access latents of provided encoder_output") - - -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps -def retrieve_timesteps( - scheduler, - num_inference_steps: Optional[int] = None, - device: Optional[Union[str, torch.device]] = None, - timesteps: Optional[List[int]] = None, - **kwargs, -): - """ - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Args: - scheduler (`SchedulerMixin`): - The scheduler to get timesteps from. - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. - device (`str` or `torch.device`, *optional*): - The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. - timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. - - Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. 
- """ - if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) - if not accepts_timesteps: - raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" - f" timestep schedules. Please check whether you are using the correct scheduler." - ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) - else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps - - @dataclass class AnimateDiffPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray] @@ -693,30 +513,9 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps - def get_timesteps(self, num_inference_steps, strength, device): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] - - return timesteps, num_inference_steps - t_start - # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents def prepare_latents( - self, - image, - batch_size, - num_channels_latents, - num_frames, - height, - width, - dtype, - device, - generator, - latents=None, - interpolation_method="slerp", + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None ): shape = ( batch_size, @@ -725,80 +524,30 @@ def prepare_latents( height // self.vae_scale_factor, width // self.vae_scale_factor, ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) - if image is None: - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) else: - image = image.to(device=device, dtype=dtype) + latents = latents.to(device) - if image.shape[1] == 4: - init_latents = image - else: - # make sure the VAE is in float32 mode, as it overflows in float16 - if self.vae.config.force_upcast: - image = image.float() - self.vae.to(dtype=torch.float32) - - if isinstance(generator, list): - if len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - init_latents = [ - retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) - for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) - else: - init_latents = retrieve_latents(self.vae.encode(image), generator=generator) - - if self.vae.config.force_upcast: - self.vae.to(dtype) - - init_latents = init_latents.to(dtype) - init_latents = self.vae.config.scaling_factor * init_latents - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - - for i in range(num_frames): - alpha = i / num_frames - - if interpolation_method == "repeat": - latents[:, :, i, :, :] = init_latents.detach().clone() - elif interpolation_method == "lerp": - latents[:, :, i, :, :] = lerp(latents[:, :, i, :, :], init_latents, alpha) - elif interpolation_method == "slerp": - latents[:, :, i, :, :] = slerp(latents[:, :, i, :, :], init_latents, alpha) - else: - raise NotImplementedError - - return latents + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, - prompt: Optional[Union[str, List[str]]] = None, - image: Optional[PipelineImageInput] = None, + prompt: Union[str, List[str]] = None, num_frames: Optional[int] = 16, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, - timesteps: Optional[List[int]] = None, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_videos_per_prompt: Optional[int] = 1, @@ -814,7 +563,6 @@ def __call__( callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, - interpolation_method: str = "slerp", ): r""" The call function to the pipeline for generation. @@ -921,7 +669,6 @@ def __call__( lora_scale=text_encoder_lora_scale, clip_skip=clip_skip, ) - # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes @@ -936,26 +683,22 @@ def __call__( if do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) - if image is not None: - image = self.image_processor.preprocess(image) - # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps # 5. Prepare latent variables num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( - image=image, - batch_size=batch_size * num_videos_per_prompt, - num_channels_latents=num_channels_latents, - num_frames=num_frames, - height=height, - width=width, - dtype=prompt_embeds.dtype, - device=device, - generator=generator, - latents=latents, - interpolation_method=interpolation_method, + batch_size * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, ) # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline @@ -1012,111 +755,3 @@ def __call__( return (video,) return AnimateDiffPipelineOutput(frames=video) - - @torch.no_grad() - def text_to_video( - self, - prompt: Union[str, List[str]], - num_frames: Optional[int] = 16, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: Optional[List[int]] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_videos_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: Optional[int] = None, - ): - r""" - Refer to the documentation of AnimateDiffPipeline.__call__. - """ - return self.__call__( - prompt=prompt, - num_frames=num_frames, - height=height, - width=width, - num_inference_steps=num_inference_steps, - timesteps=timesteps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_videos_per_prompt=num_videos_per_prompt, - eta=eta, - generator=generator, - latents=latents, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ip_adapter_image=ip_adapter_image, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - cross_attention_kwargs=cross_attention_kwargs, - clip_skip=clip_skip, - ) - - @torch.no_grad() - def image_to_video( - self, - image: PipelineImageInput, - prompt: Optional[Union[str, List[str]]] = None, - num_frames: Optional[int] = 16, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - timesteps: Optional[List[int]] = None, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_videos_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: Optional[int] = None, - interpolation_method: str = "slerp", - ): - r""" - Refer to the documentation of AnimateDiffPipeline.__call__. 
- """ - return self.__call__( - image=image, - prompt=prompt, - num_frames=num_frames, - height=height, - width=width, - num_inference_steps=num_inference_steps, - timesteps=timesteps, - guidance_scale=guidance_scale, - negative_prompt=negative_prompt, - num_videos_per_prompt=num_videos_per_prompt, - eta=eta, - generator=generator, - latents=latents, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ip_adapter_image=ip_adapter_image, - output_type=output_type, - return_dict=return_dict, - callback=callback, - callback_steps=callback_steps, - cross_attention_kwargs=cross_attention_kwargs, - clip_skip=clip_skip, - interpolation_method=interpolation_method, - ) From 06b427f3f095227e24c8c78c9b2fcd357b91012c Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 27 Dec 2023 04:24:42 +0530 Subject: [PATCH 03/47] add img2video as pipeline --- .../pipeline_animatediff_img2video.py | 963 ++++++++++++++++++ 1 file changed, 963 insertions(+) create mode 100644 src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py new file mode 100644 index 000000000000..687f97ff2fc2 --- /dev/null +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py @@ -0,0 +1,963 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from dataclasses import dataclass +from types import FunctionType +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...models.unet_motion_model import MotionAdapter +from ...schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler + >>> from diffusers.utils import export_to_gif + + >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") + >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter) + >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False) + >>> output = pipe(prompt="A corgi walking in the park") + >>> frames = output.frames[0] + >>> export_to_gif(frames, "animation.gif") + ``` +""" + + +def lerp( + v0: torch.Tensor, + v1: torch.Tensor, + t: Union[float, torch.Tensor], +) -> torch.Tensor: + t_is_float = False + input_device = v0.device + v0 = v0.cpu().numpy() + v1 = v1.cpu().numpy() + + if isinstance(t, torch.Tensor): + t = t.cpu().numpy() + else: + t_is_float = True + t = np.array([t], dtype=v0.dtype) + + t = t[..., None] + v0 = v0[None, ...] + v1 = v1[None, ...] + v2 = (1 - t) * v0 + t * v1 + + if t_is_float and v0.ndim > 1: + assert v2.shape[0] == 1 + v2 = np.squeeze(v2, axis=0) + + v2 = torch.from_numpy(v2).to(input_device) + return v2 + + +def slerp( + v0: torch.Tensor, + v1: torch.Tensor, + t: Union[float, torch.Tensor], + DOT_THRESHOLD: float = 0.9995, +) -> torch.Tensor: + t_is_float = False + input_device = v0.device + v0 = v0.cpu().numpy() + v1 = v1.cpu().numpy() + + if isinstance(t, torch.Tensor): + t = t.cpu().numpy() + else: + t_is_float = True + t = np.array([t], dtype=v0.dtype) + + dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) + + if np.abs(dot) > DOT_THRESHOLD: + # v1 and v2 are close to parallel, so use linear interpolation instead + v2 = lerp(v0, v1, t) + else: + theta_0 = np.arccos(dot) + sin_theta_0 = np.sin(theta_0) + theta_t = theta_0 * t + sin_theta_t = np.sin(theta_t) + s0 = np.sin(theta_0 - theta_t) / sin_theta_0 + s1 = sin_theta_t / sin_theta_0 + s0 = s0[..., None] + s1 = s1[..., None] + v0 = v0[None, ...] + v1 = v1[None, ...] 
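+        # Standard slerp weights, reshaped above for broadcasting:
+        # s0 = sin((1 - t) * theta) / sin(theta), s1 = sin(t * theta) / sin(theta)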
+ v2 = s0 * v0 + s1 * v1 + + if t_is_float and v0.ndim > 1: + assert v2.shape[0] == 1 + v2 = np.squeeze(v2, axis=0) + + v2 = torch.from_numpy(v2).to(input_device) + return v2 + + +def tensor2vid(video: torch.Tensor, processor, output_type="np"): + # Based on: + # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 + + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + return outputs + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +@dataclass +class AnimateDiffImg2VideoPipelineOutput(BaseOutput): + frames: Union[torch.Tensor, np.ndarray] + + +class AnimateDiffImg2VideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): + r""" + Pipeline for text-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. + motion_adapter ([`MotionAdapter`]): + A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + motion_adapter: MotionAdapter, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + motion_adapter=motion_adapter, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = ( + image[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + image.shape[2:] + ) + .permute(0, 2, 1, 3, 4) + ) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. 
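For instance, a combination often quoted for Stable Diffusion v1.5 backbones is sketched below (values recalled from the FreeU repository, so verify them there; `pipe` is an instantiated pipeline):

```py
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)  # damp skip features, amplify backbone features
# ... run the pipeline as usual ...
pipe.disable_freeu()  # restore the default UNet behaviour
```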
+ + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + latent_interpolation_method=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if latent_interpolation_method is not None: + if latent_interpolation_method not in ["lerp", "slerp"] and not isinstance( + latent_interpolation_method, FunctionType + ): + raise ValueError( + "`latent_interpolation_method` must be one of `lerp`, `slerp` or a Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]" + ) + + def prepare_latents( + self, + image, + strength, + batch_size, + num_channels_latents, + num_frames, + height, + width, + dtype, + device, + generator, + latents=None, + latent_interpolation_method="lerp", + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + image = image.to(device=device, dtype=dtype) + + if image.shape[1] == 4: + init_latents = image + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list): + if len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + init_latents = self.vae.config.scaling_factor * init_latents + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = latents * self.scheduler.init_noise_sigma + + if latent_interpolation_method == "lerp": + + def latent_cls(v0, v1, index): + return lerp(v0, v1, index / num_frames * (1 - strength)) + elif latent_interpolation_method == "slerp": + + def latent_cls(v0, v1, index): + return slerp(v0, v1, index / num_frames * (1 - strength)) + else: + latent_cls = latent_interpolation_method + + for i in range(num_frames): + latents[:, :, i, :, :] = latent_cls(latents[:, :, i, :, :], init_latents, i) + + return latents + + @torch.no_grad() + def __call__( + self, + image: PipelineImageInput, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: Optional[int] = 16, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.5, + strength: float = 0.8, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + latent_interpolation_method: Union[str, Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]] = "lerp", + ): + r""" + The call function to the pipeline for generation. + + Args: + image (`PipelineImageInput`): + The input image to condition the generation on. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + strength (`float`, *optional*, defaults to 0.8): + Higher strength leads to more differences between original image and generated video. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. 
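Because `check_inputs` accepts any plain function for `latent_interpolation_method` (the validation checks against `FunctionType`), a user-defined interpolator can be passed instead of `"lerp"` or `"slerp"`. A hypothetical example, with the frame count and the blend cap chosen arbitrarily for illustration:

```py
import torch

def clamped_lerp(noise_latents: torch.Tensor, image_latents: torch.Tensor, frame_index: int) -> torch.Tensor:
    # both tensors have shape (batch, channels, latent_height, latent_width) for a single frame
    t = min(frame_index / 16, 0.25)  # assumed 16 frames; never blend in more than 25% of the image latent
    return (1 - t) * noise_latents + t * image_latents

# hypothetical usage: pipe(image=image, prompt=prompt, latent_interpolation_method=clamped_lerp)
```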
+ negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`AnimateDiffImg2VideoPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`AnimateDiffImg2VideoPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`AnimateDiffImg2VideoPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_videos_per_prompt = 1 + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt=prompt, + height=height, + width=width, + callback_steps=callback_steps, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + latent_interpolation_method=latent_interpolation_method, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_videos_per_prompt, output_hidden_state + ) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + image = self.image_processor.preprocess(image) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + image=image, + strength=strength, + batch_size=batch_size * num_videos_per_prompt, + num_channels_latents=num_channels_latents, + num_frames=num_frames, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=latents, + latent_interpolation_method=latent_interpolation_method, + ) + + # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 7 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + return AnimateDiffImg2VideoPipelineOutput(frames=latents) + + # Post-processing + video_tensor = self.decode_latents(latents) + + if output_type == "pt": + video = video_tensor + else: + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return AnimateDiffImg2VideoPipelineOutput(frames=video) From 2bc77c6ac5e3165c02d861f6f994f6513f57acc9 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 27 Dec 2023 07:41:59 +0530 Subject: [PATCH 04/47] update --- .../pipelines/animatediff/pipeline_animatediff_img2video.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py index 687f97ff2fc2..f74ba892db48 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py @@ -678,7 +678,7 @@ def prepare_latents( image = image.to(device=device, dtype=dtype) if image.shape[1] == 4: - init_latents = image + latents = image else: # make sure the VAE is in float32 mode, as it overflows in float16 if self.vae.config.force_upcast: @@ -815,6 +815,9 @@ def __call__( clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + latent_interpolation_method (`str` or `Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]]`, *optional*): + Must be one of "lerp", "slerp" or a callable that takes in a random noisy latent, image latent and a frame index + as input and returns an initial latent for sampling. 
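With the built-in `"lerp"` option, each frame's initial latent is a blend of fresh noise and the encoded input image, with blend factor `index / num_frames * (1 - strength)`. A standalone sketch of that schedule with toy shapes (the VAE encoding and latent scaling steps from `prepare_latents` are omitted):

```py
import torch

num_frames, strength = 16, 0.8

def lerp(v0, v1, t):
    return (1 - t) * v0 + t * v1

noise = torch.randn(1, 4, num_frames, 64, 64)   # fresh noise, one latent per frame
image_latent = torch.randn(1, 4, 64, 64)        # stand-in for the encoded input image

latents = noise.clone()
for i in range(num_frames):
    t = i / num_frames * (1 - strength)          # 0.0 for frame 0, 15/16 * 0.2 ≈ 0.19 for the last frame
    latents[:, :, i] = lerp(noise[:, :, i], image_latent, t)

# a higher strength shrinks the image contribution, so the video drifts further from the input image
```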
Examples: Returns: From d0b38937c45265148d701906d08998e6caf5ed89 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 27 Dec 2023 07:42:14 +0530 Subject: [PATCH 05/47] add vid2vid pipeline --- .../pipeline_animatediff_video2video.py | 903 ++++++++++++++++++ 1 file changed, 903 insertions(+) create mode 100644 src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py new file mode 100644 index 000000000000..11f086565331 --- /dev/null +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -0,0 +1,903 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from types import FunctionType +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...models.unet_motion_model import MotionAdapter +from ...schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler + >>> from diffusers.utils import export_to_gif + + >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") + >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter) + >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False) + >>> output = pipe(prompt="A corgi walking in the park") + >>> frames = output.frames[0] + >>> export_to_gif(frames, "animation.gif") + ``` +""" + + +def tensor2vid(video: torch.Tensor, processor, output_type="np"): + # Based on: + # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 + + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + 
outputs.append(batch_output) + + return outputs + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +@dataclass +class AnimateDiffVideo2VideoPipelineOutput(BaseOutput): + frames: Union[torch.Tensor, np.ndarray] + + +class AnimateDiffVideo2VideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): + r""" + Pipeline for text-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
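A hypothetical invocation of this pipeline, mirroring the text-to-video example in `EXAMPLE_DOC_STRING` above. The checkpoint names are the placeholders used in that docstring, the frame-loading helper is assumed rather than provided by this patch, and the top-level import only exists once the `__init__` wiring added later in this series is in place:

```py
from diffusers import AnimateDiffVideo2VideoPipeline, MotionAdapter, DDIMScheduler
from diffusers.utils import export_to_gif

# placeholder checkpoints, reused from the docstring example above
adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter")
pipe = AnimateDiffVideo2VideoPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter)
pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False)

frames = load_video_frames("input.mp4")  # assumed helper returning a list of PIL images
output = pipe(video=frames, prompt="A corgi walking in the park", strength=0.6)
export_to_gif(output.frames[0], "animation.gif")
```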
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. + motion_adapter ([`MotionAdapter`]): + A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + motion_adapter: MotionAdapter, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + motion_adapter=motion_adapter, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = ( + image[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + image.shape[2:] + ) + .permute(0, 2, 1, 3, 4) + ) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. 
+ + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + latent_interpolation_method=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if latent_interpolation_method is not None: + if latent_interpolation_method not in ["lerp", "slerp"] and not isinstance( + latent_interpolation_method, FunctionType + ): + raise ValueError( + "`latent_interpolation_method` must be one of `lerp`, `slerp` or a Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]" + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + def prepare_latents( + self, + video, + batch_size, + num_channels_latents, + num_frames, + height, + width, + timestep, + dtype, + device, + generator, + latents=None, + ): + # shape = ( + # batch_size, + # num_channels_latents, + # num_frames, + # height // self.vae_scale_factor, + # width // self.vae_scale_factor, + # ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + video = video.to(device=device, dtype=dtype) + + if video.shape[1] == 4: + latents = video + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + video = video.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list): + if len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + init_latents = [ + retrieve_latents(self.vae.encode(video[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(video), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. 
Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype) + latents = self.scheduler.add_noise(init_latents, noise, timestep).unsqueeze(0).permute(0, 2, 1, 3, 4) + + return latents + + @torch.no_grad() + def __call__( + self, + video: List[PipelineImageInput], + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: Optional[int] = 16, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.5, + strength: float = 0.8, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + video (`List[PipelineImageInput]`): + The input video to condition the generation on. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + strength (`float`, *optional*, defaults to 0.8): + Higher strength leads to more differences between original video and generated video. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. 
Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`AnimateDiffImg2VideoPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`AnimateDiffImg2VideoPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`AnimateDiffImg2VideoPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt=prompt, + height=height, + width=width, + callback_steps=callback_steps, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_videos_per_prompt, output_hidden_state + ) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + video = self.image_processor.preprocess(video) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + video=video, + batch_size=batch_size * num_videos_per_prompt, + num_channels_latents=num_channels_latents, + num_frames=num_frames, + height=height, + width=width, + timestep=latent_timestep, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=latents, + ) + + # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 7 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + return AnimateDiffVideo2VideoPipelineOutput(frames=latents) + + # Post-processing + video_tensor = self.decode_latents(latents) + + if output_type == "pt": + video = video_tensor + else: + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return AnimateDiffVideo2VideoPipelineOutput(frames=video) From 466d92a6b4387332ebb25d8f8838972261709de8 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Sun, 31 Dec 2023 11:29:17 +0530 Subject: [PATCH 06/47] update imports --- src/diffusers/pipelines/__init__.py | 8 ++++++-- src/diffusers/pipelines/animatediff/__init__.py | 13 +++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 3bf67dfc1cdc..6c0ce6d526a7 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -109,7 +109,11 @@ ] ) _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"] - _import_structure["animatediff"] = ["AnimateDiffPipeline"] + _import_structure["animatediff"] = [ + "AnimateDiffPipeline", + "AnimateDiffImg2VideoPipeline", + "AnimateDiffVideo2VideoPipeline", + ] _import_structure["audioldm"] = ["AudioLDMPipeline"] _import_structure["audioldm2"] = [ "AudioLDM2Pipeline", @@ -344,7 +348,7 @@ from ..utils.dummy_torch_and_transformers_objects import * else: from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline - from .animatediff import AnimateDiffPipeline + from .animatediff import AnimateDiffImg2VideoPipeline, AnimateDiffPipeline, AnimateDiffVideo2VideoPipeline from .audioldm import AudioLDMPipeline from .audioldm2 import ( AudioLDM2Pipeline, diff --git a/src/diffusers/pipelines/animatediff/__init__.py b/src/diffusers/pipelines/animatediff/__init__.py index 503352fec865..c82569bfe982 100644 --- 
a/src/diffusers/pipelines/animatediff/__init__.py +++ b/src/diffusers/pipelines/animatediff/__init__.py @@ -22,6 +22,14 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline", "AnimateDiffPipelineOutput"] + _import_structure["pipeline_animatediff_img2video"] = [ + "AnimateDiffImg2VideoPipeline", + "AnimateDiffImg2VideoPipelineOutput", + ] + _import_structure["pipeline_animatediff_video2video"] = [ + "AnimateDiffVideo2VideoPipeline", + "AnimateDiffVideo2VideoPipelineOutput", + ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -32,6 +40,11 @@ else: from .pipeline_animatediff import AnimateDiffPipeline, AnimateDiffPipelineOutput + from .pipeline_animatediff_img2video import AnimateDiffImg2VideoPipeline, AnimateDiffImg2VideoPipelineOutput + from .pipeline_animatediff_video2video import ( + AnimateDiffVideo2VideoPipeline, + AnimateDiffVideo2VideoPipelineOutput, + ) else: import sys From fc815c87ecd80b419930ba700676615219fedaed Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Sun, 31 Dec 2023 11:39:56 +0530 Subject: [PATCH 07/47] update --- .../pipeline_animatediff_img2video.py | 11 ++-- .../pipeline_animatediff_video2video.py | 51 +++++++------------ 2 files changed, 25 insertions(+), 37 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py index f74ba892db48..a27bb9207574 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py @@ -731,7 +731,7 @@ def __call__( prompt: Optional[Union[str, List[str]]] = None, height: Optional[int] = None, width: Optional[int] = None, - num_frames: Optional[int] = 16, + num_frames: int = 16, num_inference_steps: int = 50, timesteps: Optional[List[int]] = None, guidance_scale: float = 7.5, @@ -912,10 +912,11 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7 Add image embeds for IP-Adapter + + # 7. Add image embeds for IP-Adapter added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - # Denoising loop + # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): @@ -949,7 +950,7 @@ def __call__( if output_type == "latent": return AnimateDiffImg2VideoPipelineOutput(frames=latents) - # Post-processing + # 9. Post-processing video_tensor = self.decode_latents(latents) if output_type == "pt": @@ -957,7 +958,7 @@ def __call__( else: video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) - # Offload all models + # 10. 
Offload all models self.maybe_free_model_hooks() if not return_dict: diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 11f086565331..5f49450c9f25 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -517,6 +517,7 @@ def prepare_extra_step_kwargs(self, generator, eta): def check_inputs( self, prompt, + strength, height, width, callback_steps, @@ -526,6 +527,9 @@ def check_inputs( callback_on_step_end_tensor_inputs=None, latent_interpolation_method=None, ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") @@ -588,23 +592,12 @@ def prepare_latents( self, video, batch_size, - num_channels_latents, - num_frames, - height, - width, timestep, dtype, device, generator, latents=None, ): - # shape = ( - # batch_size, - # num_channels_latents, - # num_frames, - # height // self.vae_scale_factor, - # width // self.vae_scale_factor, - # ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -672,7 +665,6 @@ def __call__( prompt: Optional[Union[str, List[str]]] = None, height: Optional[int] = None, width: Optional[int] = None, - num_frames: Optional[int] = 16, num_inference_steps: int = 50, timesteps: Optional[List[int]] = None, guidance_scale: float = 7.5, @@ -697,16 +689,13 @@ def __call__( Args: video (`List[PipelineImageInput]`): - The input video to condition the generation on. + The input video to condition the generation on. Must be a list of images/frames of the video. prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated video. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated video. - num_frames (`int`, *optional*, defaults to 16): - The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds - amounts to 2 seconds of video. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality videos at the expense of slower inference. @@ -741,7 +730,7 @@ def __call__( The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`AnimateDiffImg2VideoPipelineOutput`] instead + Whether or not to return a [`AnimateDiffVideo2VideoPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. 
The function is called with the @@ -758,8 +747,8 @@ def __call__( Examples: Returns: - [`AnimateDiffImg2VideoPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`AnimateDiffImg2VideoPipelineOutput`] is + [`AnimateDiffVideo2VideoPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`AnimateDiffVideo2VideoPipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. """ # 0. Default height and width to unet @@ -771,6 +760,7 @@ def __call__( # 1. Check inputs. Raise error if not correct self.check_inputs( prompt=prompt, + strength=strength, height=height, width=width, callback_steps=callback_steps, @@ -824,22 +814,18 @@ def __call__( if do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) - video = self.image_processor.preprocess(video) + # 4. Preprocess video + video = self.image_processor.preprocess(video, height=height, width=width) - # 4. Prepare timesteps + # 5. Prepare timesteps timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt) - # 5. Prepare latent variables - num_channels_latents = self.unet.config.in_channels + # 6. Prepare latent variables latents = self.prepare_latents( video=video, batch_size=batch_size * num_videos_per_prompt, - num_channels_latents=num_channels_latents, - num_frames=num_frames, - height=height, - width=width, timestep=latent_timestep, dtype=prompt_embeds.dtype, device=device, @@ -847,12 +833,13 @@ def __call__( latents=latents, ) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7 Add image embeds for IP-Adapter + + # 8. Add image embeds for IP-Adapter added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - # Denoising loop + # 9. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): @@ -886,7 +873,7 @@ def __call__( if output_type == "latent": return AnimateDiffVideo2VideoPipelineOutput(frames=latents) - # Post-processing + # 10. Post-processing video_tensor = self.decode_latents(latents) if output_type == "pt": @@ -894,7 +881,7 @@ def __call__( else: video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) - # Offload all models + # 11. 
Offload all models self.maybe_free_model_hooks() if not return_dict: From 315daad5772415af5b2564a1de15c9af619cf0e8 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Sun, 31 Dec 2023 11:41:57 +0530 Subject: [PATCH 08/47] remove copied from line for check_inputs --- .../pipelines/animatediff/pipeline_animatediff_img2video.py | 1 - .../pipelines/animatediff/pipeline_animatediff_video2video.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py index a27bb9207574..97650a135b79 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py @@ -585,7 +585,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 5f49450c9f25..785594cdeeac 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -513,7 +513,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( self, prompt, From cc55f3d649584785513ee7246ce12813893a60c4 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Sun, 31 Dec 2023 13:44:02 +0530 Subject: [PATCH 09/47] update --- .../pipeline_animatediff_img2video.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py index 97650a135b79..e1c5967e6621 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py @@ -887,12 +887,13 @@ def __call__( if do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) - image = self.image_processor.preprocess(image) + # 4. Preprocess image + image = self.image_processor.preprocess(image, height=height, width=width) - # 4. Prepare timesteps + # 5. Prepare timesteps timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) - # 5. Prepare latent variables + # 6. Prepare latent variables num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( image=image, @@ -909,13 +910,13 @@ def __call__( latent_interpolation_method=latent_interpolation_method, ) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7. Add image embeds for IP-Adapter + # 8. Add image embeds for IP-Adapter added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - # 8. Denoising loop + # 9. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): @@ -949,7 +950,7 @@ def __call__( if output_type == "latent": return AnimateDiffImg2VideoPipelineOutput(frames=latents) - # 9. Post-processing + # 10. Post-processing video_tensor = self.decode_latents(latents) if output_type == "pt": @@ -957,7 +958,7 @@ def __call__( else: video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) - # 10. Offload all models + # 11. Offload all models self.maybe_free_model_hooks() if not return_dict: From d7a85be76b5410ec908b56afca8ef8e06919baf2 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Sun, 31 Dec 2023 13:52:10 +0530 Subject: [PATCH 10/47] update examples --- .../pipeline_animatediff_img2video.py | 12 ++++++----- .../pipeline_animatediff_video2video.py | 21 ++++++++++++++----- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py index e1c5967e6621..893192074d95 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py @@ -45,13 +45,15 @@ Examples: ```py >>> import torch - >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler - >>> from diffusers.utils import export_to_gif + >>> from diffusers import MotionAdapter, AnimateDiffImg2VideoPipeline, DDIMScheduler + >>> from diffusers.utils import export_to_gif, load_image >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") - >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter) - >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False) - >>> output = pipe(prompt="A corgi walking in the park") + >>> pipe = AnimateDiffImg2VideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") + >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") + + >>> img = load_image("snail.png") + >>> output = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, latent_interpolation_method="slerp") >>> frames = output.frames[0] >>> export_to_gif(frames, "animation.gif") ``` diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 785594cdeeac..83773386d93f 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -44,14 +44,25 @@ EXAMPLE_DOC_STRING = """ Examples: ```py + >>> import imageio >>> import torch - >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler - >>> from diffusers.utils import export_to_gif + >>> from diffusers import MotionAdapter, AnimateDiffVideo2VideoPipeline, DDIMScheduler + >>> from diffusers.utils import export_to_gif, load_image >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") - >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter) - >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False) - >>> output = pipe(prompt="A corgi walking in the park") + >>> 
pipe = AnimateDiffVideo2VideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") + >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") + + >>> def load_video(file_path): + >>> images = [] + >>> vid = imageio.get_reader(file_path) + >>> for i, frame in enumerate(vid): + >>> pil_image = Image.fromarray(frame) + >>> images.append(pil_image) + >>> return images + + >>> video = load_image("animation_fireworks.png") + >>> output = pipe(video=video, prompt="Closeup of a woman, fireworks in the background", strength=0.7) >>> frames = output.frames[0] >>> export_to_gif(frames, "animation.gif") ``` From a831a5e1f9418c089ff1056b510d3415e41d6aab Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 3 Jan 2024 16:08:01 +0530 Subject: [PATCH 11/47] add multi-batch support --- .../pipeline_animatediff_video2video.py | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 83773386d93f..bc4e154c3c38 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -531,6 +531,8 @@ def check_inputs( height, width, callback_steps, + video=None, + latents=None, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, @@ -589,6 +591,9 @@ def check_inputs( "`latent_interpolation_method` must be one of `lerp`, `slerp` or a Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]" ) + if video is not None and latents is not None: + raise ValueError("Only one of `video` or `latents` should be provided") + def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep init_timestep = min(int(num_inference_steps * strength), num_inference_steps) @@ -608,17 +613,16 @@ def prepare_latents( generator, latents=None, ): + print(batch_size) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" f" size of {batch_size}. Make sure the batch size matches the length of the generators." ) - video = video.to(device=device, dtype=dtype) + if latents is None: + video = video.to(device=device, dtype=dtype) - if video.shape[1] == 4: - latents = video - else: # make sure the VAE is in float32 mode, as it overflows in float16 if self.vae.config.force_upcast: video = video.float() @@ -632,13 +636,15 @@ def prepare_latents( ) init_latents = [ - retrieve_latents(self.vae.encode(video[i : i + 1]), generator=generator[i]) - for i in range(batch_size) + retrieve_latents(self.vae.encode(video[i]), generator=generator[i]) for i in range(batch_size) ] - init_latents = torch.cat(init_latents, dim=0) + init_latents = torch.stack(init_latents) else: - init_latents = retrieve_latents(self.vae.encode(video), generator=generator) + init_latents = torch.stack( + [retrieve_latents(self.vae.encode(vid), generator=generator) for vid in video] + ) + # restore vae to original dtype if self.vae.config.force_upcast: self.vae.to(dtype) @@ -653,9 +659,9 @@ def prepare_latents( " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" " your script to pass as many initial images as text prompts to suppress this warning." 
) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + deprecate("len(prompt) != len(video)", "1.0.0", deprecation_message, standard_warn=False) + additional_video_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_video_per_prompt, dim=0) elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." @@ -664,14 +670,19 @@ def prepare_latents( init_latents = torch.cat([init_latents], dim=0) noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype) - latents = self.scheduler.add_noise(init_latents, noise, timestep).unsqueeze(0).permute(0, 2, 1, 3, 4) + latents = self.scheduler.add_noise(init_latents, noise, timestep).permute(0, 2, 1, 3, 4) + else: + if latents.dim() != 5: + # [B, F, C, H, W] + raise ValueError(f"`latents` expected to have dim=5, but found {latents.dim()=}") + latents = latents.to(device, dtype=dtype) return latents @torch.no_grad() def __call__( self, - video: List[PipelineImageInput], + video: List[List[PipelineImageInput]] = None, prompt: Optional[Union[str, List[str]]] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -777,6 +788,8 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + video=video, + latents=latents, ) # 2. Define call parameters @@ -825,7 +838,10 @@ def __call__( image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. Preprocess video - video = self.image_processor.preprocess(video, height=height, width=width) + if latents is None: + if not isinstance(video[0], list): + video = [video] + video = torch.stack([self.image_processor.preprocess(vid, height=height, width=width) for vid in video]) # 5. 
Prepare timesteps timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) From b5b5a3aad5c151227ac5d074fd719abb56050aa4 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 3 Jan 2024 16:17:09 +0530 Subject: [PATCH 12/47] fix __init__.py files --- src/diffusers/__init__.py | 2 ++ src/diffusers/pipelines/__init__.py | 5 ++--- .../pipelines/animatediff/__init__.py | 13 ++++--------- .../pipeline_animatediff_video2video.py | 18 +++++++++--------- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 180b210953c1..5b5f7bb7cb43 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -207,6 +207,7 @@ "AmusedInpaintPipeline", "AmusedPipeline", "AnimateDiffPipeline", + "AnimateDiffVideoToVideoPipeline" "AudioLDM2Pipeline", "AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel", @@ -567,6 +568,7 @@ AmusedInpaintPipeline, AmusedPipeline, AnimateDiffPipeline, + AnimateDiffVideoToVideoPipeline, AudioLDM2Pipeline, AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 9f0eca17401c..60b83914edf5 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -111,8 +111,7 @@ _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"] _import_structure["animatediff"] = [ "AnimateDiffPipeline", - "AnimateDiffImg2VideoPipeline", - "AnimateDiffVideo2VideoPipeline", + "AnimateDiffVideoToVideoPipeline", ] _import_structure["audioldm"] = ["AudioLDMPipeline"] _import_structure["audioldm2"] = [ @@ -342,7 +341,7 @@ from ..utils.dummy_torch_and_transformers_objects import * else: from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline - from .animatediff import AnimateDiffImg2VideoPipeline, AnimateDiffPipeline, AnimateDiffVideo2VideoPipeline + from .animatediff import AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline from .audioldm import AudioLDMPipeline from .audioldm2 import ( AudioLDM2Pipeline, diff --git a/src/diffusers/pipelines/animatediff/__init__.py b/src/diffusers/pipelines/animatediff/__init__.py index c82569bfe982..2b4493d9b908 100644 --- a/src/diffusers/pipelines/animatediff/__init__.py +++ b/src/diffusers/pipelines/animatediff/__init__.py @@ -22,13 +22,9 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline", "AnimateDiffPipelineOutput"] - _import_structure["pipeline_animatediff_img2video"] = [ - "AnimateDiffImg2VideoPipeline", - "AnimateDiffImg2VideoPipelineOutput", - ] _import_structure["pipeline_animatediff_video2video"] = [ - "AnimateDiffVideo2VideoPipeline", - "AnimateDiffVideo2VideoPipelineOutput", + "AnimateDiffVideoToVideoPipeline", + "AnimateDiffVideoToVideoPipelineOutput", ] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -40,10 +36,9 @@ else: from .pipeline_animatediff import AnimateDiffPipeline, AnimateDiffPipelineOutput - from .pipeline_animatediff_img2video import AnimateDiffImg2VideoPipeline, AnimateDiffImg2VideoPipelineOutput from .pipeline_animatediff_video2video import ( - AnimateDiffVideo2VideoPipeline, - AnimateDiffVideo2VideoPipelineOutput, + AnimateDiffVideoToVideoPipeline, + AnimateDiffVideoToVideoPipelineOutput, ) else: diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py 
b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index bc4e154c3c38..fdcf02e98abf 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -46,11 +46,11 @@ ```py >>> import imageio >>> import torch - >>> from diffusers import MotionAdapter, AnimateDiffVideo2VideoPipeline, DDIMScheduler + >>> from diffusers import MotionAdapter, AnimateDiffVideoToVideoPipeline, DDIMScheduler >>> from diffusers.utils import export_to_gif, load_image >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") - >>> pipe = AnimateDiffVideo2VideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") + >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") >>> def load_video(file_path): @@ -144,11 +144,11 @@ def retrieve_timesteps( @dataclass -class AnimateDiffVideo2VideoPipelineOutput(BaseOutput): +class AnimateDiffVideoToVideoPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray] -class AnimateDiffVideo2VideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): +class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. @@ -751,7 +751,7 @@ def __call__( The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`AnimateDiffVideo2VideoPipelineOutput`] instead + Whether or not to return a [`AnimateDiffVideoToVideoPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the @@ -768,8 +768,8 @@ def __call__( Examples: Returns: - [`AnimateDiffVideo2VideoPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`AnimateDiffVideo2VideoPipelineOutput`] is + [`AnimateDiffVideoToVideoPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`AnimateDiffVideoToVideoPipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. """ # 0. Default height and width to unet @@ -897,7 +897,7 @@ def __call__( callback(i, t, latents) if output_type == "latent": - return AnimateDiffVideo2VideoPipelineOutput(frames=latents) + return AnimateDiffVideoToVideoPipelineOutput(frames=latents) # 10. 
Post-processing video_tensor = self.decode_latents(latents) @@ -913,4 +913,4 @@ def __call__( if not return_dict: return (video,) - return AnimateDiffVideo2VideoPipelineOutput(frames=video) + return AnimateDiffVideoToVideoPipelineOutput(frames=video) From 8bb0855a77dfeefd0d2913e5cc7e8b87accae0f7 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 3 Jan 2024 16:17:35 +0530 Subject: [PATCH 13/47] move img2vid to community --- .../pipeline_animatediff_img2video.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) rename {src/diffusers/pipelines/animatediff => examples/community}/pipeline_animatediff_img2video.py (96%) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py similarity index 96% rename from src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py rename to examples/community/pipeline_animatediff_img2video.py index 893192074d95..51cbf8013808 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -21,12 +21,12 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection -from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel -from ...models.lora import adjust_lora_scale_text_encoder -from ...models.unet_motion_model import MotionAdapter -from ...schedulers import ( +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.models.unet_motion_model import MotionAdapter +from diffusers.schedulers import ( DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, @@ -34,9 +34,9 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers -from ...utils.torch_utils import randn_tensor -from ..pipeline_utils import DiffusionPipeline +from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers +from diffusers.utils.torch_utils import randn_tensor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -45,11 +45,11 @@ Examples: ```py >>> import torch - >>> from diffusers import MotionAdapter, AnimateDiffImg2VideoPipeline, DDIMScheduler + >>> from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler >>> from diffusers.utils import export_to_gif, load_image >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") - >>> pipe = AnimateDiffImg2VideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") + >>> pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda") >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") >>> img = load_image("snail.png") @@ -207,11 +207,11 @@ def 
retrieve_timesteps( @dataclass -class AnimateDiffImg2VideoPipelineOutput(BaseOutput): +class AnimateDiffImgToVideoPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray] -class AnimateDiffImg2VideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): +class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. @@ -802,7 +802,7 @@ def __call__( The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`AnimateDiffImg2VideoPipelineOutput`] instead + Whether or not to return a [`AnimateDiffImgToVideoPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the @@ -822,8 +822,8 @@ def __call__( Examples: Returns: - [`AnimateDiffImg2VideoPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`AnimateDiffImg2VideoPipelineOutput`] is + [`AnimateDiffImgToVideoPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`AnimateDiffImgToVideoPipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. """ # 0. Default height and width to unet @@ -950,7 +950,7 @@ def __call__( callback(i, t, latents) if output_type == "latent": - return AnimateDiffImg2VideoPipelineOutput(frames=latents) + return AnimateDiffImgToVideoPipelineOutput(frames=latents) # 10. Post-processing video_tensor = self.decode_latents(latents) @@ -966,4 +966,4 @@ def __call__( if not return_dict: return (video,) - return AnimateDiffImg2VideoPipelineOutput(frames=video) + return AnimateDiffImgToVideoPipelineOutput(frames=video) From 26d3145bcc0d79745a9b85737482d12bf1827151 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 3 Jan 2024 16:29:32 +0530 Subject: [PATCH 14/47] update community readme and examples --- examples/community/README.md | 29 ++++++++++++++++++- .../pipeline_animatediff_img2video.py | 2 +- .../pipeline_animatediff_video2video.py | 2 +- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index c3aa1ecf3d64..2bba16881732 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -53,6 +53,7 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | Regional Prompting Pipeline | Assign multiple prompts for different regions | [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) | | LDM3D-sr (LDM3D upscaler) | Upscale low resolution RGB and depth inputs to high resolution | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline) | - | [Estelle Aflalo](https://github.com/estelleafl) | | AnimateDiff ControlNet Pipeline | Combines AnimateDiff with precise motion control using ControlNets | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) | +| AnimateDiff Image-To-Video Pipeline | 
Image-To-Video experimental support for AnimateDiff (open to improvements) | [AnimateDiff Image To Video Pipeline](#animatediff-image-to-video-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/1TvzCDPHhfFtdcJZe4RLloAwyoLKuttWK/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | | DemoFusion Pipeline | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973) | [DemoFusion Pipeline](#DemoFusion) | - | [Ruoyi Du](https://github.com/RuoyiDu) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. @@ -2942,7 +2943,7 @@ pipe = DiffusionPipeline.from_pretrained( custom_pipeline="pipeline_animatediff_controlnet", ).to(device="cuda", dtype=torch.float16) pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( - model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 + model_id, subfolder="scheduler", beta_schedule="linear", clip_sample=False, timestep_spacing="linspace", steps_offset=1 ) pipe.enable_vae_slicing() @@ -2981,7 +2982,33 @@ export_to_gif(result.frames[0], "result.gif") gif-2 + +### AnimateDiff Image-To-Video Pipeline + +This pipeline adds experimental support for the image-to-video task using AnimateDiff. Refer to [this](https://github.com/huggingface/diffusers/pull/6328) PR for more examples and results. + +```py +import torch +from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler +from diffusers.utils import export_to_gif, load_image + +adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") +pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda") +pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") + +image = load_image("snail.png") +output = pipe( + image=image, + prompt="A snail moving on the ground", + strength=0.8, + latent_interpolation_method="slerp", # can be lerp, slerp, or your own callback +) +frames = output.frames[0] +export_to_gif(frames, "animation.gif") +``` + ### DemoFusion + This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973). The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion). 
- `view_batch_size` (`int`, defaults to 16): diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py index 51cbf8013808..412aff344499 100644 --- a/examples/community/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -52,7 +52,7 @@ >>> pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda") >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") - >>> img = load_image("snail.png") + >>> image = load_image("snail.png") >>> output = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, latent_interpolation_method="slerp") >>> frames = output.frames[0] >>> export_to_gif(frames, "animation.gif") diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index fdcf02e98abf..63d2d9d63484 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -61,7 +61,7 @@ >>> images.append(pil_image) >>> return images - >>> video = load_image("animation_fireworks.png") + >>> video = load_video("animation_fireworks.gif") >>> output = pipe(video=video, prompt="Closeup of a woman, fireworks in the background", strength=0.7) >>> frames = output.frames[0] >>> export_to_gif(frames, "animation.gif") From 3196a795e966d6a4d5180ec1d4aa234aacb7498b Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 3 Jan 2024 16:41:29 +0530 Subject: [PATCH 15/47] fix --- examples/community/pipeline_animatediff_img2video.py | 2 +- src/diffusers/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py index 412aff344499..2e1d33df94e5 100644 --- a/examples/community/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -26,6 +26,7 @@ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.models.unet_motion_model import MotionAdapter +from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.schedulers import ( DDIMScheduler, DPMSolverMultistepScheduler, @@ -36,7 +37,6 @@ ) from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers from diffusers.utils.torch_utils import randn_tensor -from diffusers.pipelines.pipeline_utils import DiffusionPipeline logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 5b5f7bb7cb43..83e4f7ee8fd6 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -207,7 +207,7 @@ "AmusedInpaintPipeline", "AmusedPipeline", "AnimateDiffPipeline", - "AnimateDiffVideoToVideoPipeline" + "AnimateDiffVideoToVideoPipeline", "AudioLDM2Pipeline", "AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel", From 5a4f2eeb8688c395828edc93fdc18d517e75d356 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 3 Jan 2024 19:48:13 +0530 Subject: [PATCH 16/47] make fix-copies --- .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 1 file changed, 15 
insertions(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 2eb9599658d9..e0d5c77d0e8c 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -92,6 +92,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class AudioLDM2Pipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 7fad71ae6526d4ec81a16dfb384d68761ea6799b Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 10:29:15 +0530 Subject: [PATCH 17/47] add vid2vid batch params --- tests/pipelines/pipeline_params.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/pipelines/pipeline_params.py b/tests/pipelines/pipeline_params.py index f5be787656c7..4e2c4dcdd9cb 100644 --- a/tests/pipelines/pipeline_params.py +++ b/tests/pipelines/pipeline_params.py @@ -125,3 +125,5 @@ TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"]) TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS = frozenset(["prompt_embeds"]) + +VIDEO_TO_VIDEO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt", "video"]) From 71e8770da3a6b29b0a025c30f7875456d6f91e6e Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 10:30:41 +0530 Subject: [PATCH 18/47] apply suggestions from review Co-Authored-By: Dhruv Nair --- .../pipeline_animatediff_video2video.py | 53 ++++++++++++------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 63d2d9d63484..e1e26430285e 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -606,6 +606,9 @@ def get_timesteps(self, num_inference_steps, strength, device): def prepare_latents( self, video, + height, + width, + num_channels_latents, batch_size, timestep, dtype, @@ -613,7 +616,25 @@ def prepare_latents( generator, latents=None, ): - print(batch_size) + # video must be a list of list of images + # the outer list denotes having multiple videos as input, whereas inner list means the frames of the video as images + if not isinstance(video[0], list): + video = [video] + if latents is None: + video = torch.stack([self.image_processor.preprocess(vid, height=height, width=width) for vid in video]) + video = video.to(device=device, dtype=dtype) + num_frames = video.shape[1] + else: + num_frames = latents.shape[2] + + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -621,8 +642,6 @@ def prepare_latents( ) if latents is None: - video = video.to(device=device, dtype=dtype) - # make sure the VAE is in float32 mode, as it overflows 
in float16 if self.vae.config.force_upcast: video = video.float() @@ -672,9 +691,9 @@ def prepare_latents( noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype) latents = self.scheduler.add_noise(init_latents, noise, timestep).permute(0, 2, 1, 3, 4) else: - if latents.dim() != 5: + if shape != latents.shape: # [B, F, C, H, W] - raise ValueError(f"`latents` expected to have dim=5, but found {latents.dim()=}") + raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}") latents = latents.to(device, dtype=dtype) return latents @@ -837,20 +856,18 @@ def __call__( if do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) - # 4. Preprocess video - if latents is None: - if not isinstance(video[0], list): - video = [video] - video = torch.stack([self.image_processor.preprocess(vid, height=height, width=width) for vid in video]) - - # 5. Prepare timesteps + # 4. Prepare timesteps timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt) - # 6. Prepare latent variables + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( video=video, + height=height, + width=width, + num_channels_latents=num_channels_latents, batch_size=batch_size * num_videos_per_prompt, timestep=latent_timestep, dtype=prompt_embeds.dtype, @@ -859,13 +876,13 @@ def __call__( latents=latents, ) - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 8. Add image embeds for IP-Adapter + # 7. Add image embeds for IP-Adapter added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - # 9. Denoising loop + # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): @@ -899,7 +916,7 @@ def __call__( if output_type == "latent": return AnimateDiffVideoToVideoPipelineOutput(frames=latents) - # 10. Post-processing + # 9. Post-processing video_tensor = self.decode_latents(latents) if output_type == "pt": @@ -907,7 +924,7 @@ def __call__( else: video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) - # 11. Offload all models + # 10. 
Offload all models self.maybe_free_model_hooks() if not return_dict: From 068e9d7000a6aebc14caf076a8afa9d18dd312a8 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 10:31:41 +0530 Subject: [PATCH 19/47] add test for animatediff vid2vid --- .../test_animatediff_video2video.py | 341 ++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 tests/pipelines/animatediff/test_animatediff_video2video.py diff --git a/tests/pipelines/animatediff/test_animatediff_video2video.py b/tests/pipelines/animatediff/test_animatediff_video2video.py new file mode 100644 index 000000000000..090f0fedd93a --- /dev/null +++ b/tests/pipelines/animatediff/test_animatediff_video2video.py @@ -0,0 +1,341 @@ +import gc +import unittest + +import numpy as np +import torch +from PIL import Image +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +import diffusers +from diffusers import ( + AnimateDiffVideoToVideoPipeline, + AutoencoderKL, + DDIMScheduler, + MotionAdapter, + UNet2DConditionModel, + UNetMotionModel, +) +from diffusers.utils import is_xformers_available, logging +from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device + +from ..pipeline_params import TEXT_TO_IMAGE_PARAMS, VIDEO_TO_VIDEO_BATCH_PARAMS +from ..test_pipelines_common import PipelineTesterMixin + + +def to_np(tensor): + if isinstance(tensor, torch.Tensor): + tensor = tensor.detach().cpu().numpy() + + return tensor + + +class AnimateDiffVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = AnimateDiffVideoToVideoPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = VIDEO_TO_VIDEO_BATCH_PARAMS + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback", + "callback_steps", + ] + ) + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + norm_num_groups=2, + ) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="linear", + clip_sample=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + motion_adapter = MotionAdapter( + block_out_channels=(32, 64), + motion_layers_per_block=2, + motion_norm_num_groups=2, + motion_num_attention_heads=4, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "motion_adapter": motion_adapter, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "feature_extractor": None, + "image_encoder": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = 
torch.Generator(device=device).manual_seed(seed) + + video_height = 32 + video_width = 32 + video_num_frames = 2 + video = [Image.new("RGB", (video_width, video_height))] * video_num_frames + + inputs = { + "video": video, + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 7.5, + "output_type": "pt", + } + return inputs + + def test_motion_unet_loading(self): + components = self.get_dummy_components() + pipe = AnimateDiffVideoToVideoPipeline(**components) + + assert isinstance(pipe.unet, UNetMotionModel) + + @unittest.skip("Attention slicing is not enabled in this pipeline") + def test_attention_slicing_forward_pass(self): + pass + + def test_inference_batch_single_identical( + self, + batch_size=2, + expected_max_diff=1e-4, + additional_params_copy_to_batched_inputs=["num_inference_steps"], + ): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for components in pipe.components.values(): + if hasattr(components, "set_default_attn_processor"): + components.set_default_attn_processor() + + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + inputs = self.get_dummy_inputs(torch_device) + # Reset generator in case it is has been used in self.get_dummy_inputs + inputs["generator"] = self.get_generator(0) + + logger = logging.get_logger(pipe.__module__) + logger.setLevel(level=diffusers.logging.FATAL) + + # batchify inputs + batched_inputs = {} + batched_inputs.update(inputs) + + for name in self.batch_params: + if name not in inputs: + continue + + value = inputs[name] + if name == "prompt": + len_prompt = len(value) + batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] + batched_inputs[name][-1] = 100 * "very long" + + else: + batched_inputs[name] = batch_size * [value] + + if "generator" in inputs: + batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)] + + if "batch_size" in inputs: + batched_inputs["batch_size"] = batch_size + + for arg in additional_params_copy_to_batched_inputs: + batched_inputs[arg] = inputs[arg] + + output = pipe(**inputs) + output_batch = pipe(**batched_inputs) + + assert output_batch[0].shape[0] == batch_size + + max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max() + assert max_diff < expected_max_diff + + @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices") + def test_to_device(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + + pipe.to("cpu") + # pipeline creates a new motion UNet under the hood. 
So we need to check the device from pipe.components + model_devices = [ + component.device.type for component in pipe.components.values() if hasattr(component, "device") + ] + self.assertTrue(all(device == "cpu" for device in model_devices)) + + output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0] + self.assertTrue(np.isnan(output_cpu).sum() == 0) + + pipe.to("cuda") + model_devices = [ + component.device.type for component in pipe.components.values() if hasattr(component, "device") + ] + self.assertTrue(all(device == "cuda" for device in model_devices)) + + output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0] + self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0) + + def test_to_dtype(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + + # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components + model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] + self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) + + pipe.to(torch_dtype=torch.float16) + model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] + self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) + + def test_prompt_embeds(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs = self.get_dummy_inputs(torch_device) + inputs.pop("prompt") + inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device) + pipe(**inputs) + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + output_without_offload = pipe(**inputs).frames[0] + output_without_offload = ( + output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload + ) + + pipe.enable_xformers_memory_efficient_attention() + inputs = self.get_dummy_inputs(torch_device) + output_with_offload = pipe(**inputs).frames[0] + output_with_offload = ( + output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload + ) + + max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() + self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results") + + +@slow +@require_torch_gpu +class AnimateDiffVideoToVideoPipelineSlowTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_animatediff_vid2vid(self): + adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") + pipe = AnimateDiffVideoToVideoPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter) + pipe = pipe.to(torch_device) + pipe.scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="linear", + steps_offset=1, + clip_sample=False, + ) + 
pipe.enable_vae_slicing() + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + prompt = "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" + negative_prompt = "bad quality, worse quality" + + def load_video(file_path): + import imageio + + images = [] + vid = imageio.get_reader(file_path) + for frame in vid: + pil_image = Image.fromarray(frame) + images.append(pil_image) + return images + + video = load_video("/path/to/huggingface/video.gif") + + generator = torch.Generator("cpu").manual_seed(0) + output = pipe( + video=video, + prompt=prompt, + negative_prompt=negative_prompt, + num_frames=16, + generator=generator, + guidance_scale=7.5, + num_inference_steps=3, + output_type="np", + ) + + image = output.frames[0] + assert image.shape == (16, 512, 512, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array( + [ + 0.11357737, + 0.11285847, + 0.11180121, + 0.11084166, + 0.11414117, + 0.09785956, + 0.10742754, + 0.10510018, + 0.08045256, + ] + ) + assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3 From da4c308a3dfe2c596f1721179f7b5255d2736022 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 10:46:55 +0530 Subject: [PATCH 20/47] torch.stack -> torch.cat Co-Authored-By: Dhruv Nair --- .../pipeline_animatediff_video2video.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index e1e26430285e..e5f8dd6864d1 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -617,11 +617,12 @@ def prepare_latents( latents=None, ): # video must be a list of list of images - # the outer list denotes having multiple videos as input, whereas inner list means the frames of the video as images + # the outer list denotes having multiple videos as input, whereas inner list means the frames of the video + # as a list of images if not isinstance(video[0], list): video = [video] if latents is None: - video = torch.stack([self.image_processor.preprocess(vid, height=height, width=width) for vid in video]) + video = torch.cat([self.image_processor.preprocess(vid, height=height, width=width).unsqueeze(0) for vid in video], dim=0) video = video.to(device=device, dtype=dtype) num_frames = video.shape[1] else: @@ -654,14 +655,11 @@ def prepare_latents( f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
) - init_latents = [ - retrieve_latents(self.vae.encode(video[i]), generator=generator[i]) for i in range(batch_size) - ] - init_latents = torch.stack(init_latents) + init_latents = [retrieve_latents(self.vae.encode(video[i]), generator=generator[i]).unsqueeze(0) for i in range(batch_size)] else: - init_latents = torch.stack( - [retrieve_latents(self.vae.encode(vid), generator=generator) for vid in video] - ) + init_latents = [retrieve_latents(self.vae.encode(vid), generator=generator).unsqueeze(0) for vid in video] + + init_latents = torch.cat(init_latents, dim=0) # restore vae to original dtype if self.vae.config.force_upcast: From be2bb217b2856f983544025c72d34f723e8b50fd Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 10:47:21 +0530 Subject: [PATCH 21/47] make style --- .../pipeline_animatediff_video2video.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index e5f8dd6864d1..f3ff4c2cd34e 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -622,7 +622,9 @@ def prepare_latents( if not isinstance(video[0], list): video = [video] if latents is None: - video = torch.cat([self.image_processor.preprocess(vid, height=height, width=width).unsqueeze(0) for vid in video], dim=0) + video = torch.cat( + [self.image_processor.preprocess(vid, height=height, width=width).unsqueeze(0) for vid in video], dim=0 + ) video = video.to(device=device, dtype=dtype) num_frames = video.shape[1] else: @@ -655,10 +657,15 @@ def prepare_latents( f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
) - init_latents = [retrieve_latents(self.vae.encode(video[i]), generator=generator[i]).unsqueeze(0) for i in range(batch_size)] + init_latents = [ + retrieve_latents(self.vae.encode(video[i]), generator=generator[i]).unsqueeze(0) + for i in range(batch_size) + ] else: - init_latents = [retrieve_latents(self.vae.encode(vid), generator=generator).unsqueeze(0) for vid in video] - + init_latents = [ + retrieve_latents(self.vae.encode(vid), generator=generator).unsqueeze(0) for vid in video + ] + init_latents = torch.cat(init_latents, dim=0) # restore vae to original dtype From 43b44100d70a7b2bf55fe9332596ea03e676e1c7 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 11:11:40 +0530 Subject: [PATCH 22/47] docs for vid2vid --- docs/source/en/api/pipelines/animatediff.md | 73 +++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index fb38687e882e..79d1d4d66b4d 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -25,6 +25,7 @@ The abstract of the paper is the following: | Pipeline | Tasks | Demo |---|---|:---:| | [AnimateDiffPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff.py) | *Text-to-Video Generation with AnimateDiff* | +| [AnimateDiffVideoToVideoPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py) | *Video-to-Video Generation with AnimateDiff* | ## Available checkpoints @@ -32,6 +33,8 @@ Motion Adapter checkpoints can be found under [guoyww](https://huggingface.co/gu ## Usage example +### AnimateDiffPipeline + AnimateDiff works with a MotionAdapter checkpoint and a Stable Diffusion model checkpoint. The MotionAdapter is a collection of Motion Modules that are responsible for adding coherent motion across image frames. These modules are applied after the Resnet and Attention blocks in Stable Diffusion UNet. The following example demonstrates how to use a *MotionAdapter* checkpoint with Diffusers for inference based on StableDiffusion-1.4/1.5. @@ -96,6 +99,60 @@ Here are some sample outputs: AnimateDiff tends to work better with finetuned Stable Diffusion models. If you plan on using a scheduler that can clip samples, make sure to disable it by setting `clip_sample=False` in the scheduler as this can also have an adverse effect on generated samples. Additionally, the AnimateDiff checkpoints can be sensitive to the beta schedule of the scheduler. We recommend setting this to `linear`. +### AnimateDiffVideoToVideoPipeline + +AnimateDiff can also be used to generate visually similar videos or enable style/character/background edits starting from an initial video, allowing you to seamlessly explore creative possibilities. 
+ +```python +import torch +from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter +from diffusers.utils import export_to_gif + +# Load the motion adapter +adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16) +# load SD 1.5 based finetuned model +model_id = "SG161222/Realistic_Vision_V5.1_noVAE" +pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16) +scheduler = DDIMScheduler.from_pretrained( + model_id, + subfolder="scheduler", + clip_sample=False, + timestep_spacing="linspace", + beta_schedule="linear", + steps_offset=1, +) +pipe.scheduler = scheduler + +# enable memory savings +pipe.enable_vae_slicing() +pipe.enable_model_cpu_offload() + +# helper function to load videos +def load_video(file_path: str): + import imageio + + images = [] + vid = imageio.get_reader(file_path) + for frame in vid: + pil_image = Image.fromarray(frame) + images.append(pil_image) + return images + +# load initial video +video = load_video("/path/to/local/animation_fireworks.gif") + +output = pipe( + video = video, + prompt="closeup of a handsome man, robert downey jr., iron man, fireworks in the background, realistic, high quality", + negative_prompt="bad quality, worse quality", + guidance_scale=7.5, + num_inference_steps=25, + generator=torch.Generator("cpu").manual_seed(42), +) +frames = output.frames[0] +export_to_gif(frames, "animation.gif") +``` + ## Using Motion LoRAs @@ -256,3 +313,19 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) ## AnimateDiffPipelineOutput [[autodoc]] pipelines.animatediff.AnimateDiffPipelineOutput + +## AnimateDiffVideoToVideoPipeline + +[[autodoc]] AnimateDiffVideoToVideoPipeline + - all + - __call__ + - enable_freeu + - disable_freeu + - enable_vae_slicing + - disable_vae_slicing + - enable_vae_tiling + - disable_vae_tiling + +## AnimateDiffVideoToVideoPipelineOutput + +[[autodoc]] pipelines.animatediff.AnimateDiffVideoToVideoPipelineOutput From 4ce5bae4b9f10c61526d10f403d7acc30bd2a2d5 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 11:12:38 +0530 Subject: [PATCH 23/47] update --- docs/source/en/api/pipelines/animatediff.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index 79d1d4d66b4d..9f8d6cc872c1 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -101,7 +101,7 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you ### AnimateDiffVideoToVideoPipeline -AnimateDiff can also be used to generate visually similar videos or enable style/character/background edits starting from an initial video, allowing you to seamlessly explore creative possibilities. +AnimateDiff can also be used to generate visually similar videos or enable style/character/background or other edits starting from an initial video, allowing you to seamlessly explore creative possibilities. 
```python import torch From f895be8acbe64f94351df0440f8883cdeaa7024a Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 11:21:22 +0530 Subject: [PATCH 24/47] fix prepare_latents --- .../pipeline_animatediff_img2video.py | 124 ++++++++++-------- .../pipeline_animatediff_video2video.py | 2 +- 2 files changed, 73 insertions(+), 53 deletions(-) diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py index 2e1d33df94e5..b48ac2d793e9 100644 --- a/examples/community/pipeline_animatediff_img2video.py +++ b/examples/community/pipeline_animatediff_img2video.py @@ -65,6 +65,14 @@ def lerp( v1: torch.Tensor, t: Union[float, torch.Tensor], ) -> torch.Tensor: + r""" + Linear Interpolation between two tensors. + + Args: + v0 (`torch.Tensor`): First tensor. + v1 (`torch.Tensor`): Second tensor. + t: (`float` or `torch.Tensor`): Interpolation factor. + """ t_is_float = False input_device = v0.device v0 = v0.cpu().numpy() @@ -95,6 +103,17 @@ def slerp( t: Union[float, torch.Tensor], DOT_THRESHOLD: float = 0.9995, ) -> torch.Tensor: + r""" + Spherical Linear Interpolation between two tensors. + + Args: + v0 (`torch.Tensor`): First tensor. + v1 (`torch.Tensor`): Second tensor. + t: (`float` or `torch.Tensor`): Interpolation factor. + DOT_THRESHOLD (`float`): + Dot product threshold exceeding which linear interpolation will be used + because input tensors are close to parallel. + """ t_is_float = False input_device = v0.device v0 = v0.cpu().numpy() @@ -109,7 +128,7 @@ def slerp( dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) if np.abs(dot) > DOT_THRESHOLD: - # v1 and v2 are close to parallel, so use linear interpolation instead + # v0 and v1 are close to parallel, so use linear interpolation instead v2 = lerp(v0, v1, t) else: theta_0 = np.arccos(dot) @@ -661,7 +680,7 @@ def prepare_latents( device, generator, latents=None, - latent_interpolation_method="lerp", + latent_interpolation_method="slerp", ): shape = ( batch_size, @@ -670,58 +689,59 @@ def prepare_latents( height // self.vae_scale_factor, width // self.vae_scale_factor, ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - image = image.to(device=device, dtype=dtype) - - if image.shape[1] == 4: - latents = image - else: - # make sure the VAE is in float32 mode, as it overflows in float16 - if self.vae.config.force_upcast: - image = image.float() - self.vae.to(dtype=torch.float32) - - if isinstance(generator, list): - if len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - init_latents = [ - retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) - for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) - else: - init_latents = retrieve_latents(self.vae.encode(image), generator=generator) - - if self.vae.config.force_upcast: - self.vae.to(dtype) + if latents is None: + image = image.to(device=device, dtype=dtype) - init_latents = init_latents.to(dtype) - init_latents = self.vae.config.scaling_factor * init_latents - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - latents = latents * self.scheduler.init_noise_sigma - - if latent_interpolation_method == "lerp": - - def latent_cls(v0, v1, index): - return lerp(v0, v1, index / num_frames * (1 - strength)) - elif latent_interpolation_method == "slerp": - - def latent_cls(v0, v1, index): - return slerp(v0, v1, index / num_frames * (1 - strength)) + if image.shape[1] == 4: + latents = image else: - latent_cls = latent_interpolation_method - - for i in range(num_frames): - latents[:, :, i, :, :] = latent_cls(latents[:, :, i, :, :], init_latents, i) + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list): + if len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + init_latents = self.vae.config.scaling_factor * init_latents + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = latents * self.scheduler.init_noise_sigma + + if latent_interpolation_method == "lerp": + + def latent_cls(v0, v1, index): + return lerp(v0, v1, index / num_frames * (1 - strength)) + elif latent_interpolation_method == "slerp": + + def latent_cls(v0, v1, index): + return slerp(v0, v1, index / num_frames * (1 - strength)) + else: + latent_cls = latent_interpolation_method + + for i in range(num_frames): + latents[:, :, i, :, :] = latent_cls(latents[:, :, i, :, :], init_latents, i) + else: + if shape != latents.shape: + # [B, C, F, H, W] + raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}") + latents = latents.to(device, dtype=dtype) return latents @@ -751,7 +771,7 @@ def __call__( callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, - latent_interpolation_method: Union[str, Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]] = "lerp", + latent_interpolation_method: Union[str, Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]] = "slerp", ): r""" The call function to the pipeline for generation. 
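The `prepare_latents` rework above boils down to a simple per-frame schedule: every frame starts from fresh noise and is blended toward the VAE-encoded input image, with a blend factor of `i / num_frames * (1 - strength)` for frame `i`, so frame 0 stays pure noise and a higher `strength` keeps all frames noisier. A minimal sketch of that schedule, using a plain linear blend in place of the pipeline's `lerp`/`slerp` helpers and illustrative tensor shapes:

```python
import torch

num_frames, strength = 16, 0.8
noise = torch.randn(1, 4, num_frames, 64, 64)  # [B, C, F, h, w] per-frame random latents
image_latent = torch.randn(1, 4, 64, 64)       # stand-in for the VAE-encoded input image

for i in range(num_frames):
    # the blend factor grows with the frame index and shrinks as `strength` increases
    t = i / num_frames * (1 - strength)
    noise[:, :, i] = (1 - t) * noise[:, :, i] + t * image_latent
```

The real pipeline additionally scales the noise by `scheduler.init_noise_sigma` and the image latents by the VAE scaling factor, and it accepts a user-supplied callable in place of `lerp`/`slerp` via `latent_interpolation_method`.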
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index f3ff4c2cd34e..ff43275fe28b 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -697,7 +697,7 @@ def prepare_latents( latents = self.scheduler.add_noise(init_latents, noise, timestep).permute(0, 2, 1, 3, 4) else: if shape != latents.shape: - # [B, F, C, H, W] + # [B, C, F, H, W] raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}") latents = latents.to(device, dtype=dtype) From 2b0533d991b23bb576b3da21ec92aa27e1b52a47 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 4 Jan 2024 11:32:08 +0530 Subject: [PATCH 25/47] fix docs --- docs/source/en/api/pipelines/animatediff.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index 9f8d6cc872c1..5f0eb339dc8c 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -99,6 +99,8 @@ Here are some sample outputs: AnimateDiff tends to work better with finetuned Stable Diffusion models. If you plan on using a scheduler that can clip samples, make sure to disable it by setting `clip_sample=False` in the scheduler as this can also have an adverse effect on generated samples. Additionally, the AnimateDiff checkpoints can be sensitive to the beta schedule of the scheduler. We recommend setting this to `linear`. + + ### AnimateDiffVideoToVideoPipeline AnimateDiff can also be used to generate visually similar videos or enable style/character/background or other edits starting from an initial video, allowing you to seamlessly explore creative possibilities. @@ -153,8 +155,6 @@ frames = output.frames[0] export_to_gif(frames, "animation.gif") ``` - - ## Using Motion LoRAs Motion LoRAs are a collection of LoRAs that work with the `guoyww/animatediff-motion-adapter-v1-5-2` checkpoint. These LoRAs are responsible for adding specific types of motion to the animations. From cf2b1b3dfc9a1324ac93a630f4d3d0c45676c8c5 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 10 Jan 2024 02:52:29 +0530 Subject: [PATCH 26/47] remove img2vid --- .../pipeline_animatediff_img2video.py | 989 ------------------ 1 file changed, 989 deletions(-) delete mode 100644 examples/community/pipeline_animatediff_img2video.py diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py deleted file mode 100644 index b48ac2d793e9..000000000000 --- a/examples/community/pipeline_animatediff_img2video.py +++ /dev/null @@ -1,989 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -from dataclasses import dataclass -from types import FunctionType -from typing import Any, Callable, Dict, List, Optional, Union - -import numpy as np -import torch -from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection - -from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel -from diffusers.models.lora import adjust_lora_scale_text_encoder -from diffusers.models.unet_motion_model import MotionAdapter -from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.schedulers import ( - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, -) -from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers -from diffusers.utils.torch_utils import randn_tensor - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> import torch - >>> from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler - >>> from diffusers.utils import export_to_gif, load_image - - >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") - >>> pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda") - >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") - - >>> image = load_image("snail.png") - >>> output = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, latent_interpolation_method="slerp") - >>> frames = output.frames[0] - >>> export_to_gif(frames, "animation.gif") - ``` -""" - - -def lerp( - v0: torch.Tensor, - v1: torch.Tensor, - t: Union[float, torch.Tensor], -) -> torch.Tensor: - r""" - Linear Interpolation between two tensors. - - Args: - v0 (`torch.Tensor`): First tensor. - v1 (`torch.Tensor`): Second tensor. - t: (`float` or `torch.Tensor`): Interpolation factor. - """ - t_is_float = False - input_device = v0.device - v0 = v0.cpu().numpy() - v1 = v1.cpu().numpy() - - if isinstance(t, torch.Tensor): - t = t.cpu().numpy() - else: - t_is_float = True - t = np.array([t], dtype=v0.dtype) - - t = t[..., None] - v0 = v0[None, ...] - v1 = v1[None, ...] - v2 = (1 - t) * v0 + t * v1 - - if t_is_float and v0.ndim > 1: - assert v2.shape[0] == 1 - v2 = np.squeeze(v2, axis=0) - - v2 = torch.from_numpy(v2).to(input_device) - return v2 - - -def slerp( - v0: torch.Tensor, - v1: torch.Tensor, - t: Union[float, torch.Tensor], - DOT_THRESHOLD: float = 0.9995, -) -> torch.Tensor: - r""" - Spherical Linear Interpolation between two tensors. - - Args: - v0 (`torch.Tensor`): First tensor. - v1 (`torch.Tensor`): Second tensor. - t: (`float` or `torch.Tensor`): Interpolation factor. - DOT_THRESHOLD (`float`): - Dot product threshold exceeding which linear interpolation will be used - because input tensors are close to parallel. 
- """ - t_is_float = False - input_device = v0.device - v0 = v0.cpu().numpy() - v1 = v1.cpu().numpy() - - if isinstance(t, torch.Tensor): - t = t.cpu().numpy() - else: - t_is_float = True - t = np.array([t], dtype=v0.dtype) - - dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) - - if np.abs(dot) > DOT_THRESHOLD: - # v0 and v1 are close to parallel, so use linear interpolation instead - v2 = lerp(v0, v1, t) - else: - theta_0 = np.arccos(dot) - sin_theta_0 = np.sin(theta_0) - theta_t = theta_0 * t - sin_theta_t = np.sin(theta_t) - s0 = np.sin(theta_0 - theta_t) / sin_theta_0 - s1 = sin_theta_t / sin_theta_0 - s0 = s0[..., None] - s1 = s1[..., None] - v0 = v0[None, ...] - v1 = v1[None, ...] - v2 = s0 * v0 + s1 * v1 - - if t_is_float and v0.ndim > 1: - assert v2.shape[0] == 1 - v2 = np.squeeze(v2, axis=0) - - v2 = torch.from_numpy(v2).to(input_device) - return v2 - - -def tensor2vid(video: torch.Tensor, processor, output_type="np"): - # Based on: - # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 - - batch_size, channels, num_frames, height, width = video.shape - outputs = [] - for batch_idx in range(batch_size): - batch_vid = video[batch_idx].permute(1, 0, 2, 3) - batch_output = processor.postprocess(batch_vid, output_type) - - outputs.append(batch_output) - - return outputs - - -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents -def retrieve_latents( - encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" -): - if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": - return encoder_output.latent_dist.sample(generator) - elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": - return encoder_output.latent_dist.mode() - elif hasattr(encoder_output, "latents"): - return encoder_output.latents - else: - raise AttributeError("Could not access latents of provided encoder_output") - - -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps -def retrieve_timesteps( - scheduler, - num_inference_steps: Optional[int] = None, - device: Optional[Union[str, torch.device]] = None, - timesteps: Optional[List[int]] = None, - **kwargs, -): - """ - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Args: - scheduler (`SchedulerMixin`): - The scheduler to get timesteps from. - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. - device (`str` or `torch.device`, *optional*): - The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. - timesteps (`List[int]`, *optional*): - Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default - timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` - must be `None`. - - Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. 
- """ - if timesteps is not None: - accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) - if not accepts_timesteps: - raise ValueError( - f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" - f" timestep schedules. Please check whether you are using the correct scheduler." - ) - scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) - timesteps = scheduler.timesteps - num_inference_steps = len(timesteps) - else: - scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) - timesteps = scheduler.timesteps - return timesteps, num_inference_steps - - -@dataclass -class AnimateDiffImgToVideoPipelineOutput(BaseOutput): - frames: Union[torch.Tensor, np.ndarray] - - -class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): - r""" - Pipeline for text-to-video generation. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods - implemented for all pipelines (downloading, saving, running on a particular device, etc.). - - The pipeline also inherits the following loading methods: - - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights - - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights - - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`CLIPTextModel`]): - Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). - tokenizer (`CLIPTokenizer`): - A [`~transformers.CLIPTokenizer`] to tokenize text. - unet ([`UNet2DConditionModel`]): - A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. - motion_adapter ([`MotionAdapter`]): - A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of - [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
- """ - - model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" - _optional_components = ["feature_extractor", "image_encoder"] - - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - motion_adapter: MotionAdapter, - scheduler: Union[ - DDIMScheduler, - PNDMScheduler, - LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, - ], - feature_extractor: CLIPImageProcessor = None, - image_encoder: CLIPVisionModelWithProjection = None, - ): - super().__init__() - unet = UNetMotionModel.from_unet2d(unet, motion_adapter) - - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - motion_adapter=motion_adapter, - scheduler=scheduler, - feature_extractor=feature_extractor, - image_encoder=image_encoder, - ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt - def encode_prompt( - self, - prompt, - device, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - lora_scale: Optional[float] = None, - clip_skip: Optional[int] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - device: (`torch.device`): - torch device - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - lora_scale (`float`, *optional*): - A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. 
- """ - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, LoraLoaderMixin): - self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - if not USE_PEFT_BACKEND: - adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - else: - scale_lora_layers(self.text_encoder, lora_scale) - - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - prompt = self.maybe_convert_prompt(prompt, self.tokenizer) - - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( - text_input_ids, untruncated_ids - ): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = text_inputs.attention_mask.to(device) - else: - attention_mask = None - - if clip_skip is None: - prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) - prompt_embeds = prompt_embeds[0] - else: - prompt_embeds = self.text_encoder( - text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True - ) - # Access the `hidden_states` first, that contains a tuple of - # all the hidden states from the encoder layers. Then index into - # the tuple to access the hidden states from the desired layer. - prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] - # We also need to apply the final LayerNorm here to not mess with the - # representations. The `last_hidden_states` that we typically use for - # obtaining the final prompt representations passes through the LayerNorm - # layer. 
- prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) - - if self.text_encoder is not None: - prompt_embeds_dtype = self.text_encoder.dtype - elif self.unet is not None: - prompt_embeds_dtype = self.unet.dtype - else: - prompt_embeds_dtype = prompt_embeds.dtype - - prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - bs_embed, seq_len, _ = prompt_embeds.shape - # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - # textual inversion: procecss multi-vector tokens if necessary - if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="pt", - ) - - if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: - attention_mask = uncond_input.attention_mask.to(device) - else: - attention_mask = None - - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] - - if do_classifier_free_guidance: - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = negative_prompt_embeds.shape[1] - - negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) - - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - - if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: - # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder, lora_scale) - - return prompt_embeds, negative_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image - def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): - dtype = next(self.image_encoder.parameters()).dtype - - if not isinstance(image, torch.Tensor): - image = self.feature_extractor(image, return_tensors="pt").pixel_values - - image = image.to(device=device, dtype=dtype) - if output_hidden_states: - image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] - image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) - uncond_image_enc_hidden_states = self.image_encoder( - torch.zeros_like(image), output_hidden_states=True - ).hidden_states[-2] - uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( - num_images_per_prompt, dim=0 - ) - return image_enc_hidden_states, uncond_image_enc_hidden_states - else: - image_embeds = self.image_encoder(image).image_embeds - image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) - uncond_image_embeds = torch.zeros_like(image_embeds) - - return image_embeds, uncond_image_embeds - - # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents - def decode_latents(self, latents): - latents = 1 / self.vae.config.scaling_factor * latents - - batch_size, channels, num_frames, height, width = latents.shape - latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) - - image = self.vae.decode(latents).sample - video = ( - image[None, :] - .reshape( - ( - batch_size, - num_frames, - -1, - ) - + image.shape[2:] - ) - .permute(0, 2, 1, 3, 4) - ) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - video = video.float() - return video - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing - def enable_vae_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.vae.enable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing - def disable_vae_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_slicing() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling - def enable_vae_tiling(self): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.vae.enable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling - def disable_vae_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to - computing decoding in one step. - """ - self.vae.disable_tiling() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu - def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): - r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. - - The suffixes after the scaling factors represent the stages where they are being applied. - - Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values - that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. 
- - Args: - s1 (`float`): - Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - s2 (`float`): - Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to - mitigate "oversmoothing effect" in the enhanced denoising process. - b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. - b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. - """ - if not hasattr(self, "unet"): - raise ValueError("The pipeline must have `unet` for using FreeU.") - self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu - def disable_freeu(self): - """Disables the FreeU mechanism if enabled.""" - self.unet.disable_freeu() - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # check if the scheduler accepts generator - accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_generator: - extra_step_kwargs["generator"] = generator - return extra_step_kwargs - - def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - callback_on_step_end_tensor_inputs=None, - latent_interpolation_method=None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - ): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
- ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if latent_interpolation_method is not None: - if latent_interpolation_method not in ["lerp", "slerp"] and not isinstance( - latent_interpolation_method, FunctionType - ): - raise ValueError( - "`latent_interpolation_method` must be one of `lerp`, `slerp` or a Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]" - ) - - def prepare_latents( - self, - image, - strength, - batch_size, - num_channels_latents, - num_frames, - height, - width, - dtype, - device, - generator, - latents=None, - latent_interpolation_method="slerp", - ): - shape = ( - batch_size, - num_channels_latents, - num_frames, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - - if latents is None: - image = image.to(device=device, dtype=dtype) - - if image.shape[1] == 4: - latents = image - else: - # make sure the VAE is in float32 mode, as it overflows in float16 - if self.vae.config.force_upcast: - image = image.float() - self.vae.to(dtype=torch.float32) - - if isinstance(generator, list): - if len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
- ) - - init_latents = [ - retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) - for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) - else: - init_latents = retrieve_latents(self.vae.encode(image), generator=generator) - - if self.vae.config.force_upcast: - self.vae.to(dtype) - - init_latents = init_latents.to(dtype) - init_latents = self.vae.config.scaling_factor * init_latents - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - latents = latents * self.scheduler.init_noise_sigma - - if latent_interpolation_method == "lerp": - - def latent_cls(v0, v1, index): - return lerp(v0, v1, index / num_frames * (1 - strength)) - elif latent_interpolation_method == "slerp": - - def latent_cls(v0, v1, index): - return slerp(v0, v1, index / num_frames * (1 - strength)) - else: - latent_cls = latent_interpolation_method - - for i in range(num_frames): - latents[:, :, i, :, :] = latent_cls(latents[:, :, i, :, :], init_latents, i) - else: - if shape != latents.shape: - # [B, C, F, H, W] - raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}") - latents = latents.to(device, dtype=dtype) - - return latents - - @torch.no_grad() - def __call__( - self, - image: PipelineImageInput, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_frames: int = 16, - num_inference_steps: int = 50, - timesteps: Optional[List[int]] = None, - guidance_scale: float = 7.5, - strength: float = 0.8, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_videos_per_prompt: Optional[int] = 1, - eta: float = 0.0, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, - prompt_embeds: Optional[torch.FloatTensor] = None, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - ip_adapter_image: Optional[PipelineImageInput] = None, - output_type: Optional[str] = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - clip_skip: Optional[int] = None, - latent_interpolation_method: Union[str, Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]] = "slerp", - ): - r""" - The call function to the pipeline for generation. - - Args: - image (`PipelineImageInput`): - The input image to condition the generation on. - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The height in pixels of the generated video. - width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): - The width in pixels of the generated video. - num_frames (`int`, *optional*, defaults to 16): - The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds - amounts to 2 seconds of video. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality videos at the - expense of slower inference. - strength (`float`, *optional*, defaults to 0.8): - Higher strength leads to more differences between original image and generated video. 
- guidance_scale (`float`, *optional*, defaults to 7.5): - A higher guidance scale value encourages the model to generate images closely linked to the text - `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide what to not include in image generation. If not defined, you need to - pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). - eta (`float`, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make - generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor is generated by sampling using the supplied random `generator`. Latents should be of shape - `(batch_size, num_channel, num_frames, height, width)`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not - provided, text embeddings are generated from the `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If - not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. - ip_adapter_image: (`PipelineImageInput`, *optional*): - Optional image input to work with IP Adapters. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or - `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`AnimateDiffImgToVideoPipelineOutput`] instead - of a plain tuple. - callback (`Callable`, *optional*): - A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function is called. If not specified, the callback is called at - every step. - cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in - [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - clip_skip (`int`, *optional*): - Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that - the output of the pre-final layer will be used for computing the prompt embeddings. - latent_interpolation_method (`str` or `Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]]`, *optional*): - Must be one of "lerp", "slerp" or a callable that takes in a random noisy latent, image latent and a frame index - as input and returns an initial latent for sampling. 
- Examples: - - Returns: - [`AnimateDiffImgToVideoPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`AnimateDiffImgToVideoPipelineOutput`] is - returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. - """ - # 0. Default height and width to unet - height = height or self.unet.config.sample_size * self.vae_scale_factor - width = width or self.unet.config.sample_size * self.vae_scale_factor - - num_videos_per_prompt = 1 - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt=prompt, - height=height, - width=width, - callback_steps=callback_steps, - negative_prompt=negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - latent_interpolation_method=latent_interpolation_method, - ) - - # 2. Define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - device = self._execution_device - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. Encode input prompt - text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None - ) - prompt_embeds, negative_prompt_embeds = self.encode_prompt( - prompt, - device, - num_videos_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, - clip_skip=clip_skip, - ) - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - if do_classifier_free_guidance: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) - - if ip_adapter_image is not None: - output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True - image_embeds, negative_image_embeds = self.encode_image( - ip_adapter_image, device, num_videos_per_prompt, output_hidden_state - ) - if do_classifier_free_guidance: - image_embeds = torch.cat([negative_image_embeds, image_embeds]) - - # 4. Preprocess image - image = self.image_processor.preprocess(image, height=height, width=width) - - # 5. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) - - # 6. Prepare latent variables - num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - image=image, - strength=strength, - batch_size=batch_size * num_videos_per_prompt, - num_channels_latents=num_channels_latents, - num_frames=num_frames, - height=height, - width=width, - dtype=prompt_embeds.dtype, - device=device, - generator=generator, - latents=latents, - latent_interpolation_method=latent_interpolation_method, - ) - - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 8. Add image embeds for IP-Adapter - added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - - # 9. 
Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - return AnimateDiffImgToVideoPipelineOutput(frames=latents) - - # 10. Post-processing - video_tensor = self.decode_latents(latents) - - if output_type == "pt": - video = video_tensor - else: - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) - - # 11. Offload all models - self.maybe_free_model_hooks() - - if not return_dict: - return (video,) - - return AnimateDiffImgToVideoPipelineOutput(frames=video) From f6f4079a503cdcb9ae1b1affa93a6c99287b393c Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 10 Jan 2024 02:52:54 +0530 Subject: [PATCH 27/47] update README to :main --- examples/community/README.md | 204 +++++++++++++++++++++++++++++------ 1 file changed, 174 insertions(+), 30 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index 2bba16881732..3284b2ab27d7 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -21,6 +21,7 @@ If a community doesn't work as expected, please open an issue and ping the autho | Seed Resizing Stable Diffusion | Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation. | [Seed Resizing](#seed-resizing) | - | [Mark Rich](https://github.com/MarkRich) | | Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image | [Imagic Stable Diffusion](#imagic-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) | | Multilingual Stable Diffusion | Stable Diffusion Pipeline that supports prompts in 50 different languages. | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline) | - | [Juan Carlos Piñeros](https://github.com/juancopi81) | +| GlueGen Stable Diffusion | Stable Diffusion Pipeline that supports prompts in different languages using GlueGen adapter. 
| [GlueGen Stable Diffusion](#gluegen-stable-diffusion-pipeline) | - | [Phạm Hồng Vinh](https://github.com/rootonchair) | | Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting | [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | - | [Alex McKinney](https://github.com/vvvm23) | | Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting | [Text Based Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | - | [Dhruv Karan](https://github.com/unography) | | Bit Diffusion | Diffusion on discrete data | [Bit Diffusion](#bit-diffusion) | - | [Stuti R.](https://github.com/kingstut) | @@ -53,8 +54,9 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | Regional Prompting Pipeline | Assign multiple prompts for different regions | [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) | | LDM3D-sr (LDM3D upscaler) | Upscale low resolution RGB and depth inputs to high resolution | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline) | - | [Estelle Aflalo](https://github.com/estelleafl) | | AnimateDiff ControlNet Pipeline | Combines AnimateDiff with precise motion control using ControlNets | [AnimateDiff ControlNet Pipeline](#animatediff-controlnet-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1SKboYeGjEQmQPWoFC0aLYpBlYdHXkvAu?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) and [Edoardo Botta](https://github.com/EdoardoBotta) | -| AnimateDiff Image-To-Video Pipeline | Image-To-Video experimental support for AnimateDiff (open to improvements) | [AnimateDiff Image To Video Pipeline](#animatediff-image-to-video-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/1TvzCDPHhfFtdcJZe4RLloAwyoLKuttWK/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | | DemoFusion Pipeline | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973) | [DemoFusion Pipeline](#DemoFusion) | - | [Ruoyi Du](https://github.com/RuoyiDu) | +| Null-Text Inversion Pipeline | Implement [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/abs/2211.09794) as a pipeline. | [Null-Text Inversion](https://github.com/google/prompt-to-prompt/) | - | [Junsheng Luan](https://github.com/Junsheng121) | +| Rerender A Video Pipeline | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954) | [Rerender A Video Pipeline](#Rerender_A_Video) | - | [Yifan Zhou](https://github.com/SingleZombie) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. 
```py @@ -739,6 +741,49 @@ grid = image_grid(images, rows=2, cols=2) This example produces the following images: ![image](https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png) +### GlueGen Stable Diffusion Pipeline +GlueGen is a minimal adapter that allow alignment between any encoder (Text Encoder of different language, Multilingual Roberta, AudioClip) and CLIP text encoder used in standard Stable Diffusion model. This method allows easy language adaptation to available english Stable Diffusion checkpoints without the need of an image captioning dataset as well as long training hours. + +Make sure you downloaded `gluenet_French_clip_overnorm_over3_noln.ckpt` for French (there are also pre-trained weights for Chinese, Italian, Japanese, Spanish or train your own) at [GlueGen's official repo](https://github.com/salesforce/GlueGen/tree/main) + +```python +from PIL import Image + +import torch + +from transformers import AutoModel, AutoTokenizer + +from diffusers import DiffusionPipeline + +if __name__ == "__main__": + device = "cuda" + + lm_model_id = "xlm-roberta-large" + token_max_length = 77 + + text_encoder = AutoModel.from_pretrained(lm_model_id) + tokenizer = AutoTokenizer.from_pretrained(lm_model_id, model_max_length=token_max_length, use_fast=False) + + tensor_norm = torch.Tensor([[43.8203],[28.3668],[27.9345],[28.0084],[28.2958],[28.2576],[28.3373],[28.2695],[28.4097],[28.2790],[28.2825],[28.2807],[28.2775],[28.2708],[28.2682],[28.2624],[28.2589],[28.2611],[28.2616],[28.2639],[28.2613],[28.2566],[28.2615],[28.2665],[28.2799],[28.2885],[28.2852],[28.2863],[28.2780],[28.2818],[28.2764],[28.2532],[28.2412],[28.2336],[28.2514],[28.2734],[28.2763],[28.2977],[28.2971],[28.2948],[28.2818],[28.2676],[28.2831],[28.2890],[28.2979],[28.2999],[28.3117],[28.3363],[28.3554],[28.3626],[28.3589],[28.3597],[28.3543],[28.3660],[28.3731],[28.3717],[28.3812],[28.3753],[28.3810],[28.3777],[28.3693],[28.3713],[28.3670],[28.3691],[28.3679],[28.3624],[28.3703],[28.3703],[28.3720],[28.3594],[28.3576],[28.3562],[28.3438],[28.3376],[28.3389],[28.3433],[28.3191]]) + + pipeline = DiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + text_encoder=text_encoder, + tokenizer=tokenizer, + custom_pipeline="gluegen" + ).to(device) + pipeline.load_language_adapter("gluenet_French_clip_overnorm_over3_noln.ckpt", num_token=token_max_length, dim=1024, dim_out=768, tensor_norm=tensor_norm) + + prompt = "une voiture sur la plage" + + generator = torch.Generator(device=device).manual_seed(42) + image = pipeline(prompt, generator=generator).images[0] + image.save("gluegen_output_fr.png") +``` +Which will produce: + +![output_image](https://github.com/rootonchair/diffusers/assets/23548268/db43ffb6-8667-47c1-8872-26f85dc0a57f) + ### Image to Image Inpainting Stable Diffusion Similar to the standard stable diffusion inpainting example, except with the addition of an `inner_image` argument. 
@@ -1177,7 +1222,7 @@ The resulting images in order:- ![result5](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_5.png) ### DDIM Noise Comparative Analysis Pipeline -#### **Research question: What visual concepts do the diffusion models learn from each noise level during training?** +#### **Research question: What visual concepts do the diffusion models learn from each noise level during training?** The [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227) paper proposed an approach to answer the above question, which is their second contribution. The approach consists of the following steps: @@ -1901,7 +1946,7 @@ output = pipeline( canvas_width=352, regions=[ Text2ImageRegion(0, 800, 0, 352, guidance_scale=8, - prompt=f"best quality, masterpiece, WLOP, sakimichan, art contest winner on pixiv, 8K, intricate details, wet effects, rain drops, ethereal, mysterious, futuristic, UHD, HDR, cinematic lighting, in a beautiful forest, rainy day, award winning, trending on artstation, beautiful confident cheerful young woman, wearing a futuristic sleeveless dress, ultra beautiful detailed eyes, hyper-detailed face, complex, perfect, model,  textured, chiaroscuro, professional make-up, realistic, figure in frame, "), + prompt=f"best quality, masterpiece, WLOP, sakimichan, art contest winner on pixiv, 8K, intricate details, wet effects, rain drops, ethereal, mysterious, futuristic, UHD, HDR, cinematic lighting, in a beautiful forest, rainy day, award winning, trending on artstation, beautiful confident cheerful young woman, wearing a futuristic sleeveless dress, ultra beautiful detailed eyes, hyper-detailed face, complex, perfect, model, textured, chiaroscuro, professional make-up, realistic, figure in frame, "), Image2ImageRegion(352-800, 352, 0, 352, reference_image=iic_image, strength=1.0), ], num_inference_steps=100, @@ -2943,7 +2988,7 @@ pipe = DiffusionPipeline.from_pretrained( custom_pipeline="pipeline_animatediff_controlnet", ).to(device="cuda", dtype=torch.float16) pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( - model_id, subfolder="scheduler", beta_schedule="linear", clip_sample=False, timestep_spacing="linspace", steps_offset=1 + model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 ) pipe.enable_vae_slicing() @@ -2982,33 +3027,7 @@ export_to_gif(result.frames[0], "result.gif") gif-2 - -### AnimateDiff Image-To-Video Pipeline - -This pipeline adds experimental support for the image-to-video task using AnimateDiff. Refer to [this](https://github.com/huggingface/diffusers/pull/6328) PR for more examples and results. 
- -```py -import torch -from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler -from diffusers.utils import export_to_gif, load_image - -adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") -pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda") -pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") - -image = load_image("snail.png") -output = pipe( - image=image, - prompt="A snail moving on the ground", - strength=0.8, - latent_interpolation_method="slerp", # can be lerp, slerp, or your own callback -) -frames = output.frames[0] -export_to_gif(frames, "animation.gif") -``` - ### DemoFusion - This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973). The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion). - `view_batch_size` (`int`, defaults to 16): @@ -3129,3 +3148,128 @@ output_image = PIL.Image.fromarray(output) output_image.save("./output.png") ``` + +### Null-Text Inversion pipeline + +This pipeline provides null-text inversion for editing real images. It enables null-text optimization, and DDIM reconstruction via w, w/o null-text optimization. No prompt-to-prompt code is implemented as there is a Prompt2PromptPipeline. +* Reference paper + ```@article{hertz2022prompt, + title={Prompt-to-prompt image editing with cross attention control}, + author={Hertz, Amir and Mokady, Ron and Tenenbaum, Jay and Aberman, Kfir and Pritch, Yael and Cohen-Or, Daniel}, + booktitle={arXiv preprint arXiv:2208.01626}, + year={2022} + ```} + +```py +from diffusers.schedulers import DDIMScheduler +from examples.community.pipeline_null_text_inversion import NullTextPipeline +import torch + +# Load the pipeline +device = "cuda" +# Provide invert_prompt and the image for null-text optimization. +invert_prompt = "A lying cat" +input_image = "siamese.jpg" +steps = 50 + +# Provide prompt used for generation. Same if reconstruction +prompt = "A lying cat" +# or different if editing. +prompt = "A lying dog" + +#Float32 is essential to a well optimization +model_path = "runwayml/stable-diffusion-v1-5" +scheduler = DDIMScheduler(num_train_timesteps=1000, beta_start=0.00085, beta_end=0.0120, beta_schedule="scaled_linear") +pipeline = NullTextPipeline.from_pretrained(model_path, scheduler = scheduler, torch_dtype=torch.float32).to(device) + +#Saves the inverted_latent to save time +inverted_latent, uncond = pipeline.invert(input_image, invert_prompt, num_inner_steps=10, early_stop_epsilon= 1e-5, num_inference_steps = steps) +pipeline(prompt, uncond, inverted_latent, guidance_scale=7.5, num_inference_steps=steps).images[0].save(input_image+".output.jpg") +``` +### Rerender_A_Video + +This is the Diffusers implementation of zero-shot video-to-video translation pipeline [Rerender_A_Video](https://github.com/williamyang1991/Rerender_A_Video) (without Ebsynth postprocessing). To run the code, please install gmflow. 
Then modify the path in `examples/community/rerender_a_video.py`: + +```py +gmflow_dir = "/path/to/gmflow" +``` + +After that, you can run the pipeline with: + +```py +from diffusers import ControlNetModel, AutoencoderKL, DDIMScheduler +from diffusers.utils import export_to_video +import numpy as np +import torch + +import cv2 +from PIL import Image + +def video_to_frame(video_path: str, interval: int): + vidcap = cv2.VideoCapture(video_path) + success = True + + count = 0 + res = [] + while success: + count += 1 + success, image = vidcap.read() + if count % interval != 1: + continue + if image is not None: + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + res.append(image) + + vidcap.release() + return res + +input_video_path = 'path/to/video' +input_interval = 10 +frames = video_to_frame( + input_video_path, input_interval) + +control_frames = [] +# get canny image +for frame in frames: + np_image = cv2.Canny(frame, 50, 100) + np_image = np_image[:, :, None] + np_image = np.concatenate([np_image, np_image, np_image], axis=2) + canny_image = Image.fromarray(np_image) + control_frames.append(canny_image) + +# You can use any ControlNet here +controlnet = ControlNetModel.from_pretrained( + "lllyasviel/sd-controlnet-canny").to('cuda') + +# You can use any fintuned SD here +pipe = DiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, custom_pipeline='rerender_a_video').to('cuda') + +# Optional: you can download vae-ft-mse-840000-ema-pruned.ckpt to enhance the results +# pipe.vae = AutoencoderKL.from_single_file( +# "path/to/vae-ft-mse-840000-ema-pruned.ckpt").to('cuda') + +pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + +generator = torch.manual_seed(0) +frames = [Image.fromarray(frame) for frame in frames] +output_frames = pipe( + "a beautiful woman in CG style, best quality, extremely detailed", + + frames, + control_frames, + num_inference_steps=20, + strength=0.75, + controlnet_conditioning_scale=0.7, + generator=generator, + warp_start=0.0, + warp_end=0.1, + mask_start=0.5, + mask_end=0.8, + mask_strength=0.5, + negative_prompt='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality' +).frames + +export_to_video( + output_frames, "/path/to/video.mp4", 5) +``` From b1c5db9d13676cda7f66da29fe213e6ea7277ad1 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 10 Jan 2024 20:40:01 +0530 Subject: [PATCH 28/47] remove slow test --- .../test_animatediff_video2video.py | 74 +------------------ 1 file changed, 1 insertion(+), 73 deletions(-) diff --git a/tests/pipelines/animatediff/test_animatediff_video2video.py b/tests/pipelines/animatediff/test_animatediff_video2video.py index 090f0fedd93a..6c1cc17bc803 100644 --- a/tests/pipelines/animatediff/test_animatediff_video2video.py +++ b/tests/pipelines/animatediff/test_animatediff_video2video.py @@ -1,4 +1,3 @@ -import gc import unittest import numpy as np @@ -16,7 +15,7 @@ UNetMotionModel, ) from diffusers.utils import is_xformers_available, logging -from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import torch_device from ..pipeline_params import TEXT_TO_IMAGE_PARAMS, VIDEO_TO_VIDEO_BATCH_PARAMS from ..test_pipelines_common import PipelineTesterMixin @@ -268,74 +267,3 @@ def test_xformers_attention_forwardGenerator_pass(self): max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() 
self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results") - - -@slow -@require_torch_gpu -class AnimateDiffVideoToVideoPipelineSlowTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_animatediff_vid2vid(self): - adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") - pipe = AnimateDiffVideoToVideoPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter) - pipe = pipe.to(torch_device) - pipe.scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="linear", - steps_offset=1, - clip_sample=False, - ) - pipe.enable_vae_slicing() - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - prompt = "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain" - negative_prompt = "bad quality, worse quality" - - def load_video(file_path): - import imageio - - images = [] - vid = imageio.get_reader(file_path) - for frame in vid: - pil_image = Image.fromarray(frame) - images.append(pil_image) - return images - - video = load_video("/path/to/huggingface/video.gif") - - generator = torch.Generator("cpu").manual_seed(0) - output = pipe( - video=video, - prompt=prompt, - negative_prompt=negative_prompt, - num_frames=16, - generator=generator, - guidance_scale=7.5, - num_inference_steps=3, - output_type="np", - ) - - image = output.frames[0] - assert image.shape == (16, 512, 512, 3) - - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array( - [ - 0.11357737, - 0.11285847, - 0.11180121, - 0.11084166, - 0.11414117, - 0.09785956, - 0.10742754, - 0.10510018, - 0.08045256, - ] - ) - assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3 From 3fc8623b4b0192a629cb153b2026cb4fee972c81 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 10 Jan 2024 20:56:09 +0530 Subject: [PATCH 29/47] refactor pipeline output --- .../pipelines/animatediff/__init__.py | 17 ++++------ .../animatediff/pipeline_animatediff.py | 17 ++-------- .../pipeline_animatediff_video2video.py | 32 ++++++++----------- .../pipelines/animatediff/pipeline_output.py | 22 +++++++++++++ 4 files changed, 43 insertions(+), 45 deletions(-) create mode 100644 src/diffusers/pipelines/animatediff/pipeline_output.py diff --git a/src/diffusers/pipelines/animatediff/__init__.py b/src/diffusers/pipelines/animatediff/__init__.py index 2b4493d9b908..35b99a76fd21 100644 --- a/src/diffusers/pipelines/animatediff/__init__.py +++ b/src/diffusers/pipelines/animatediff/__init__.py @@ -11,7 +11,7 @@ _dummy_objects = {} -_import_structure = {} +_import_structure = {"pipeline_output": ["AnimateDiffPipelineOutput"]} try: if not (is_transformers_available() and is_torch_available()): @@ -21,11 +21,8 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline", "AnimateDiffPipelineOutput"] - _import_structure["pipeline_animatediff_video2video"] = [ - "AnimateDiffVideoToVideoPipeline", - "AnimateDiffVideoToVideoPipelineOutput", - ] + _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"] + _import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -35,11 +32,9 @@ from 
...utils.dummy_torch_and_transformers_objects import * else: - from .pipeline_animatediff import AnimateDiffPipeline, AnimateDiffPipelineOutput - from .pipeline_animatediff_video2video import ( - AnimateDiffVideoToVideoPipeline, - AnimateDiffVideoToVideoPipelineOutput, - ) + from .pipeline_animatediff import AnimateDiffPipeline + from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline + from .pipeline_output import AnimateDiffPipelineOutput else: import sys diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index b0fe790c2222..b257a84c97c9 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -13,10 +13,8 @@ # limitations under the License. import inspect -from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Union -import numpy as np import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -33,16 +31,10 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ...utils import ( - USE_PEFT_BACKEND, - BaseOutput, - logging, - replace_example_docstring, - scale_lora_layers, - unscale_lora_layers, -) +from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import AnimateDiffPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -79,11 +71,6 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"): return outputs -@dataclass -class AnimateDiffPipelineOutput(BaseOutput): - frames: Union[torch.Tensor, np.ndarray] - - class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index ff43275fe28b..7513b325548e 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -13,11 +13,9 @@ # limitations under the License. import inspect -from dataclasses import dataclass from types import FunctionType from typing import Any, Callable, Dict, List, Optional, Union -import numpy as np import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection @@ -34,9 +32,10 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import AnimateDiffPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -54,12 +53,12 @@ >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") >>> def load_video(file_path): - >>> images = [] - >>> vid = imageio.get_reader(file_path) - >>> for i, frame in enumerate(vid): - >>> pil_image = Image.fromarray(frame) - >>> images.append(pil_image) - >>> return images + ... images = [] + ... 
vid = imageio.get_reader(file_path) + ... for i, frame in enumerate(vid): + ... pil_image = Image.fromarray(frame) + ... images.append(pil_image) + ... return images >>> video = load_video("animation_fireworks.gif") >>> output = pipe(video=video, prompt="Closeup of a woman, fireworks in the background", strength=0.7) @@ -143,11 +142,6 @@ def retrieve_timesteps( return timesteps, num_inference_steps -@dataclass -class AnimateDiffVideoToVideoPipelineOutput(BaseOutput): - frames: Union[torch.Tensor, np.ndarray] - - class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. @@ -775,7 +769,7 @@ def __call__( The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`AnimateDiffVideoToVideoPipelineOutput`] instead + Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple. callback (`Callable`, *optional*): A function that calls every `callback_steps` steps during inference. The function is called with the @@ -792,8 +786,8 @@ def __call__( Examples: Returns: - [`AnimateDiffVideoToVideoPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`AnimateDiffVideoToVideoPipelineOutput`] is + [`AnimateDiffPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`AnimateDiffPipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. """ # 0. Default height and width to unet @@ -919,7 +913,7 @@ def __call__( callback(i, t, latents) if output_type == "latent": - return AnimateDiffVideoToVideoPipelineOutput(frames=latents) + return AnimateDiffPipelineOutput(frames=latents) # 9. Post-processing video_tensor = self.decode_latents(latents) @@ -935,4 +929,4 @@ def __call__( if not return_dict: return (video,) - return AnimateDiffVideoToVideoPipelineOutput(frames=video) + return AnimateDiffPipelineOutput(frames=video) diff --git a/src/diffusers/pipelines/animatediff/pipeline_output.py b/src/diffusers/pipelines/animatediff/pipeline_output.py new file mode 100644 index 000000000000..79a399fbc3a6 --- /dev/null +++ b/src/diffusers/pipelines/animatediff/pipeline_output.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL.Image +import torch + +from ...utils import BaseOutput + + +@dataclass +class AnimateDiffPipelineOutput(BaseOutput): + r""" + Output class for AnimateDiff pipelines. + + Args: + frames (`List[List[PIL.Image.Image]]` or `torch.Tensor` or `np.ndarray`): + List of PIL Images of length `batch_size` or torch.Tensor or np.ndarray of shape + `(batch_size, num_frames, height, width, num_channels)`. 
+ """ + + frames: Union[List[List[PIL.Image.Image]], torch.Tensor, np.ndarray] From 817b44e45ddd8c294349443e2544690337c9c5a5 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 10 Jan 2024 21:37:50 +0530 Subject: [PATCH 30/47] update docs --- docs/source/en/api/pipelines/animatediff.md | 57 +++++++++++++++------ 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index 5f0eb339dc8c..58327c8c5979 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -155,6 +155,47 @@ frames = output.frames[0] export_to_gif(frames, "animation.gif") ``` +Here are some sample outputs: + + + + + + + + + + + + + + +
+<table>
+    <tr>
+        <td align=center>Source Video</td>
+        <td align=center>Output Video</td>
+    </tr>
+    <tr>
+        <td align=center>raccoon playing a guitar</td>
+        <td align=center>racoon playing a guitar</td>
+    </tr>
+    <tr>
+        <td align=center>panda playing a guitar</td>
+        <td align=center>panda playing a guitar</td>
+    </tr>
+    <tr>
+        <td align=center>closeup of margot robbie, fireworks in the background, high quality</td>
+        <td align=center>closeup of margot robbie, fireworks in the background, high quality</td>
+    </tr>
+    <tr>
+        <td align=center>closeup of tony stark, robert downey jr, fireworks</td>
+        <td align=center>closeup of tony stark, robert downey jr, fireworks</td>
+    </tr>
+</table>
+<!-- sample GIFs omitted; only the captions are kept -->
+
+ ## Using Motion LoRAs Motion LoRAs are a collection of LoRAs that work with the `guoyww/animatediff-motion-adapter-v1-5-2` checkpoint. These LoRAs are responsible for adding specific types of motion to the animations. @@ -313,19 +354,3 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) ## AnimateDiffPipelineOutput [[autodoc]] pipelines.animatediff.AnimateDiffPipelineOutput - -## AnimateDiffVideoToVideoPipeline - -[[autodoc]] AnimateDiffVideoToVideoPipeline - - all - - __call__ - - enable_freeu - - disable_freeu - - enable_vae_slicing - - disable_vae_slicing - - enable_vae_tiling - - disable_vae_tiling - -## AnimateDiffVideoToVideoPipelineOutput - -[[autodoc]] pipelines.animatediff.AnimateDiffVideoToVideoPipelineOutput From 65430795398ea18db15d3e1febe62c79ee8b5f3c Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 11 Jan 2024 23:10:34 +0530 Subject: [PATCH 31/47] update docs --- docs/source/en/api/pipelines/animatediff.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index 58327c8c5979..d93d4083371a 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -141,11 +141,11 @@ def load_video(file_path: str): return images # load initial video -video = load_video("/path/to/local/animation_fireworks.gif") +video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif") output = pipe( video = video, - prompt="closeup of a handsome man, robert downey jr., iron man, fireworks in the background, realistic, high quality", + prompt="panda playing a guitar, on a boat, in the ocean, high quality", negative_prompt="bad quality, worse quality", guidance_scale=7.5, num_inference_steps=25, From df602b374cff41190be3ec6ca056395acc66fdb0 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 11 Jan 2024 23:13:47 +0530 Subject: [PATCH 32/47] merge community readme from :main --- examples/community/README.md | 61 ++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/examples/community/README.md b/examples/community/README.md index 3284b2ab27d7..de1daece3f9d 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -57,6 +57,7 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | DemoFusion Pipeline | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973) | [DemoFusion Pipeline](#DemoFusion) | - | [Ruoyi Du](https://github.com/RuoyiDu) | | Null-Text Inversion Pipeline | Implement [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/abs/2211.09794) as a pipeline. 
| [Null-Text Inversion](https://github.com/google/prompt-to-prompt/) | - | [Junsheng Luan](https://github.com/Junsheng121) | | Rerender A Video Pipeline | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954) | [Rerender A Video Pipeline](#Rerender_A_Video) | - | [Yifan Zhou](https://github.com/SingleZombie) | +| StyleAligned Pipeline | Implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133) | [StyleAligned Pipeline](#stylealigned-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/15X2E0jFPTajUIjS0FzX50OaHsCbP2lQ0/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. ```py @@ -3027,7 +3028,9 @@ export_to_gif(result.frames[0], "result.gif") gif-2 + ### DemoFusion + This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973). The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion). - `view_batch_size` (`int`, defaults to 16): @@ -3273,3 +3276,61 @@ output_frames = pipe( export_to_video( output_frames, "/path/to/video.mp4", 5) ``` + +### StyleAligned Pipeline + +This pipeline is the implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133). + +> Large-scale Text-to-Image (T2I) models have rapidly gained prominence across creative fields, generating visually compelling outputs from textual prompts. However, controlling these models to ensure consistent style remains challenging, with existing methods necessitating fine-tuning and manual intervention to disentangle content and style. In this paper, we introduce StyleAligned, a novel technique designed to establish style alignment among a series of generated images. By employing minimal `attention sharing' during the diffusion process, our method maintains style consistency across images within T2I models. This approach allows for the creation of style-consistent images using a reference style through a straightforward inversion operation. Our method's evaluation across diverse styles and text prompts demonstrates high-quality synthesis and fidelity, underscoring its efficacy in achieving consistent style across various inputs. + +```python +from typing import List + +import torch +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from PIL import Image + +model_id = "a-r-r-o-w/dreamshaper-xl-turbo" +pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, variant="fp16", custom_pipeline="pipeline_sdxl_style_aligned") +pipe = pipe.to("cuda") + +# Enable memory saving techniques +pipe.enable_vae_slicing() +pipe.enable_vae_tiling() + +prompt = [ + "a toy train. macro photo. 3d game asset", + "a toy airplane. macro photo. 3d game asset", + "a toy bicycle. macro photo. 3d game asset", + "a toy car. macro photo. 
3d game asset", +] +negative_prompt = "low quality, worst quality, " + +# Enable StyleAligned +pipe.enable_style_aligned( + share_group_norm=False, + share_layer_norm=False, + share_attention=True, + adain_queries=True, + adain_keys=True, + adain_values=False, + full_attention_share=False, + shared_score_scale=1.0, + shared_score_shift=0.0, + only_self_level=0.0, +) + +# Run inference +images = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + guidance_scale=2, + height=1024, + width=1024, + num_inference_steps=10, + generator=torch.Generator().manual_seed(42), +).images + +# Disable StyleAligned if you do not wish to use it anymore +pipe.disable_style_aligned() +``` From 5042b7de77a1cf83cca8eb68743525e8b756fe6a Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Thu, 11 Jan 2024 23:38:19 +0530 Subject: [PATCH 33/47] final fix i promise --- examples/community/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index de1daece3f9d..7d8d190f037f 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1223,7 +1223,7 @@ The resulting images in order:- ![result5](https://huggingface.co/datasets/NagaSaiAbhinay/UnCLIPImageInterpolationSamples/resolve/main/starry_to_flowers_5.png) ### DDIM Noise Comparative Analysis Pipeline -#### **Research question: What visual concepts do the diffusion models learn from each noise level during training?** +#### **Research question: What visual concepts do the diffusion models learn from each noise level during training?** The [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227) paper proposed an approach to answer the above question, which is their second contribution. The approach consists of the following steps: @@ -1947,7 +1947,7 @@ output = pipeline( canvas_width=352, regions=[ Text2ImageRegion(0, 800, 0, 352, guidance_scale=8, - prompt=f"best quality, masterpiece, WLOP, sakimichan, art contest winner on pixiv, 8K, intricate details, wet effects, rain drops, ethereal, mysterious, futuristic, UHD, HDR, cinematic lighting, in a beautiful forest, rainy day, award winning, trending on artstation, beautiful confident cheerful young woman, wearing a futuristic sleeveless dress, ultra beautiful detailed eyes, hyper-detailed face, complex, perfect, model, textured, chiaroscuro, professional make-up, realistic, figure in frame, "), + prompt=f"best quality, masterpiece, WLOP, sakimichan, art contest winner on pixiv, 8K, intricate details, wet effects, rain drops, ethereal, mysterious, futuristic, UHD, HDR, cinematic lighting, in a beautiful forest, rainy day, award winning, trending on artstation, beautiful confident cheerful young woman, wearing a futuristic sleeveless dress, ultra beautiful detailed eyes, hyper-detailed face, complex, perfect, model,  textured, chiaroscuro, professional make-up, realistic, figure in frame, "), Image2ImageRegion(352-800, 352, 0, 352, reference_image=iic_image, strength=1.0), ], num_inference_steps=100, @@ -3333,4 +3333,4 @@ images = pipe( # Disable StyleAligned if you do not wish to use it anymore pipe.disable_style_aligned() -``` +``` \ No newline at end of file From 9a2d8bacac067d0319955472ee2214e4b8f12120 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Fri, 12 Jan 2024 15:36:16 +0530 Subject: [PATCH 34/47] add support for url in animatediff example --- docs/source/en/api/pipelines/animatediff.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git 
a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index d93d4083371a..de8a06718f21 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -106,9 +106,13 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you AnimateDiff can also be used to generate visually similar videos or enable style/character/background or other edits starting from an initial video, allowing you to seamlessly explore creative possibilities. ```python +import imageio +import requests import torch from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter from diffusers.utils import export_to_gif +from io import BytesIO +from PIL import Image # Load the motion adapter adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16) @@ -131,16 +135,24 @@ pipe.enable_model_cpu_offload() # helper function to load videos def load_video(file_path: str): - import imageio - images = [] - vid = imageio.get_reader(file_path) + + if file_path.startswith(('http://', 'https://')): + # If the file_path is a URL + response = requests.get(file_path) + response.raise_for_status() + content = BytesIO(response.content) + vid = imageio.get_reader(content) + else: + # Assuming it's a local file path + vid = imageio.get_reader(file_path) + for frame in vid: pil_image = Image.fromarray(frame) images.append(pil_image) + return images -# load initial video video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif") output = pipe( @@ -149,6 +161,7 @@ output = pipe( negative_prompt="bad quality, worse quality", guidance_scale=7.5, num_inference_steps=25, + strength=0.5, generator=torch.Generator("cpu").manual_seed(42), ) frames = output.frames[0] From 78fa5a8f53ded14cd76afbd34769ef29f9b6bbd7 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Fri, 12 Jan 2024 15:36:49 +0530 Subject: [PATCH 35/47] update example --- .../pipeline_animatediff_video2video.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 7513b325548e..1806d8a95cc4 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -44,24 +44,38 @@ Examples: ```py >>> import imageio + >>> import requests >>> import torch - >>> from diffusers import MotionAdapter, AnimateDiffVideoToVideoPipeline, DDIMScheduler - >>> from diffusers.utils import export_to_gif, load_image + >>> from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter + >>> from diffusers.utils import export_to_gif + >>> from io import BytesIO + >>> from PIL import Image - >>> adapter = MotionAdapter.from_pretrained("diffusers/motion-adapter") + >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16) >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") - >>> def load_video(file_path): + >>> def load_video(file_path: str): ... images = [] - ... vid = imageio.get_reader(file_path) - ... 
for i, frame in enumerate(vid): + ... + ... if file_path.startswith(('http://', 'https://')): + ... # If the file_path is a URL + ... response = requests.get(file_path) + ... response.raise_for_status() + ... content = BytesIO(response.content) + ... vid = imageio.get_reader(content) + ... else: + ... # Assuming it's a local file path + ... vid = imageio.get_reader(file_path) + ... + ... for frame in vid: ... pil_image = Image.fromarray(frame) ... images.append(pil_image) + ... ... return images - >>> video = load_video("animation_fireworks.gif") - >>> output = pipe(video=video, prompt="Closeup of a woman, fireworks in the background", strength=0.7) + >>> video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif") + >>> output = pipe(video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5) >>> frames = output.frames[0] >>> export_to_gif(frames, "animation.gif") ``` From fe871f751063f04f4acfb850b9779c47cfd7d620 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 17 Jan 2024 19:00:32 +0530 Subject: [PATCH 36/47] update callbacks to latest implementation --- .../pipeline_animatediff_video2video.py | 110 +++++++++++++----- 1 file changed, 81 insertions(+), 29 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 1806d8a95cc4..d64dc28f2523 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -from types import FunctionType from typing import Any, Callable, Dict, List, Optional, Union import torch @@ -82,6 +81,7 @@ """ +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid def tensor2vid(video: torch.Tensor, processor, output_type="np"): # Based on: # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 @@ -187,6 +187,7 @@ class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderM model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" _optional_components = ["feature_extractor", "image_encoder"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] def __init__( self, @@ -545,7 +546,6 @@ def check_inputs( prompt_embeds=None, negative_prompt_embeds=None, callback_on_step_end_tensor_inputs=None, - latent_interpolation_method=None, ): if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") @@ -591,14 +591,6 @@ def check_inputs( f" {negative_prompt_embeds.shape}." 
) - if latent_interpolation_method is not None: - if latent_interpolation_method not in ["lerp", "slerp"] and not isinstance( - latent_interpolation_method, FunctionType - ): - raise ValueError( - "`latent_interpolation_method` must be one of `lerp`, `slerp` or a Callable[[torch.Tensor, torch.Tensor, int], torch.Tensor]" - ) - if video is not None and latents is not None: raise ValueError("Only one of `video` or `latents` should be provided") @@ -711,6 +703,29 @@ def prepare_latents( return latents + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + @torch.no_grad() def __call__( self, @@ -736,6 +751,9 @@ def __call__( callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" The call function to the pipeline for generation. @@ -785,18 +803,22 @@ def __call__( return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function is called. If not specified, the callback is called at - every step. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + Examples: Returns: @@ -804,6 +826,25 @@ def __call__( If `return_dict` is `True`, [`AnimateDiffPipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. 
""" + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using" + " `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using" + " `callback_on_step_end`", + ) + # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor @@ -822,8 +863,13 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, video=video, latents=latents, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, ) + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -834,31 +880,26 @@ def __call__( device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None ) prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, device, num_videos_per_prompt, - do_classifier_free_guidance, + self.do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, - clip_skip=clip_skip, + clip_skip=self.clip_skip, ) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) if ip_adapter_image is not None: @@ -866,13 +907,14 @@ def __call__( image_embeds, negative_image_embeds = self.encode_image( ip_adapter_image, device, num_videos_per_prompt, output_hidden_state ) - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. Prepare timesteps timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt) + self._num_timesteps = len(timesteps) # 5. 
Prepare latent variables num_channels_latents = self.unet.config.in_channels @@ -900,7 +942,7 @@ def __call__( with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual @@ -908,18 +950,28 @@ def __call__( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, + cross_attention_kwargs=self.cross_attention_kwargs, added_cond_kwargs=added_cond_kwargs, ).sample # perform guidance - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() From 600e414dbfbf1600667e2f7a336b54ea09c103f5 Mon Sep 17 00:00:00 2001 From: Aryan V S Date: Wed, 17 Jan 2024 19:01:02 +0530 Subject: [PATCH 37/47] Update src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py Co-authored-by: Patrick von Platen --- .../pipelines/animatediff/pipeline_animatediff_video2video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index d64dc28f2523..6ea27f8ef89e 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -158,7 +158,7 @@ def retrieve_timesteps( class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): r""" - Pipeline for text-to-video generation. + Pipeline for video-to-video generation. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
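Note (not part of the patches): the `callback_on_step_end` / `callback_on_step_end_tensor_inputs` arguments introduced above replace the old `callback` / `callback_steps` pair. A minimal sketch of wiring a per-step callback into the video-to-video pipeline is shown below; the model IDs, sample GIF URL and `load_video` helper are reused from the examples earlier in this series, while the `log_step` callback itself is purely illustrative.

```py
import imageio
import requests
import torch
from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif
from io import BytesIO
from PIL import Image

# Sketch only: model IDs and the input GIF come from the earlier examples in this series.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = DDIMScheduler(
    beta_schedule="linear", steps_offset=1, clip_sample=False, timestep_spacing="linspace"
)

def load_video(file_path: str):
    # same helper as in the docs example: supports local paths and URLs
    images = []
    if file_path.startswith(("http://", "https://")):
        response = requests.get(file_path)
        response.raise_for_status()
        vid = imageio.get_reader(BytesIO(response.content))
    else:
        vid = imageio.get_reader(file_path)
    for frame in vid:
        images.append(Image.fromarray(frame))
    return images

video = load_video(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
)

def log_step(pipeline, step, timestep, callback_kwargs):
    # callback_kwargs holds the tensors requested via `callback_on_step_end_tensor_inputs`
    latents = callback_kwargs["latents"]
    print(f"step {step}, t={timestep}, latents shape {tuple(latents.shape)}")
    # returning a dict lets the pipeline pick up the (possibly modified) tensors again
    return {"latents": latents}

output = pipe(
    video=video,
    prompt="panda playing a guitar, on a boat, in the ocean, high quality",
    strength=0.5,
    callback_on_step_end=log_step,
    callback_on_step_end_tensor_inputs=["latents"],
)
export_to_gif(output.frames[0], "animation.gif")
```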
From 254ea67e4ed25600f4561bc353d1b408e2d4da03 Mon Sep 17 00:00:00 2001 From: Aryan V S Date: Wed, 17 Jan 2024 19:02:08 +0530 Subject: [PATCH 38/47] Update src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py Co-authored-by: Patrick von Platen --- .../animatediff/pipeline_animatediff_video2video.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 6ea27f8ef89e..9f845e6c36e7 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -677,15 +677,11 @@ def prepare_latents( if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size - deprecation_message = ( + error_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." + " images (`image`). Please make sure to update your script to pass as many initial images as text prompts" ) - deprecate("len(prompt) != len(video)", "1.0.0", deprecation_message, standard_warn=False) - additional_video_per_prompt = batch_size // init_latents.shape[0] - init_latents = torch.cat([init_latents] * additional_video_per_prompt, dim=0) + raise ValueError(error_message) elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." From ed37cae9d83443ae1676559ae049a975a6d7f03a Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Wed, 17 Jan 2024 19:05:28 +0530 Subject: [PATCH 39/47] fix merge --- .../pipelines/animatediff/pipeline_animatediff.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 7535fbe0f3a4..e0dbc18916e0 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -14,7 +14,6 @@ import inspect import math -from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch @@ -36,7 +35,6 @@ ) from ...utils import ( USE_PEFT_BACKEND, - BaseOutput, deprecate, logging, replace_example_docstring, @@ -147,11 +145,6 @@ def _freq_mix_3d(x: torch.Tensor, noise: torch.Tensor, LPF: torch.Tensor) -> tor return x_mixed -@dataclass -class AnimateDiffPipelineOutput(BaseOutput): - frames: Union[torch.Tensor, np.ndarray] - - class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. 
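Note (illustrative, not from the diff): with the deprecation path above replaced by a hard error, the number of text prompts must now match the number of encoded input videos. Assuming `pipe` and `video` are set up as in the previous sketch, the behavior would look roughly like this:

```py
# Sketch only: `pipe` and `video` are assumed to be constructed as in the previous sketch.

# One prompt for one input video works as before.
output = pipe(
    video=video,
    prompt="panda playing a guitar, on a boat, in the ocean, high quality",
    strength=0.5,
)

# Passing more prompts than input videos is no longer silently duplicated;
# `prepare_latents` now raises a ValueError instead of emitting a deprecation warning.
try:
    pipe(
        video=video,
        prompt=["panda playing a guitar", "raccoon playing a guitar"],
        strength=0.5,
    )
except ValueError as err:
    print(err)
```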
From 6b84aef900ea6c76d15c0fbda3fa771d48667513 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 19 Jan 2024 12:04:57 +0200 Subject: [PATCH 40/47] Apply suggestions from code review --- .../pipeline_animatediff_video2video.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 9f845e6c36e7..698c70072b37 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -749,7 +749,6 @@ def __call__( clip_skip: Optional[int] = None, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], - **kwargs, ): r""" The call function to the pipeline for generation. @@ -823,24 +822,6 @@ def __call__( returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. """ - callback = kwargs.pop("callback", None) - callback_steps = kwargs.pop("callback_steps", None) - - if callback is not None: - deprecate( - "callback", - "1.0.0", - "Passing `callback` as an input argument to `__call__` is deprecated, consider using" - " `callback_on_step_end`", - ) - if callback_steps is not None: - deprecate( - "callback_steps", - "1.0.0", - "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using" - " `callback_on_step_end`", - ) - # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor From 1c645ed26baef8d032afd5d14e539c8a18d6f8f8 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Fri, 19 Jan 2024 16:11:08 +0530 Subject: [PATCH 41/47] remove callback and callback_steps as suggested in review --- .../pipeline_animatediff_video2video.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 698c70072b37..1c89dc409bfb 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -31,7 +31,7 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import randn_tensor from ..pipeline_utils import DiffusionPipeline from .pipeline_output import AnimateDiffPipelineOutput @@ -539,7 +539,6 @@ def check_inputs( strength, height, width, - callback_steps, video=None, latents=None, negative_prompt=None, @@ -553,11 +552,6 @@ def check_inputs( if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) if callback_on_step_end_tensor_inputs is not None and not all( k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs ): @@ -743,8 +737,6 @@ def __call__( ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -834,7 +826,6 @@ def __call__( strength=strength, height=height, width=width, - callback_steps=callback_steps, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, @@ -949,12 +940,6 @@ def __call__( prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - if output_type == "latent": return AnimateDiffPipelineOutput(frames=latents) From fdbb68f7d1b597f22740b9abfac0bdb564afef04 Mon Sep 17 00:00:00 2001 From: Aryan V S Date: Fri, 19 Jan 2024 17:02:28 +0530 Subject: [PATCH 42/47] Update tests/pipelines/animatediff/test_animatediff_video2video.py Co-authored-by: Patrick von Platen --- tests/pipelines/animatediff/test_animatediff_video2video.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/animatediff/test_animatediff_video2video.py b/tests/pipelines/animatediff/test_animatediff_video2video.py index 6c1cc17bc803..3226bdb3ca6e 100644 --- a/tests/pipelines/animatediff/test_animatediff_video2video.py +++ b/tests/pipelines/animatediff/test_animatediff_video2video.py @@ -38,8 +38,8 @@ class AnimateDiffVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.Tes "generator", "latents", "return_dict", - "callback", - "callback_steps", + "callback_on_step_end", + "callback_on_step_end_tensor_inputs", ] ) From 5674a7106e4cc88f9409d00610ea0137898cf22b Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Tue, 23 Jan 2024 11:40:58 +0530 Subject: [PATCH 43/47] fix import error caused due to unet refactor in #6630 --- .../pipelines/animatediff/pipeline_animatediff_video2video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 1c89dc409bfb..c437eecc6493 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -22,7 +22,7 @@ from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel from ...models.lora import adjust_lora_scale_text_encoder -from ...models.unet_motion_model import MotionAdapter +from ...models.unets.unet_motion_model import MotionAdapter from ...schedulers import ( DDIMScheduler, DPMSolverMultistepScheduler, From 032c24f9b38150510af7fbd682d0515602afa7a3 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Tue, 23 Jan 2024 11:44:04 +0530 Subject: [PATCH 44/47] fix numpy import error after tensor2vid refactor in 
#6626 --- src/diffusers/pipelines/animatediff/pipeline_animatediff.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index 016a3a5ca043..ee1062ee81ff 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -16,6 +16,7 @@ import math from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import numpy as np import torch import torch.fft as fft from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection From 41ac8626a3c7df62caf5bedd1c2a0fbe1f3e6061 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Tue, 23 Jan 2024 11:44:20 +0530 Subject: [PATCH 45/47] make fix-copies --- .../animatediff/pipeline_animatediff_video2video.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index c437eecc6493..7f64be3c5cde 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -83,9 +83,6 @@ # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid def tensor2vid(video: torch.Tensor, processor, output_type="np"): - # Based on: - # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 - batch_size, channels, num_frames, height, width = video.shape outputs = [] for batch_idx in range(batch_size): @@ -94,6 +91,15 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"): outputs.append(batch_output) + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. 
Please choose one of ['np', 'pt', 'pil]") + return outputs From c3a70eb68f60351a6b2e37b8bdf7a3fe910b50d4 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Tue, 23 Jan 2024 11:45:03 +0530 Subject: [PATCH 46/47] fix numpy error --- .../pipelines/animatediff/pipeline_animatediff_video2video.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index 7f64be3c5cde..fd3c2a8a38f3 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -15,6 +15,7 @@ import inspect from typing import Any, Callable, Dict, List, Optional, Union +import numpy as np import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection From 8b820a0ca035ea3891a44f35cd01ece308107c85 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Tue, 23 Jan 2024 19:23:23 +0530 Subject: [PATCH 47/47] fix progress bar test --- .../pipelines/animatediff/pipeline_animatediff_video2video.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py index fd3c2a8a38f3..3d01009cbac7 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -947,6 +947,8 @@ def __call__( prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + progress_bar.update() + if output_type == "latent": return AnimateDiffPipelineOutput(frames=latents)
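Taken together, these patches leave the video-to-video pipeline with only the callback_on_step_end interface; the old callback/callback_steps arguments are gone rather than deprecated. A hedged end-to-end usage sketch follows — the checkpoint names, the frame-loading helper, and the top-level exports are assumptions about the installed diffusers version, not something this series guarantees:

import torch
from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif, load_video  # load_video assumed available; otherwise load PIL frames manually

# Assumed checkpoints, chosen only for illustration.
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", motion_adapter=adapter, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = DDIMScheduler.from_config(
    pipe.scheduler.config, beta_schedule="linear", clip_sample=False, timestep_spacing="linspace"
)

def log_step(pipeline, step, timestep, callback_kwargs):
    # Replacement for the removed `callback(i, t, latents)` hook.
    print(f"step {step}, timestep {int(timestep)}")
    return callback_kwargs

frames = load_video("input.gif")  # list of PIL images
result = pipe(
    prompt="a panda surfing a wave, high quality",
    video=frames,
    strength=0.6,
    callback_on_step_end=log_step,
    callback_on_step_end_tensor_inputs=["latents"],
)
export_to_gif(result.frames[0], "output.gif")

The callback receives the pipeline, the step index, the current timestep, and the tensors requested via callback_on_step_end_tensor_inputs, and must return the (possibly modified) callback_kwargs dict — this is what replaces the per-step callback removed in PATCH 41.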