From 3cdf465efd5ec595e31e48c1fedb25da583005d7 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Sun, 25 Feb 2024 17:40:54 +0530 Subject: [PATCH 1/9] update svd docs --- .../pipeline_stable_video_diffusion.py | 98 ++++++++++--------- 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index f53ebbafee2e..01c19a8b198b 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -24,13 +24,31 @@ from ...image_processor import VaeImageProcessor from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel from ...schedulers import EulerDiscreteScheduler -from ...utils import BaseOutput, logging +from ...utils import BaseOutput, logging, replace_example_docstring from ...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline logger = logging.get_logger(__name__) # pylint: disable=invalid-name +EXAMPLE_DOC_STRING = """ + Examples: + + ```py + from diffusers import StableVideoDiffusionPipeline + from diffusers.utils import load_image, export_to_video + + pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") + pipe.to("cuda") + + image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") + image = image.resize((1024, 576)) + + frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] + export_to_video(frames, "generated.mp4", fps=7) + ``` +""" + def _append_dims(x, target_dims): """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" @@ -44,18 +62,16 @@ def _append_dims(x, target_dims): def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): batch_size, channels, num_frames, height, width = video.shape outputs = [] + for batch_idx in range(batch_size): batch_vid = video[batch_idx].permute(1, 0, 2, 3) batch_output = processor.postprocess(batch_vid, output_type) - outputs.append(batch_output) if output_type == "np": outputs = np.stack(outputs) - elif output_type == "pt": outputs = torch.stack(outputs) - elif not output_type == "pil": raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil]") @@ -65,15 +81,15 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: @dataclass class StableVideoDiffusionPipelineOutput(BaseOutput): r""" - Output class for zero-shot text-to-video pipeline. + Output class for Stable Video Diffusion pipeline. Args: - frames (`[List[PIL.Image.Image]`, `np.ndarray`]): - List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, - num_channels)`. + frames (`[List[PIL.Image.Image]`, `np.ndarray`, `torch.FloatTensor`]): + List of denoised PIL images of length `batch_size` or NumPy array of shape + `(batch_size, height, width, num_channels)`. 
""" - frames: Union[List[PIL.Image.Image], np.ndarray] + frames: Union[List[PIL.Image.Image], np.ndarray, torch.FloatTensor] class StableVideoDiffusionPipeline(DiffusionPipeline): @@ -307,6 +323,7 @@ def num_timesteps(self): return self._num_timesteps @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], @@ -329,19 +346,20 @@ def __call__( return_dict: bool = True, ): r""" - The call function to the pipeline for generation. + Function invoked when calling the pipeline for generation. Args: image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): - Image or images to guide image generation. If you provide a tensor, the expected value range is between `[0,1]`. + Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. num_frames (`int`, *optional*): - The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt` + The number of video frames to generate. Defaults to `self.unet.config.num_frames` + (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`). num_inference_steps (`int`, *optional*, defaults to 25): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the + The number of denoising steps. More denoising steps usually lead to a higher quality video at the expense of slower inference. This parameter is modulated by `strength`. min_guidance_scale (`float`, *optional*, defaults to 1.0): The minimum guidance scale. Used for the classifier free guidance with first frame. @@ -351,29 +369,30 @@ def __call__( Frames per second. The rate at which the generated images shall be exported to a video after generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. motion_bucket_id (`int`, *optional*, defaults to 127): - The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video. + Used for conditioning the amount of motion for the generation. The higher the number the more motion + will be in the video. noise_aug_strength (`float`, *optional*, defaults to 0.02): The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency - between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once - for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. + The number of frames to decode at a time. Higher chunk size will lead to better temporal consistency but + at the cost of more memory consumption. By default, the decoder will decode all frames at once for maximal + quality. For lower memory usage, reduce `decode_chunk_size`. num_videos_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. + The number of videos to generate per prompt. 
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor is generated by sampling using the supplied random `generator`. output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between `PIL.Image` or `np.array`. + The output format of the generated image. Choose between `pil`, `np` or `pt`. callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: + `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. + `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -382,26 +401,12 @@ def __call__( Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. + Examples: + Returns: - [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`: If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list of list with the generated frames. - - Examples: - - ```py - from diffusers import StableVideoDiffusionPipeline - from diffusers.utils import load_image, export_to_video - - pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") - pipe.to("cuda") - - image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") - image = image.resize((1024, 576)) - - frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] - export_to_video(frames, "generated.mp4", fps=7) - ``` """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor @@ -429,8 +434,7 @@ def __call__( # 3. Encode input image image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance) - # NOTE: Stable Diffusion Video was conditioned on fps - 1, which - # is why it is reduced here. + # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here. 
# See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188 fps = fps - 1 @@ -471,11 +475,11 @@ def __call__( ) added_time_ids = added_time_ids.to(device) - # 4. Prepare timesteps + # 6. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps - # 5. Prepare latent variables + # 7. Prepare latent variables num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( batch_size * num_videos_per_prompt, @@ -489,7 +493,7 @@ def __call__( latents, ) - # 7. Prepare guidance scale + # 8. Prepare guidance scale guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0) guidance_scale = guidance_scale.to(device, latents.dtype) guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1) @@ -497,7 +501,7 @@ def __call__( self._guidance_scale = guidance_scale - # 8. Denoising loop + # 9. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) with self.progress_bar(total=num_inference_steps) as progress_bar: From 0e38ccda8fb50a9da48cdaada35198a44370676f Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Fri, 1 Mar 2024 17:41:13 +0530 Subject: [PATCH 2/9] fix example doc string --- .../pipeline_stable_video_diffusion.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 01c19a8b198b..b963693f5619 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -33,19 +33,18 @@ EXAMPLE_DOC_STRING = """ Examples: - ```py - from diffusers import StableVideoDiffusionPipeline - from diffusers.utils import load_image, export_to_video + >>> from diffusers import StableVideoDiffusionPipeline + >>> from diffusers.utils import load_image, export_to_video - pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") - pipe.to("cuda") + >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") + >>> pipe.to("cuda") - image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") - image = image.resize((1024, 576)) + >>> image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") + >>> image = image.resize((1024, 576)) - frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] - export_to_video(frames, "generated.mp4", fps=7) + >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] + >>> export_to_video(frames, "generated.mp4", fps=7) ``` """ From 12f886a504af59821acc686d5d25b2cff689e0e2 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Fri, 1 Mar 2024 17:45:56 +0530 Subject: [PATCH 3/9] update return type hints/docs --- .../pipeline_stable_video_diffusion.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py 
b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index b963693f5619..388279d4f245 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -83,12 +83,12 @@ class StableVideoDiffusionPipelineOutput(BaseOutput): Output class for Stable Video Diffusion pipeline. Args: - frames (`[List[PIL.Image.Image]`, `np.ndarray`, `torch.FloatTensor`]): - List of denoised PIL images of length `batch_size` or NumPy array of shape - `(batch_size, height, width, num_channels)`. + frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]): + List of denoised PIL images of length `batch_size` or numpy array or torch tensor + of shape `(batch_size, num_frames, height, width, num_channels)`. """ - frames: Union[List[PIL.Image.Image], np.ndarray, torch.FloatTensor] + frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor] class StableVideoDiffusionPipeline(DiffusionPipeline): @@ -403,9 +403,9 @@ def __call__( Examples: Returns: - [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`: + [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, - otherwise a `tuple` is returned where the first element is a list of list with the generated frames. + otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor From e4c4a15f86512bb6942e3ec2c06d0ff9f655f2a0 Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Fri, 1 Mar 2024 17:51:00 +0530 Subject: [PATCH 4/9] update type hints --- .../pipeline_stable_video_diffusion.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 388279d4f245..042755c0dcd6 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessor +from ...image_processor import VaeImageProcessor, PipelineImageInput from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel from ...schedulers import EulerDiscreteScheduler from ...utils import BaseOutput, logging, replace_example_docstring @@ -134,7 +134,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - def _encode_image(self, image, device, num_videos_per_prompt, do_classifier_free_guidance): + def _encode_image(self, image: PipelineImageInput, device: Union[str, torch.device], num_videos_per_prompt: int, do_classifier_free_guidance: bool) -> torch.FloatTensor: dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): @@ -179,9 +179,9 @@ def _encode_image(self, image, device, num_videos_per_prompt, do_classifier_free def _encode_vae_image( self, 
image: torch.Tensor, - device, - num_videos_per_prompt, - do_classifier_free_guidance, + device: Union[str, torch.device], + num_videos_per_prompt: int, + do_classifier_free_guidance: bool, ): image = image.to(device=device) image_latents = self.vae.encode(image).latent_dist.mode() @@ -201,13 +201,13 @@ def _encode_vae_image( def _get_add_time_ids( self, - fps, - motion_bucket_id, - noise_aug_strength, - dtype, - batch_size, - num_videos_per_prompt, - do_classifier_free_guidance, + fps: int, + motion_bucket_id: int, + noise_aug_strength: float, + dtype: torch.dtype, + batch_size: int, + num_videos_per_prompt: int, + do_classifier_free_guidance: bool, ): add_time_ids = [fps, motion_bucket_id, noise_aug_strength] @@ -227,7 +227,7 @@ def _get_add_time_ids( return add_time_ids - def decode_latents(self, latents, num_frames, decode_chunk_size=14): + def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14): # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] latents = latents.flatten(0, 1) @@ -272,15 +272,15 @@ def check_inputs(self, image, height, width): def prepare_latents( self, - batch_size, - num_frames, - num_channels_latents, - height, - width, - dtype, - device, - generator, - latents=None, + batch_size: int, + num_frames: int, + num_channels_latents: int, + height: int, + width: int, + dtype: torch.dtype, + device: Union[str, torch.device], + generator: torch.Generator, + latents: Optional[torch.FloatTensor] = None, ): shape = ( batch_size, From 080873996dc245f977df94546f6039b8a9bffcda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= Date: Fri, 1 Mar 2024 16:53:19 +0300 Subject: [PATCH 5/9] Fix typos in pipeline_stable_video_diffusion.py --- .../stable_video_diffusion/pipeline_stable_video_diffusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 042755c0dcd6..69453e501f0a 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -58,7 +58,7 @@ def _append_dims(x, target_dims): # Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid -def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): +def tensor2vid(video: torch.Tensor, processor: VaeImageProcessor, output_type: str = "np"): batch_size, channels, num_frames, height, width = video.shape outputs = [] @@ -509,7 +509,7 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # Concatenate image_latents over channels dimention + # Concatenate image_latents over channels dimension latent_model_input = torch.cat([latent_model_input, image_latents], dim=2) # predict the noise residual From 03d2f6629095973ab68cbf38b62e3985c57556cb Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Sat, 2 Mar 2024 03:03:23 +0530 Subject: [PATCH 6/9] make style && make fix-copies --- .../pipeline_stable_video_diffusion.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py 
b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 69453e501f0a..fdfa20a3acd7 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -21,7 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection -from ...image_processor import VaeImageProcessor, PipelineImageInput +from ...image_processor import PipelineImageInput, VaeImageProcessor from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel from ...schedulers import EulerDiscreteScheduler from ...utils import BaseOutput, logging, replace_example_docstring @@ -61,16 +61,18 @@ def _append_dims(x, target_dims): def tensor2vid(video: torch.Tensor, processor: VaeImageProcessor, output_type: str = "np"): batch_size, channels, num_frames, height, width = video.shape outputs = [] - for batch_idx in range(batch_size): batch_vid = video[batch_idx].permute(1, 0, 2, 3) batch_output = processor.postprocess(batch_vid, output_type) + outputs.append(batch_output) if output_type == "np": outputs = np.stack(outputs) + elif output_type == "pt": outputs = torch.stack(outputs) + elif not output_type == "pil": raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil]") @@ -134,7 +136,13 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - def _encode_image(self, image: PipelineImageInput, device: Union[str, torch.device], num_videos_per_prompt: int, do_classifier_free_guidance: bool) -> torch.FloatTensor: + def _encode_image( + self, + image: PipelineImageInput, + device: Union[str, torch.device], + num_videos_per_prompt: int, + do_classifier_free_guidance: bool, + ) -> torch.FloatTensor: dtype = next(self.image_encoder.parameters()).dtype if not isinstance(image, torch.Tensor): From 1981780476c46791b5ed0642b491ee5e5fb63a74 Mon Sep 17 00:00:00 2001 From: Aryan Date: Tue, 5 Mar 2024 00:50:04 +0530 Subject: [PATCH 7/9] Update src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- .../stable_video_diffusion/pipeline_stable_video_diffusion.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index fdfa20a3acd7..ee0d73dfb52a 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -381,8 +381,7 @@ def __call__( noise_aug_strength (`float`, *optional*, defaults to 0.02): The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. Higher chunk size will lead to better temporal consistency but - at the cost of more memory consumption. By default, the decoder will decode all frames at once for maximal + The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality. 
For lower memory usage, reduce `decode_chunk_size`. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. From f12ffac5320df4829dfd3c6f1498dccf919a8579 Mon Sep 17 00:00:00 2001 From: Aryan Date: Tue, 5 Mar 2024 00:50:15 +0530 Subject: [PATCH 8/9] Update src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- .../stable_video_diffusion/pipeline_stable_video_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index ee0d73dfb52a..5032f4680819 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -40,7 +40,7 @@ >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") >>> pipe.to("cuda") - >>> image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200") + >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg") >>> image = image.resize((1024, 576)) >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] From de7585a74c16e34fded1aa4b5338fd46f3a2483b Mon Sep 17 00:00:00 2001 From: a-r-r-o-w Date: Tue, 5 Mar 2024 00:51:57 +0530 Subject: [PATCH 9/9] update based on suggestion --- .../stable_video_diffusion/pipeline_stable_video_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 5032f4680819..5c755be47094 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -353,7 +353,7 @@ def __call__( return_dict: bool = True, ): r""" - Function invoked when calling the pipeline for generation. + The call function to the pipeline for generation. Args: image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
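
A minimal end-to-end sketch of the usage documented by this patch series. It mirrors the `EXAMPLE_DOC_STRING` introduced in patch 1/9 (with the checkpoint id and image URL taken from patches 1/9 and 8/9) and adds the `import torch` that `torch.float16` requires; the explicit `motion_bucket_id` and `noise_aug_strength` values are simply the defaults named in the updated `__call__` docstring, shown here for illustration rather than prescribed by the patches.

```py
# Sketch based on the example docstring added in this patch series.
import torch

from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video, load_image

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
)
pipe.to("cuda")

# The example docstring resizes the conditioning image to 1024x576 before generation.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
)
image = image.resize((1024, 576))

# Per the updated docstring: decode_chunk_size trades memory for temporal consistency,
# motion_bucket_id raises the amount of motion, and noise_aug_strength controls how far
# the video drifts from the init image. The values below are the documented defaults.
frames = pipe(
    image,
    num_frames=25,
    decode_chunk_size=8,
    motion_bucket_id=127,
    noise_aug_strength=0.02,
).frames[0]

# The UNet was micro-conditioned on fps - 1 during training; fps=7 matches the example.
export_to_video(frames, "generated.mp4", fps=7)
```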