diff --git a/docs/source/en/api/pipelines/i2vgenxl.md b/docs/source/en/api/pipelines/i2vgenxl.md
index e5c8b50f0ad8..1d7eb5db16db 100644
--- a/docs/source/en/api/pipelines/i2vgenxl.md
+++ b/docs/source/en/api/pipelines/i2vgenxl.md
@@ -18,11 +18,11 @@ The abstract from the paper is:
*Video synthesis has recently made remarkable strides benefiting from the rapid development of diffusion models. However, it still encounters challenges in terms of semantic accuracy, clarity and spatio-temporal continuity. They primarily arise from the scarcity of well-aligned text-video data and the complex inherent structure of videos, making it difficult for the model to simultaneously ensure semantic and qualitative excellence. In this report, we propose a cascaded I2VGen-XL approach that enhances model performance by decoupling these two factors and ensures the alignment of the input data by utilizing static images as a form of crucial guidance. I2VGen-XL consists of two stages: i) the base stage guarantees coherent semantics and preserves content from input images by using two hierarchical encoders, and ii) the refinement stage enhances the video's details by incorporating an additional brief text and improves the resolution to 1280×720. To improve the diversity, we collect around 35 million single-shot text-video pairs and 6 billion text-image pairs to optimize the model. By this means, I2VGen-XL can simultaneously enhance the semantic accuracy, continuity of details and clarity of generated videos. Through extensive experiments, we have investigated the underlying principles of I2VGen-XL and compared it with current top methods, which can demonstrate its effectiveness on diverse data. The source code and models will be publicly available at [this https URL](https://i2vgen-xl.github.io/).*
-The original codebase can be found [here](https://github.com/ali-vilab/i2vgen-xl/). The model checkpoints can be found [here](https://huggingface.co/ali-vilab/).
+The original codebase can be found [here](https://github.com/ali-vilab/i2vgen-xl/). The model checkpoints can be found [here](https://huggingface.co/ali-vilab/).
>>> import torch
>>> from diffusers import I2VGenXLPipeline
+ >>> from diffusers.utils import export_to_gif, load_image
>>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
>>> pipeline.enable_model_cpu_offload()
@@ -95,15 +96,16 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type:
@dataclass
class I2VGenXLPipelineOutput(BaseOutput):
r"""
- Output class for image-to-video pipeline.
+ Output class for image-to-video pipeline.
- Args:
- frames (`List[np.ndarray]` or `torch.FloatTensor`)
- List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
- a `torch` tensor. The length of the list denotes the video length (the number of frames).
+ Args:
+ frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
+ List of video outputs - It can be a nested list of length `batch_size`, with each sub-list containing denoised
+ PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of shape
+ `(batch_size, num_frames, channels, height, width)`.
"""
- frames: Union[List[np.ndarray], torch.FloatTensor]
+ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
class I2VGenXLPipeline(DiffusionPipeline):
diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py
index 565544a0fef4..c8b9165c84fe 100644
--- a/src/diffusers/pipelines/pia/pipeline_pia.py
+++ b/src/diffusers/pipelines/pia/pipeline_pia.py
@@ -200,13 +200,13 @@ class PIAPipelineOutput(BaseOutput):
Output class for PIAPipeline.
Args:
- frames (`torch.Tensor`, `np.ndarray`, or List[PIL.Image.Image]):
+ frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`,
NumPy array of shape `(batch_size, num_frames, channels, height, width)`, or
Torch tensor of shape `(batch_size, num_frames, channels, height, width)`.
"""
- frames: Union[torch.Tensor, np.ndarray, PIL.Image.Image]
+ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
class PIAPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py
index 411515809e6f..c155386cf173 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py
@@ -2,6 +2,7 @@
from typing import List, Union
import numpy as np
+import PIL
import torch
from ...utils import (
@@ -12,12 +13,13 @@
@dataclass
class TextToVideoSDPipelineOutput(BaseOutput):
"""
- Output class for text-to-video pipelines.
+ Output class for text-to-video pipelines.
- Args:
- frames (`List[np.ndarray]` or `torch.FloatTensor`)
- List of denoised frames (essentially images) as NumPy arrays of shape `(height, width, num_channels)` or as
- a `torch` tensor. The length of the list denotes the video length (the number of frames).
+ Args:
+ frames (`torch.Tensor`, `np.ndarray`, or `List[List[PIL.Image.Image]]`):
+ List of video outputs - It can be a nested list of length `batch_size`, with each sub-list containing denoised
+ PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of shape
+ `(batch_size, num_frames, channels, height, width)`.
"""
- frames: Union[List[np.ndarray], torch.FloatTensor]
+ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
|