From e126a82cc5d9afbeb9b476455de24dd3e7dd358a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 12 Apr 2023 20:55:54 +0530 Subject: [PATCH 01/71] [Tests] Speed up panorama tests (#3067) * fix: norm group test for UNet3D. * chore: speed up the panorama tests (fast). * set default value of _test_inference_batch_single_identical. * fix: batch_sizes default value. --- .../test_stable_diffusion_panorama.py | 18 +++++++++++++----- tests/test_pipelines_common.py | 11 ++++++----- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index de9e8a79fb34..752ed6e969c3 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -49,7 +49,7 @@ def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( block_out_channels=(32, 64), - layers_per_block=2, + layers_per_block=1, sample_size=32, in_channels=4, out_channels=4, @@ -101,7 +101,7 @@ def get_dummy_inputs(self, device, seed=0): # Setting height and width to None to prevent OOMs on CPU. "height": None, "width": None, - "num_inference_steps": 2, + "num_inference_steps": 1, "guidance_scale": 6.0, "output_type": "numpy", } @@ -119,10 +119,18 @@ def test_stable_diffusion_panorama_default_case(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4794, 0.5084, 0.4992, 0.3941, 0.3555, 0.4754, 0.5248, 0.5224, 0.4839]) + expected_slice = np.array([0.6186, 0.5374, 0.4915, 0.4135, 0.4114, 0.4563, 0.5128, 0.4977, 0.4757]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + # override to speed the overall test timing up. + def test_inference_batch_consistent(self): + super().test_inference_batch_consistent(batch_sizes=[1, 2]) + + # override to speed the overall test timing up. 
+ def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(batch_size=2) + def test_stable_diffusion_panorama_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() @@ -138,7 +146,7 @@ def test_stable_diffusion_panorama_negative_prompt(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5029, 0.5075, 0.5002, 0.3965, 0.3584, 0.4746, 0.5271, 0.5273, 0.4877]) + expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -158,7 +166,7 @@ def test_stable_diffusion_panorama_euler(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4934, 0.5455, 0.4847, 0.5022, 0.5572, 0.4833, 0.5207, 0.4952, 0.5051]) + expected_slice = np.array([0.4886, 0.5586, 0.4476, 0.5053, 0.6013, 0.4737, 0.5538, 0.5100, 0.4927]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index 13fbe924c799..981bc9061ef9 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -175,8 +175,8 @@ def test_pipeline_call_signature(self): f"Required optional parameters not present: {remaining_required_optional_parameters}", ) - def test_inference_batch_consistent(self): - self._test_inference_batch_consistent() + def test_inference_batch_consistent(self, batch_sizes=[2, 4, 13]): + self._test_inference_batch_consistent(batch_sizes=batch_sizes) def _test_inference_batch_consistent( self, batch_sizes=[2, 4, 13], additional_params_copy_to_batched_inputs=["num_inference_steps"] @@ -235,11 +235,12 @@ def _test_inference_batch_consistent( logger.setLevel(level=diffusers.logging.WARNING) - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical() + def test_inference_batch_single_identical(self, batch_size=3): + self._test_inference_batch_single_identical(batch_size=batch_size) def _test_inference_batch_single_identical( self, + batch_size=3, test_max_difference=None, test_mean_pixel_difference=None, relax_max_difference=False, @@ -267,7 +268,7 @@ def _test_inference_batch_single_identical( # batchify inputs batched_inputs = {} - batch_size = 3 + batch_size = batch_size for name, value in inputs.items(): if name in self.batch_params: # prompt is string From 0a73b4d3cd1dd58e6470cfd7f1e10b7b81c63511 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 12 Apr 2023 18:18:30 +0200 Subject: [PATCH 02/71] [Post release] v0.16.0dev (#3072) --- examples/controlnet/train_controlnet.py | 2 +- examples/controlnet/train_controlnet_flax.py | 2 +- examples/dreambooth/train_dreambooth.py | 2 +- examples/dreambooth/train_dreambooth_flax.py | 2 +- examples/dreambooth/train_dreambooth_lora.py | 2 +- examples/instruct_pix2pix/train_instruct_pix2pix.py | 2 +- examples/text_to_image/train_text_to_image.py | 2 +- examples/text_to_image/train_text_to_image_flax.py | 2 +- examples/text_to_image/train_text_to_image_lora.py | 2 +- examples/textual_inversion/textual_inversion.py | 2 +- examples/textual_inversion/textual_inversion_flax.py | 2 +- examples/unconditional_image_generation/train_unconditional.py | 2 +- setup.py | 2 +- src/diffusers/__init__.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index 
30e43075d809..c0b52291fc9b 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -55,7 +55,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index f5ea3ce84bf3..67fe1b82d0dd 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -58,7 +58,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 141aafb85128..4f731aa1f776 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -56,7 +56,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__) diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py index 8c2faa7ec877..8583f64c6fbd 100644 --- a/examples/dreambooth/train_dreambooth_flax.py +++ b/examples/dreambooth/train_dreambooth_flax.py @@ -36,7 +36,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") # Cache compiled models across invocations of this script. cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache")) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index a117bd394895..d360939c8c0c 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -53,7 +53,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__) diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index b542d01c112a..155c370614dc 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -51,7 +51,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index fde762814b54..4bbf4706f01c 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -50,7 +50,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index cdfc546a8f58..41a02d68f2b1 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -33,7 +33,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index a50ca222a4a0..8dfd96904bd0 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -47,7 +47,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index aebc524bbb36..e157e629df64 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -77,7 +77,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__) diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py index 513548d947a0..1d77753791f9 100644 --- a/examples/textual_inversion/textual_inversion_flax.py +++ b/examples/textual_inversion/textual_inversion_flax.py @@ -56,7 +56,7 @@ # ------------------------------------------------------------------------------ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index f38e908fcef6..c004acc2d850 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -28,7 +28,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/setup.py b/setup.py index da75dd1e2a85..19cc1dca73bb 100644 --- a/setup.py +++ b/setup.py @@ -226,7 +226,7 @@ def run(self): setup( name="diffusers", - version="0.15.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="0.16.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="Diffusers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index c7d850d65953..07c17100e0e0 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.15.0" +__version__ = "0.16.0.dev0" from .configuration_utils import ConfigMixin from .utils import ( From d06e06940b8af3567958935bf49d4c42768110c8 Mon Sep 17 00:00:00 2001 From: Andreas Steiner Date: Wed, 12 Apr 2023 20:29:18 +0200 Subject: [PATCH 03/71] Adds profiling flags, computes train metrics average. (#3053) * WIP controlnet training - bugfix --streaming - bugfix running report_to!='wandb' - adds memory profile before validation * Adds final logging statement. * Sets train epochs to 11. Looking at a longer ~16ep run, we see only good validation images after ~11ep: https://wandb.ai/andsteing/controlnet_fill50k/runs/3j2hx6n8 * Removes --logging_dir (it's not used). * Adds --profile flags. * Updates --output_dir=runs/fill-circle-{timestamp}. * Compute mean of `train_metrics`. Previously `train_metrics[-1]` was logged, resulting in very bumpy train metrics. * Improves logging a bit. - adds l2_grads gradient norm logging - adds steps_per_sec - sets walltime as x coordinate of train/step - logs controlnet_params config * Adds --ccache (doesn't really help though). * minor fix in controlnet flax example (#2986) * fix the error when push_to_hub but not log validation * contronet_from_pt & controlnet_revision * add intermediate checkpointing to the guide * Bugfix --profile_steps * Sets `RACKER_PROJECT_NAME='controlnet_fill50k'`. * Logs fractional epoch. * Adds relative `walltime` metric. * Adds `StepTraceAnnotation` and uses `global_step` insetad of `step`. * Applied `black`. * Streamlines commands in README a bit. * Removes `--ccache`. This makes only a very small difference (~1 min) with this model size, so removing the option introduced in cdb3cc. * Re-ran `black`. * Update examples/controlnet/README.md Co-authored-by: Sayak Paul * Converts spaces to tab. * Removes repeated args. * Skips first step (compilation) in profiling * Updates README with profiling instructions. * Unifies tabs/spaces in README. * Re-ran style & quality. 
--------- Co-authored-by: Sayak Paul --- examples/controlnet/README.md | 74 ++++++++++------ examples/controlnet/train_controlnet_flax.py | 90 +++++++++++++++----- 2 files changed, 119 insertions(+), 45 deletions(-) diff --git a/examples/controlnet/README.md b/examples/controlnet/README.md index 4b388d92a195..387755624729 100644 --- a/examples/controlnet/README.md +++ b/examples/controlnet/README.md @@ -284,9 +284,9 @@ TPU_TYPE=v4-8 VM_NAME=hg_flax gcloud alpha compute tpus tpu-vm create $VM_NAME \ - --zone $ZONE \ - --accelerator-type $TPU_TYPE \ - --version tpu-vm-v4-base + --zone $ZONE \ + --accelerator-type $TPU_TYPE \ + --version tpu-vm-v4-base gcloud alpha compute tpus tpu-vm ssh $VM_NAME --zone $ZONE -- \ ``` @@ -326,6 +326,7 @@ If you want to use Weights and Biases logging, you should also install `wandb` n pip install wandb ``` + Now let's downloading two conditioning images that we will use to run validation during the training in order to track our progress ``` @@ -343,8 +344,8 @@ Make sure you have the `MODEL_DIR`,`OUTPUT_DIR` and `HUB_MODEL_ID` environment v ```bash export MODEL_DIR="runwayml/stable-diffusion-v1-5" -export OUTPUT_DIR="control_out" -export HUB_MODEL_ID="fill-circle-controlnet" +export OUTPUT_DIR="runs/fill-circle-{timestamp}" +export HUB_MODEL_ID="controlnet-fill-circle" ``` And finally start the training @@ -363,32 +364,36 @@ python3 train_controlnet_flax.py \ --revision="non-ema" \ --from_pt \ --report_to="wandb" \ - --max_train_steps=10000 \ + --tracker_project_name=$HUB_MODEL_ID \ + --num_train_epochs=11 \ --push_to_hub \ --hub_model_id=$HUB_MODEL_ID ``` Since we passed the `--push_to_hub` flag, it will automatically create a model repo under your huggingface account based on `$HUB_MODEL_ID`. By the end of training, the final checkpoint will be automatically stored on the hub. You can find an example model repo [here](https://huggingface.co/YiYiXu/fill-circle-controlnet). -Our training script also provides limited support for streaming large datasets from the Hugging Face Hub. In order to enable streaming, one must also set `--max_train_samples`. Here is an example command: +Our training script also provides limited support for streaming large datasets from the Hugging Face Hub. In order to enable streaming, one must also set `--max_train_samples`. 
Here is an example command (from [this blog article](https://huggingface.co/blog/train-your-controlnet)): ```bash +export MODEL_DIR="runwayml/stable-diffusion-v1-5" +export OUTPUT_DIR="runs/uncanny-faces-{timestamp}" +export HUB_MODEL_ID="controlnet-uncanny-faces" + python3 train_controlnet_flax.py \ - --pretrained_model_name_or_path=$MODEL_DIR \ - --output_dir=$OUTPUT_DIR \ - --dataset_name=multimodalart/facesyntheticsspigacaptioned \ - --streaming \ - --conditioning_image_column=spiga_seg \ - --image_column=image \ - --caption_column=image_caption \ - --resolution=512 \ - --max_train_samples 50 \ - --max_train_steps 5 \ - --learning_rate=1e-5 \ - --validation_steps=2 \ - --train_batch_size=1 \ - --revision="flax" \ - --report_to="wandb" + --pretrained_model_name_or_path=$MODEL_DIR \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=multimodalart/facesyntheticsspigacaptioned \ + --streaming \ + --conditioning_image_column=spiga_seg \ + --image_column=image \ + --caption_column=image_caption \ + --resolution=512 \ + --max_train_samples 100000 \ + --learning_rate=1e-5 \ + --train_batch_size=1 \ + --revision="flax" \ + --report_to="wandb" \ + --tracker_project_name=$HUB_MODEL_ID ``` Note, however, that the performance of the TPUs might get bottlenecked as streaming with `datasets` is not optimized for images. For ensuring maximum throughput, we encourage you to explore the following options: @@ -400,16 +405,35 @@ Note, however, that the performance of the TPUs might get bottlenecked as stream When work with a larger dataset, you may need to run training process for a long time and it’s useful to save regular checkpoints during the process. You can use the following argument to enable intermediate checkpointing: ```bash - --checkpointing_steps=500 + --checkpointing_steps=500 ``` This will save the trained model in subfolders of your output_dir. Subfolder names is the number of steps performed so far; for example: a checkpoint saved after 500 training steps would be saved in a subfolder named 500 You can then start your training from this saved checkpoint with ```bash - --controlnet_model_name_or_path="./control_out/500" + --controlnet_model_name_or_path="./control_out/500" ``` We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence by rebalancing the loss. To use it, one needs to set the `--snr_gamma` argument. The recommended value when using it is `5.0`. -We also support gradient accumulation - it is a technique that lets you use a bigger batch size than your machine would normally be able to fit into memory. You can use `gradient_accumulation_steps` argument to set gradient accumulation steps. The ControlNet author recommends using gradient accumulation to achieve better convergence. Read more [here](https://github.com/lllyasviel/ControlNet/blob/main/docs/train.md#more-consideration-sudden-converge-phenomenon-and-gradient-accumulation). \ No newline at end of file +We also support gradient accumulation - it is a technique that lets you use a bigger batch size than your machine would normally be able to fit into memory. You can use `gradient_accumulation_steps` argument to set gradient accumulation steps. The ControlNet author recommends using gradient accumulation to achieve better convergence. 
Read more [here](https://github.com/lllyasviel/ControlNet/blob/main/docs/train.md#more-consideration-sudden-converge-phenomenon-and-gradient-accumulation). + +You can **profile your code** with: + +```bash + --profile_steps==5 +``` + +Refer to the [JAX documentation on profiling](https://jax.readthedocs.io/en/latest/profiling.html). To inspect the profile trace, you'll have to install and start Tensorboard with the profile plugin: + +```bash +pip install tensorflow tensorboard-plugin-profile +tensorboard --logdir runs/fill-circle-100steps-20230411_165612/ +``` + +The profile can then be inspected at http://localhost:6006/#profile + +Sometimes you'll get version conflicts (error messages like `Duplicate plugins for name projector`), which means that you have to uninstall and reinstall all versions of Tensorflow/Tensorboard (e.g. with `pip uninstall tensorflow tf-nightly tensorboard tb-nightly tensorboard-plugin-profile && pip install tf-nightly tbp-nightly tensorboard-plugin-profile`). + +Note that the debugging functionality of the Tensorboard `profile` plugin is still under active development. Not all views are fully functional, and for example the `trace_viewer` cuts off events after 1M (which can result in all your device traces getting lost if you for example profile the compilation step by accident). diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index 67fe1b82d0dd..0b413ace09d2 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -18,6 +18,7 @@ import math import os import random +import time from pathlib import Path import jax @@ -220,6 +221,28 @@ def parse_args(): default=None, help="Revision of controlnet model identifier from huggingface.co/models.", ) + parser.add_argument( + "--profile_steps", + type=int, + default=0, + help="How many training steps to profile in the beginning.", + ) + parser.add_argument( + "--profile_validation", + action="store_true", + help="Whether to profile the (last) validation.", + ) + parser.add_argument( + "--profile_memory", + action="store_true", + help="Whether to dump an initial (before training loop) and a final (at program end) memory profile.", + ) + parser.add_argument( + "--ccache", + type=str, + default=None, + help="Enables compilation cache.", + ) parser.add_argument( "--controlnet_from_pt", action="store_true", @@ -234,8 +257,9 @@ def parse_args(): parser.add_argument( "--output_dir", type=str, - default="controlnet-model", - help="The output directory where the model predictions and checkpoints will be written.", + default="runs/{timestamp}", + help="The output directory where the model predictions and checkpoints will be written. " + "Can contain placeholders: {timestamp}.", ) parser.add_argument( "--cache_dir", @@ -317,15 +341,6 @@ def parse_args(): default=None, help="The name of the repository to keep in sync with the local `output_dir`.", ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." 
- ), - ) parser.add_argument( "--logging_steps", type=int, @@ -459,6 +474,8 @@ def parse_args(): parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") args = parser.parse_args() + args.output_dir = args.output_dir.replace("{timestamp}", time.strftime("%Y%m%d_%H%M%S")) + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: args.local_rank = env_local_rank @@ -952,6 +969,11 @@ def cumul_grad_step(grad_idx, loss_grad_rng): metrics = {"loss": loss} metrics = jax.lax.pmean(metrics, axis_name="batch") + def l2(xs): + return jnp.sqrt(sum([jnp.vdot(x, x) for x in jax.tree_util.tree_leaves(xs)])) + + metrics["l2_grads"] = l2(jax.tree_util.tree_leaves(grad)) + return new_state, metrics, new_train_rng # Create parallel version of the train step @@ -983,32 +1005,38 @@ def cumul_grad_step(grad_idx, loss_grad_rng): logger.info(f" Total train batch size (w. parallel & distributed) = {total_train_batch_size}") logger.info(f" Total optimization steps = {args.num_train_epochs * num_update_steps_per_epoch}") - if jax.process_index() == 0: + if jax.process_index() == 0 and args.report_to == "wandb": wandb.define_metric("*", step_metric="train/step") + wandb.define_metric("train/step", step_metric="walltime") wandb.config.update( { "num_train_examples": args.max_train_samples if args.streaming else len(train_dataset), "total_train_batch_size": total_train_batch_size, "total_optimization_step": args.num_train_epochs * num_update_steps_per_epoch, "num_devices": jax.device_count(), + "controlnet_params": sum(np.prod(x.shape) for x in jax.tree_util.tree_leaves(state.params)), } ) - global_step = 0 + global_step = step0 = 0 epochs = tqdm( range(args.num_train_epochs), desc="Epoch ... 
", position=0, disable=jax.process_index() > 0, ) + if args.profile_memory: + jax.profiler.save_device_memory_profile(os.path.join(args.output_dir, "memory_initial.prof")) + t00 = t0 = time.monotonic() for epoch in epochs: # ======================== Training ================================ train_metrics = [] + train_metric = None steps_per_epoch = ( args.max_train_samples // total_train_batch_size - if args.streaming + if args.streaming or args.max_train_samples else len(train_dataset) // total_train_batch_size ) train_step_progress_bar = tqdm( @@ -1020,10 +1048,18 @@ def cumul_grad_step(grad_idx, loss_grad_rng): ) # train for batch in train_dataloader: + if args.profile_steps and global_step == 1: + train_metric["loss"].block_until_ready() + jax.profiler.start_trace(args.output_dir) + if args.profile_steps and global_step == 1 + args.profile_steps: + train_metric["loss"].block_until_ready() + jax.profiler.stop_trace() + batch = shard(batch) - state, train_metric, train_rngs = p_train_step( - state, unet_params, text_encoder_params, vae_params, batch, train_rngs - ) + with jax.profiler.StepTraceAnnotation("train", step_num=global_step): + state, train_metric, train_rngs = p_train_step( + state, unet_params, text_encoder_params, vae_params, batch, train_rngs + ) train_metrics.append(train_metric) train_step_progress_bar.update(1) @@ -1041,13 +1077,19 @@ def cumul_grad_step(grad_idx, loss_grad_rng): if global_step % args.logging_steps == 0 and jax.process_index() == 0: if args.report_to == "wandb": + train_metrics = jax_utils.unreplicate(train_metrics) + train_metrics = jax.tree_util.tree_map(lambda *m: jnp.array(m).mean(), *train_metrics) wandb.log( { + "walltime": time.monotonic() - t00, "train/step": global_step, - "train/epoch": epoch, - "train/loss": jax_utils.unreplicate(train_metric)["loss"], + "train/epoch": global_step / dataset_length, + "train/steps_per_sec": (global_step - step0) / (time.monotonic() - t0), + **{f"train/{k}": v for k, v in train_metrics.items()}, } ) + t0, step0 = time.monotonic(), global_step + train_metrics = [] if global_step % args.checkpointing_steps == 0 and jax.process_index() == 0: controlnet.save_pretrained( f"{args.output_dir}/{global_step}", @@ -1058,10 +1100,14 @@ def cumul_grad_step(grad_idx, loss_grad_rng): train_step_progress_bar.close() epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})") - # Create the pipeline using using the trained modules and save it. + # Final validation & store model. 
if jax.process_index() == 0: if args.validation_prompt is not None: + if args.profile_validation: + jax.profiler.start_trace(args.output_dir) image_logs = log_validation(controlnet, state.params, tokenizer, args, validation_rng, weight_dtype) + if args.profile_validation: + jax.profiler.stop_trace() else: image_logs = None @@ -1084,6 +1130,10 @@ def cumul_grad_step(grad_idx, loss_grad_rng): ignore_patterns=["step_*", "epoch_*"], ) + if args.profile_memory: + jax.profiler.save_device_memory_profile(os.path.join(args.output_dir, "memory_final.prof")) + logger.info("Finished training.") + if __name__ == "__main__": main() From 46c52f9b9607e6ecb29c782c052aea313e6487b7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 13 Apr 2023 00:25:10 +0200 Subject: [PATCH 04/71] [Pipelines] Make sure that None functions are correctly not saved (#3080) --- src/diffusers/pipelines/pipeline_utils.py | 25 +++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 2e20c21aaf38..72c4363da3c6 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -19,6 +19,7 @@ import inspect import os import re +import sys import warnings from dataclasses import dataclass from pathlib import Path @@ -540,11 +541,9 @@ def save_pretrained( variant (`str`, *optional*): If specified, weights are saved in the format pytorch_model..bin. """ - self.save_config(save_directory) - model_index_dict = dict(self.config) - model_index_dict.pop("_class_name") - model_index_dict.pop("_diffusers_version") + model_index_dict.pop("_class_name", None) + model_index_dict.pop("_diffusers_version", None) model_index_dict.pop("_module", None) expected_modules, optional_kwargs = self._get_signature_keys(self) @@ -557,7 +556,6 @@ def is_saveable_module(name, value): return True model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} - for pipeline_component_name in model_index_dict.keys(): sub_model = getattr(self, pipeline_component_name) model_cls = sub_model.__class__ @@ -571,7 +569,13 @@ def is_saveable_module(name, value): save_method_name = None # search for the model's base class in LOADABLE_CLASSES for library_name, library_classes in LOADABLE_CLASSES.items(): - library = importlib.import_module(library_name) + if library_name in sys.modules: + library = importlib.import_module(library_name) + else: + logger.info( + f"{library_name} is not installed. 
Cannot save {pipeline_component_name} as {library_classes} from {library_name}" + ) + for base_class, save_load_methods in library_classes.items(): class_candidate = getattr(library, base_class, None) if class_candidate is not None and issubclass(model_cls, class_candidate): @@ -581,6 +585,12 @@ def is_saveable_module(name, value): if save_method_name is not None: break + if save_method_name is None: + logger.warn(f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved.") + # make sure that unsaveable components are not tried to be loaded afterward + self.register_to_config(**{pipeline_component_name: (None, None)}) + continue + save_method = getattr(sub_model, save_method_name) # Call the save method with the argument safe_serialization only if it's supported @@ -596,6 +606,9 @@ def is_saveable_module(name, value): save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs) + # finally save the config + self.save_config(save_directory) + def to( self, torch_device: Optional[Union[str, torch.device]] = None, From e748b3c6e163ce9a61965eb456704a83b855ccc3 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 12 Apr 2023 21:45:23 -1000 Subject: [PATCH 05/71] doc string example remove from_pt (#3083) --- .../pipeline_flax_stable_diffusion_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py index df3e79a194f8..7035242a0cda 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py @@ -83,7 +83,7 @@ ... "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.float32 ... ) >>> pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( - ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, from_pt=True, dtype=jnp.float32 + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, revision="flax", dtype=jnp.float32 ... 
) >>> params["controlnet"] = controlnet_params From 3a9d7d97588a1bbc906d8a17be77cf382492a7b6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 13 Apr 2023 14:32:57 +0200 Subject: [PATCH 06/71] [Tests] parallelize (#3078) * [Tests] parallelize * finish folder structuring * Parallelize tests more * Correct saving of pipelines * make sure logging level is correct * try again * Apply suggestions from code review Co-authored-by: Pedro Cuenca --------- Co-authored-by: Pedro Cuenca --- .github/workflows/pr_tests.yml | 33 +++++++++++++------ tests/{ => models}/test_layers_utils.py | 0 tests/{ => models}/test_lora_layers.py | 0 tests/{ => models}/test_modeling_common.py | 0 .../{ => models}/test_modeling_common_flax.py | 0 tests/models/test_models_unet_1d.py | 2 +- tests/models/test_models_unet_2d.py | 2 +- tests/models/test_models_unet_2d_condition.py | 2 +- tests/models/test_models_unet_3d_condition.py | 2 +- tests/models/test_models_vae.py | 2 +- tests/models/test_models_vae_flax.py | 2 +- tests/models/test_models_vq.py | 2 +- tests/{ => models}/test_unet_2d_blocks.py | 0 tests/{ => models}/test_unet_blocks_common.py | 0 .../test_check_copies.py | 0 .../test_check_dummies.py | 0 tests/{ => others}/test_config.py | 12 +++++++ tests/{ => others}/test_ema.py | 0 tests/{ => others}/test_hub_utils.py | 0 tests/{ => others}/test_image_processor.py | 0 tests/{ => others}/test_outputs.py | 0 tests/{ => others}/test_training.py | 0 tests/{ => others}/test_utils.py | 2 +- .../altdiffusion/test_alt_diffusion.py | 4 +-- tests/pipelines/audioldm/test_audioldm.py | 4 +-- .../dance_diffusion/test_dance_diffusion.py | 4 +-- tests/pipelines/ddim/test_ddim.py | 4 +-- tests/pipelines/dit/test_dit.py | 4 +-- .../latent_diffusion/test_latent_diffusion.py | 4 +-- .../paint_by_example/test_paint_by_example.py | 4 +-- tests/{ => pipelines}/pipeline_params.py | 0 tests/pipelines/repaint/test_repaint.py | 4 +-- .../test_spectrogram_diffusion.py | 4 +-- .../stable_diffusion/test_cycle_diffusion.py | 4 +-- .../test_onnx_stable_diffusion.py | 2 +- .../test_onnx_stable_diffusion_img2img.py | 2 +- .../test_onnx_stable_diffusion_inpaint.py | 2 +- .../test_onnx_stable_diffusion_upscale.py | 2 +- .../stable_diffusion/test_stable_diffusion.py | 4 +-- .../test_stable_diffusion_controlnet.py | 4 +-- .../test_stable_diffusion_image_variation.py | 4 +-- .../test_stable_diffusion_img2img.py | 4 +-- .../test_stable_diffusion_inpaint.py | 4 +-- ...st_stable_diffusion_instruction_pix2pix.py | 4 +-- .../test_stable_diffusion_model_editing.py | 4 +-- .../test_stable_diffusion_panorama.py | 4 +-- .../test_stable_diffusion_pix2pix_zero.py | 4 +-- .../test_stable_diffusion_sag.py | 4 +-- .../test_stable_diffusion.py | 4 +-- ...test_stable_diffusion_attend_and_excite.py | 4 +-- .../test_stable_diffusion_depth.py | 4 +-- .../test_stable_diffusion_inpaint.py | 4 +-- .../test_stable_diffusion_latent_upscale.py | 4 +-- .../stable_unclip/test_stable_unclip.py | 4 +-- .../test_stable_unclip_img2img.py | 4 +-- tests/{ => pipelines}/test_pipelines.py | 0 .../{ => pipelines}/test_pipelines_common.py | 2 +- tests/{ => pipelines}/test_pipelines_flax.py | 0 .../test_pipelines_onnx_common.py | 0 .../text_to_video/test_text_to_video.py | 4 +-- .../text_to_video/test_text_to_video_zero.py | 2 +- tests/pipelines/unclip/test_unclip.py | 4 +-- .../unclip/test_unclip_image_variation.py | 4 +-- 63 files changed, 109 insertions(+), 84 deletions(-) rename tests/{ => models}/test_layers_utils.py (100%) rename tests/{ => models}/test_lora_layers.py (100%) 
rename tests/{ => models}/test_modeling_common.py (100%) rename tests/{ => models}/test_modeling_common_flax.py (100%) rename tests/{ => models}/test_unet_2d_blocks.py (100%) rename tests/{ => models}/test_unet_blocks_common.py (100%) rename tests/{repo_utils => others}/test_check_copies.py (100%) rename tests/{repo_utils => others}/test_check_dummies.py (100%) rename tests/{ => others}/test_config.py (95%) rename tests/{ => others}/test_ema.py (100%) rename tests/{ => others}/test_hub_utils.py (100%) rename tests/{ => others}/test_image_processor.py (100%) rename tests/{ => others}/test_outputs.py (100%) rename tests/{ => others}/test_training.py (100%) rename tests/{ => others}/test_utils.py (98%) rename tests/{ => pipelines}/pipeline_params.py (100%) rename tests/{ => pipelines}/test_pipelines.py (100%) rename tests/{ => pipelines}/test_pipelines_common.py (99%) rename tests/{ => pipelines}/test_pipelines_flax.py (100%) rename tests/{ => pipelines}/test_pipelines_onnx_common.py (100%) diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 3d5fd84ad949..d06b576fa631 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -21,22 +21,27 @@ jobs: fail-fast: false matrix: config: - - name: Fast PyTorch CPU tests on Ubuntu - framework: pytorch + - name: Fast PyTorch Pipeline CPU tests + framework: pytorch_pipelines runner: docker-cpu image: diffusers/diffusers-pytorch-cpu - report: torch_cpu - - name: Fast Flax CPU tests on Ubuntu + report: torch_cpu_pipelines + - name: Fast PyTorch Models & Schedulers CPU tests + framework: pytorch_models + runner: docker-cpu + image: diffusers/diffusers-pytorch-cpu + report: torch_cpu_models_schedulers + - name: Fast Flax CPU tests framework: flax runner: docker-cpu image: diffusers/diffusers-flax-cpu report: flax_cpu - - name: Fast ONNXRuntime CPU tests on Ubuntu + - name: Fast ONNXRuntime CPU tests framework: onnxruntime runner: docker-cpu image: diffusers/diffusers-onnxruntime-cpu report: onnx_cpu - - name: PyTorch Example CPU tests on Ubuntu + - name: PyTorch Example CPU tests framework: pytorch_examples runner: docker-cpu image: diffusers/diffusers-pytorch-cpu @@ -71,13 +76,21 @@ jobs: run: | python utils/print_env.py - - name: Run fast PyTorch CPU tests - if: ${{ matrix.config.framework == 'pytorch' }} + - name: Run fast PyTorch Pipeline CPU tests + if: ${{ matrix.config.framework == 'pytorch_pipelines' }} run: | python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ - tests/ + tests/pipelines + + - name: Run fast PyTorch Model Scheduler CPU tests + if: ${{ matrix.config.framework == 'pytorch_models' }} + run: | + python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + -s -v -k "not Flax and not Onnx" \ + --make-reports=tests_${{ matrix.config.report }} \ + tests/models tests/schedulers tests/others - name: Run fast Flax TPU tests if: ${{ matrix.config.framework == 'flax' }} @@ -85,7 +98,7 @@ jobs: python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Flax" \ --make-reports=tests_${{ matrix.config.report }} \ - tests/ + tests - name: Run fast ONNXRuntime CPU tests if: ${{ matrix.config.framework == 'onnxruntime' }} diff --git a/tests/test_layers_utils.py b/tests/models/test_layers_utils.py similarity index 100% rename from tests/test_layers_utils.py rename to tests/models/test_layers_utils.py diff --git a/tests/test_lora_layers.py b/tests/models/test_lora_layers.py similarity 
index 100% rename from tests/test_lora_layers.py rename to tests/models/test_lora_layers.py diff --git a/tests/test_modeling_common.py b/tests/models/test_modeling_common.py similarity index 100% rename from tests/test_modeling_common.py rename to tests/models/test_modeling_common.py diff --git a/tests/test_modeling_common_flax.py b/tests/models/test_modeling_common_flax.py similarity index 100% rename from tests/test_modeling_common_flax.py rename to tests/models/test_modeling_common_flax.py diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py index d3a3d5cfc9a0..f954d876fa76 100644 --- a/tests/models/test_models_unet_1d.py +++ b/tests/models/test_models_unet_1d.py @@ -20,7 +20,7 @@ from diffusers import UNet1DModel from diffusers.utils import floats_tensor, slow, torch_device -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py index 8f831fcf7cbf..c20b0ef7d0a4 100644 --- a/tests/models/test_models_unet_2d.py +++ b/tests/models/test_models_unet_2d.py @@ -22,7 +22,7 @@ from diffusers import UNet2DModel from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin logger = logging.get_logger(__name__) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 17e08e0a426e..15f77fb8c106 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -34,7 +34,7 @@ ) from diffusers.utils.import_utils import is_xformers_available -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin logger = logging.get_logger(__name__) diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index c552b503af05..f245045bb3bb 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -30,7 +30,7 @@ ) from diffusers.utils.import_utils import is_xformers_available -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin logger = logging.get_logger(__name__) diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py index abd4a078e692..fe0041850bb4 100644 --- a/tests/models/test_models_vae.py +++ b/tests/models/test_models_vae.py @@ -22,7 +22,7 @@ from diffusers import AutoencoderKL from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/models/test_models_vae_flax.py b/tests/models/test_models_vae_flax.py index 8fedb85eccfc..e5c56b61a5a4 100644 --- a/tests/models/test_models_vae_flax.py +++ b/tests/models/test_models_vae_flax.py @@ -4,7 +4,7 @@ from diffusers.utils import is_flax_available from diffusers.utils.testing_utils import require_flax -from ..test_modeling_common_flax import FlaxModelTesterMixin +from .test_modeling_common_flax import FlaxModelTesterMixin if is_flax_available(): diff --git a/tests/models/test_models_vq.py b/tests/models/test_models_vq.py index 66c33e07371e..015d2abfc6fa 100644 --- 
a/tests/models/test_models_vq.py +++ b/tests/models/test_models_vq.py @@ -20,7 +20,7 @@ from diffusers import VQModel from diffusers.utils import floats_tensor, torch_device -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/test_unet_2d_blocks.py b/tests/models/test_unet_2d_blocks.py similarity index 100% rename from tests/test_unet_2d_blocks.py rename to tests/models/test_unet_2d_blocks.py diff --git a/tests/test_unet_blocks_common.py b/tests/models/test_unet_blocks_common.py similarity index 100% rename from tests/test_unet_blocks_common.py rename to tests/models/test_unet_blocks_common.py diff --git a/tests/repo_utils/test_check_copies.py b/tests/others/test_check_copies.py similarity index 100% rename from tests/repo_utils/test_check_copies.py rename to tests/others/test_check_copies.py diff --git a/tests/repo_utils/test_check_dummies.py b/tests/others/test_check_dummies.py similarity index 100% rename from tests/repo_utils/test_check_dummies.py rename to tests/others/test_check_dummies.py diff --git a/tests/test_config.py b/tests/others/test_config.py similarity index 95% rename from tests/test_config.py rename to tests/others/test_config.py index 95b0cdf9a597..a29190c199ca 100644 --- a/tests/test_config.py +++ b/tests/others/test_config.py @@ -141,6 +141,8 @@ def test_save_load(self): def test_load_ddim_from_pndm(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: ddim = DDIMScheduler.from_pretrained( @@ -153,6 +155,8 @@ def test_load_ddim_from_pndm(self): def test_load_euler_from_pndm(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: euler = EulerDiscreteScheduler.from_pretrained( @@ -165,6 +169,8 @@ def test_load_euler_from_pndm(self): def test_load_euler_ancestral_from_pndm(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: euler = EulerAncestralDiscreteScheduler.from_pretrained( @@ -177,6 +183,8 @@ def test_load_euler_ancestral_from_pndm(self): def test_load_pndm(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: pndm = PNDMScheduler.from_pretrained( @@ -189,6 +197,8 @@ def test_load_pndm(self): def test_overwrite_config_on_load(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: ddpm = DDPMScheduler.from_pretrained( @@ -212,6 +222,8 @@ def test_overwrite_config_on_load(self): def test_load_dpmsolver(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: dpm = DPMSolverMultistepScheduler.from_pretrained( diff --git a/tests/test_ema.py b/tests/others/test_ema.py similarity index 100% rename from tests/test_ema.py rename to tests/others/test_ema.py diff --git a/tests/test_hub_utils.py b/tests/others/test_hub_utils.py similarity index 100% rename from tests/test_hub_utils.py rename to tests/others/test_hub_utils.py diff --git a/tests/test_image_processor.py b/tests/others/test_image_processor.py similarity index 100% rename from tests/test_image_processor.py rename to 
tests/others/test_image_processor.py diff --git a/tests/test_outputs.py b/tests/others/test_outputs.py similarity index 100% rename from tests/test_outputs.py rename to tests/others/test_outputs.py diff --git a/tests/test_training.py b/tests/others/test_training.py similarity index 100% rename from tests/test_training.py rename to tests/others/test_training.py diff --git a/tests/test_utils.py b/tests/others/test_utils.py similarity index 98% rename from tests/test_utils.py rename to tests/others/test_utils.py index 4fc4e1a06638..6e7cc095f8df 100755 --- a/tests/test_utils.py +++ b/tests/others/test_utils.py @@ -167,4 +167,4 @@ def test_deprecate_stacklevel(self): with self.assertWarns(FutureWarning) as warning: deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False) assert str(warning.warning) == "This message is better!!!" - assert "diffusers/tests/test_utils.py" in warning.filename + assert "diffusers/tests/others/test_utils.py" in warning.filename diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index faa56e18f748..4d19621f0c2c 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -28,8 +28,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 10de5440eb00..ec72108fafc9 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -38,8 +38,8 @@ ) from diffusers.utils import slow, torch_device -from ...pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS +from ..test_pipelines_common import PipelineTesterMixin class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py index bbd4aa694b76..5db90a3aa740 100644 --- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py @@ -23,8 +23,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py index 4d2c4e490d63..319bd778e3b2 100644 --- a/tests/pipelines/ddim/test_ddim.py +++ b/tests/pipelines/ddim/test_ddim.py @@ -21,8 +21,8 @@ from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel from diffusers.utils.testing_utils import require_torch_gpu, slow, 
torch_device -from ...pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/dit/test_dit.py b/tests/pipelines/dit/test_dit.py index 947fd3cbf43d..d8098178f339 100644 --- a/tests/pipelines/dit/test_dit.py +++ b/tests/pipelines/dit/test_dit.py @@ -23,11 +23,11 @@ from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import ( +from ..pipeline_params import ( CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, ) -from ...test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/tests/pipelines/latent_diffusion/test_latent_diffusion.py index 2ff7feda6317..05ff4162e5c6 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion.py @@ -23,8 +23,8 @@ from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py index 14b045d6c480..17feba59e8e4 100644 --- a/tests/pipelines/paint_by_example/test_paint_by_example.py +++ b/tests/pipelines/paint_by_example/test_paint_by_example.py @@ -27,8 +27,8 @@ from diffusers.utils import floats_tensor, load_image, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipeline_params.py b/tests/pipelines/pipeline_params.py similarity index 100% rename from tests/pipeline_params.py rename to tests/pipelines/pipeline_params.py diff --git a/tests/pipelines/repaint/test_repaint.py b/tests/pipelines/repaint/test_repaint.py index 060e6c9161ba..4f98675bc5af 100644 --- a/tests/pipelines/repaint/test_repaint.py +++ b/tests/pipelines/repaint/test_repaint.py @@ -22,8 +22,8 @@ from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel from diffusers.utils.testing_utils import load_image, load_numpy, nightly, require_torch_gpu, skip_mps, torch_device -from ...pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import 
IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 594d7c598f75..3b64ea2d2fc1 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -24,8 +24,8 @@ from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime -from ...pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py index 5282cfd8dd24..05b72ab6a0fd 100644 --- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py @@ -25,8 +25,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py index 3a5f9379ae50..6c90f0526662 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py @@ -29,7 +29,7 @@ ) from diffusers.utils.testing_utils import is_onnx_available, nightly, require_onnxruntime, require_torch_gpu -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin +from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin if is_onnx_available(): diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py index e1aa2f6dc0a1..9147dc461fc5 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py @@ -35,7 +35,7 @@ require_torch_gpu, ) -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin +from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin if is_onnx_available(): diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py index 16287d64d154..6004067887ea 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py @@ -26,7 +26,7 @@ require_torch_gpu, ) -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin +from ..test_pipelines_onnx_common import 
OnnxPipelineTesterMixin if is_onnx_available(): diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py index d1527a42a1e5..a124c3de60ca 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py @@ -36,7 +36,7 @@ require_torch_gpu, ) -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin +from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin if is_onnx_available(): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 79796afdf597..14421a64b9e8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -40,8 +40,8 @@ from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu from ...models.test_models_unet_2d_condition import create_lora_layers -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index d556e6318f43..d7c5e2b0323a 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -33,8 +33,8 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index 2a07ab64a36d..3bfa5810428a 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -32,8 +32,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 69b92f685f25..127b1c216549 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -34,8 +34,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from 
diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 3553679e0ef6..290d9b0a9134 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -34,8 +34,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index 78e697fbbac3..8915f524d972 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -34,8 +34,8 @@ from diffusers.utils import floats_tensor, load_image, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index 1e11500c72b1..bafad63ec2db 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -31,8 +31,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 752ed6e969c3..3ead4fe55bab 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -32,8 +32,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import 
TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 59c45d603b91..0809a91041ce 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -33,8 +33,8 @@ from diffusers.utils import load_numpy, slow, torch_device from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index abaefbcad011..73859bdbf7d8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -29,8 +29,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 7b607c8fdd36..623dbde99469 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -35,8 +35,8 @@ from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index 90bb1461d351..f153ae08cbb6 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -29,8 +29,8 @@ from diffusers.utils import load_numpy, skip_mps, slow from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin @skip_mps diff --git 
a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 6b0205f3faeb..7a5e02a42af4 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -51,8 +51,8 @@ ) from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index ee059314904f..2fa8b9045f43 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -26,8 +26,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device from diffusers.utils.testing_utils import require_torch_gpu, slow -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index 38f4b053714b..aff1c1cdbde9 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -31,8 +31,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py index 368ab21f24a9..891323d22fe0 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -15,8 +15,8 @@ from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference class StableUnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git 
a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index 907853394040..69e3225ced52 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -27,8 +27,8 @@ torch_device, ) -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import ( +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import ( PipelineTesterMixin, assert_mean_pixel_difference, ) diff --git a/tests/test_pipelines.py b/tests/pipelines/test_pipelines.py similarity index 100% rename from tests/test_pipelines.py rename to tests/pipelines/test_pipelines.py diff --git a/tests/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py similarity index 99% rename from tests/test_pipelines_common.py rename to tests/pipelines/test_pipelines_common.py index 981bc9061ef9..d0712bdec8f6 100644 --- a/tests/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -85,7 +85,7 @@ def params(self) -> frozenset: raise NotImplementedError( "You need to set the attribute `params` in the child test class. " "`params` are checked for if all values are present in `__call__`'s signature." - " You can set `params` using one of the common set of parameters defined in`pipeline_params.py`" + " You can set `params` using one of the common set of parameters defined in `pipeline_params.py`" " e.g., `TEXT_TO_IMAGE_PARAMS` defines the common parameters used in text to " "image pipelines, including prompts and prompt embedding overrides." "If your pipeline's set of arguments has minor changes from one of the common sets of arguments, " diff --git a/tests/test_pipelines_flax.py b/tests/pipelines/test_pipelines_flax.py similarity index 100% rename from tests/test_pipelines_flax.py rename to tests/pipelines/test_pipelines_flax.py diff --git a/tests/test_pipelines_onnx_common.py b/tests/pipelines/test_pipelines_onnx_common.py similarity index 100% rename from tests/test_pipelines_onnx_common.py rename to tests/pipelines/test_pipelines_onnx_common.py diff --git a/tests/pipelines/text_to_video/test_text_to_video.py b/tests/pipelines/text_to_video/test_text_to_video.py index 438e685a443c..b59653694616 100644 --- a/tests/pipelines/text_to_video/test_text_to_video.py +++ b/tests/pipelines/text_to_video/test_text_to_video.py @@ -28,8 +28,8 @@ ) from diffusers.utils import load_numpy, skip_mps, slow -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/text_to_video/test_text_to_video_zero.py b/tests/pipelines/text_to_video/test_text_to_video_zero.py index 45bb93fbd9c6..8fc7254c52d1 100644 --- a/tests/pipelines/text_to_video/test_text_to_video_zero.py +++ b/tests/pipelines/text_to_video/test_text_to_video_zero.py @@ -20,7 +20,7 @@ from diffusers import DDIMScheduler, TextToVideoZeroPipeline from diffusers.utils import load_pt, require_torch_gpu, slow -from ...test_pipelines_common import assert_mean_pixel_difference +from ..test_pipelines_common import assert_mean_pixel_difference @slow diff --git 
a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index c36fb02b190f..4df3e4d3828b 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -25,8 +25,8 @@ from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py index 3cacb0bcad0b..57d15559cc75 100644 --- a/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/tests/pipelines/unclip/test_unclip_image_variation.py @@ -39,8 +39,8 @@ from diffusers.utils import floats_tensor, load_numpy, slow, torch_device from diffusers.utils.testing_utils import load_image, require_torch_gpu, skip_mps -from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): From 3bf5ce21ad2fd39c0443f8f689e12761c0f67a0f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 13 Apr 2023 14:33:11 +0200 Subject: [PATCH 07/71] Throw deprecation warning for return_cached_folder (#3092) Throw deprecation warning --- src/diffusers/pipelines/pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 72c4363da3c6..c095da1665de 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1059,7 +1059,7 @@ def load_module(name, value): return_cached_folder = kwargs.pop("return_cached_folder", False) if return_cached_folder: message = f"Passing `return_cached_folder=True` is deprecated and will be removed in `diffusers=0.17.0`. Please do the following instead: \n 1. Load the cached_folder via `cached_folder={cls}.download({pretrained_model_name_or_path})`. \n 2. Load the pipeline by loading from the cached folder: `pipeline={cls}.from_pretrained(cached_folder)`." - deprecate("return_cached_folder", "0.17.0", message, take_from=kwargs) + deprecate("return_cached_folder", "0.17.0", message) return model, cached_folder return model From 3eaead0c4a55bf11bdf832eaa61d0e87fe5464df Mon Sep 17 00:00:00 2001 From: Joseph Coffland Date: Thu, 13 Apr 2023 08:54:16 -0700 Subject: [PATCH 08/71] Allow SD attend and excite pipeline to work with any size output images (#2835) Allow stable diffusion attend and excite pipeline to work with any size output image. 
Re: #2476, #2603 --- ...eline_stable_diffusion_attend_and_excite.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 35351bae7116..c81ed5b54f94 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -14,7 +14,7 @@ import inspect import math -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -76,7 +76,7 @@ def get_empty_store(): def __call__(self, attn, is_cross: bool, place_in_unet: str): if self.cur_att_layer >= 0 and is_cross: - if attn.shape[1] == self.attn_res**2: + if attn.shape[1] == np.prod(self.attn_res): self.step_store[place_in_unet].append(attn) self.cur_att_layer += 1 @@ -98,7 +98,7 @@ def aggregate_attention(self, from_where: List[str]) -> torch.Tensor: attention_maps = self.get_average_attention() for location in from_where: for item in attention_maps[location]: - cross_maps = item.reshape(-1, self.attn_res, self.attn_res, item.shape[-1]) + cross_maps = item.reshape(-1, self.attn_res[0], self.attn_res[1], item.shape[-1]) out.append(cross_maps) out = torch.cat(out, dim=0) out = out.sum(0) / out.shape[0] @@ -109,7 +109,7 @@ def reset(self): self.step_store = self.get_empty_store() self.attention_store = {} - def __init__(self, attn_res=16): + def __init__(self, attn_res): """ Initialize an empty AttentionStore :param step_index: used to visualize only a specific step in the diffusion process @@ -724,7 +724,7 @@ def __call__( max_iter_to_alter: int = 25, thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, scale_factor: int = 20, - attn_res: int = 16, + attn_res: Optional[Tuple[int]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -796,8 +796,8 @@ def __call__( Dictionary defining the iterations and desired thresholds to apply iterative latent refinement in. scale_factor (`int`, *optional*, default to 20): Scale factor that controls the step size of each Attend and Excite update. - attn_res (`int`, *optional*, default to 16): - The resolution of most semantic attention map. + attn_res (`tuple`, *optional*, default computed from width and height): + The 2D resolution of the semantic attention map. Examples: @@ -870,7 +870,9 @@ def __call__( # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - self.attention_store = AttentionStore(attn_res=attn_res) + if attn_res is None: + attn_res = int(np.ceil(width / 32)), int(np.ceil(height / 32)) + self.attention_store = AttentionStore(attn_res) self.register_attention_control() # default config for step size from original repo From d0f258206d9cdcfb8685447d18b6881aed63143e Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 13 Apr 2023 13:46:28 -0700 Subject: [PATCH 09/71] [docs] Update community pipeline docs (#2989) * update community pipeline docs * fix formatting * explain sharing workflows --- docs/source/en/_toctree.yml | 6 +- .../using-diffusers/contribute_pipeline.mdx | 166 ++++++++++-------- .../custom_pipeline_examples.mdx | 2 +- .../custom_pipeline_overview.mdx | 95 ++-------- 4 files changed, 106 insertions(+), 163 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d74bd3785343..df41854a9fe7 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -25,7 +25,7 @@ - local: using-diffusers/schedulers title: Load and compare different schedulers - local: using-diffusers/custom_pipeline_overview - title: Load and add custom pipelines + title: Load community pipelines - local: using-diffusers/kerascv title: Load KerasCV Stable Diffusion checkpoints title: Loading & Hub @@ -47,9 +47,9 @@ - local: using-diffusers/reproducibility title: Create reproducible pipelines - local: using-diffusers/custom_pipeline_examples - title: Community Pipelines + title: Community pipelines - local: using-diffusers/contribute_pipeline - title: How to contribute a Pipeline + title: How to contribute a community pipeline - local: using-diffusers/using_safetensors title: Using safetensors - local: using-diffusers/stable_diffusion_jax_how_to diff --git a/docs/source/en/using-diffusers/contribute_pipeline.mdx b/docs/source/en/using-diffusers/contribute_pipeline.mdx index 8ee6d6ae4fb1..2c2b5abedcec 100644 --- a/docs/source/en/using-diffusers/contribute_pipeline.mdx +++ b/docs/source/en/using-diffusers/contribute_pipeline.mdx @@ -10,30 +10,21 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# How to build a community pipeline +# How to contribute a community pipeline -*Note*: this page was built from the GitHub Issue on Community Pipelines [#841](https://github.com/huggingface/diffusers/issues/841). + -Let's make an example! -Say you want to define a pipeline that just does a single forward pass to a U-Net and then calls a scheduler only once (Note, this doesn't make any sense from a scientific point of view, but only represents an example of how things work under the hood). +💡 Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down. -Cool! So you open your favorite IDE and start creating your pipeline 💻. -First, what model weights and configurations do we need? -We have a U-Net and a scheduler, so our pipeline should take a U-Net and a scheduler as an argument. 
-Also, as stated above, you'd like to be able to load weights and the scheduler config for Hub and share your code with others, so we'll inherit from `DiffusionPipeline`: + -```python -from diffusers import DiffusionPipeline -import torch +Community pipelines allow you to add any additional features you'd like on top of the [`DiffusionPipeline`]. The main benefit of building on top of the `DiffusionPipeline` is anyone can load and use your pipeline by only adding one more argument, making it super easy for the community to access. +This guide will show you how to create a community pipeline and explain how they work. To keep things simple, you'll create a "one-step" pipeline where the `UNet` does a single forward pass and calls the scheduler once. -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() -``` +## Initialize the pipeline -Now, we must save the `unet` and `scheduler` in a config file so that you can save your pipeline with `save_pretrained`. -Therefore, make sure you add every component that is save-able to the `register_modules` function: +You should start by creating a `one_step_unet.py` file for your community pipeline. In this file, create a pipeline class that inherits from the [`DiffusionPipeline`] to be able to load model weights and the scheduler configuration from the Hub. The one-step pipeline needs a `UNet` and a scheduler, so you'll need to add these as arguments to the `__init__` function: ```python from diffusers import DiffusionPipeline @@ -43,39 +34,54 @@ import torch class UnetSchedulerOneForwardPipeline(DiffusionPipeline): def __init__(self, unet, scheduler): super().__init__() +``` + +To ensure your pipeline and its components (`unet` and `scheduler`) can be saved with [`~DiffusionPipeline.save_pretrained`], add them to the `register_modules` function: + +```diff + from diffusers import DiffusionPipeline + import torch + + class UnetSchedulerOneForwardPipeline(DiffusionPipeline): + def __init__(self, unet, scheduler): + super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) ++ self.register_modules(unet=unet, scheduler=scheduler) ``` -Cool, the init is done! 🔥 Now, let's go into the forward pass, which we recommend defining as `__call__` . Here you're given all the creative freedom there is. For our amazing "one-step" pipeline, we simply create a random image and call the unet once and the scheduler once: +Cool, the `__init__` step is done and you can move to the forward pass now! 🔥 -```python -from diffusers import DiffusionPipeline -import torch +## Define the forward pass +In the forward pass, which we recommend defining as `__call__`, you have complete creative freedom to add whatever feature you'd like. 
For our amazing one-step pipeline, create a random image and only call the `unet` and `scheduler` once by setting `timestep=1`: -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() +```diff + from diffusers import DiffusionPipeline + import torch + + + class UnetSchedulerOneForwardPipeline(DiffusionPipeline): + def __init__(self, unet, scheduler): + super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) + self.register_modules(unet=unet, scheduler=scheduler) - def __call__(self): - image = torch.randn( - (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), - ) - timestep = 1 ++ def __call__(self): ++ image = torch.randn( ++ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), ++ ) ++ timestep = 1 - model_output = self.unet(image, timestep).sample - scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample ++ model_output = self.unet(image, timestep).sample ++ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample - return scheduler_output ++ return scheduler_output ``` -Cool, that's it! 🚀 You can now run this pipeline by passing a `unet` and a `scheduler` to the init: +That's it! 🚀 You can now run this pipeline by passing a `unet` and `scheduler` to it: ```python -from diffusers import DDPMScheduler, Unet2DModel +from diffusers import DDPMScheduler, UNet2DModel scheduler = DDPMScheduler() unet = UNet2DModel() @@ -85,7 +91,7 @@ pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler) output = pipeline() ``` -But what's even better is that you can load pre-existing weights into the pipeline if they match exactly your pipeline structure. This is e.g. the case for [https://huggingface.co/google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32) so that we can do the following: +But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline: ```python pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32") @@ -93,63 +99,72 @@ pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10- output = pipeline() ``` -We want to share this amazing pipeline with the community, so we would open a PR request to add the following code under `one_step_unet.py` to [https://github.com/huggingface/diffusers/tree/main/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) . - -```python -from diffusers import DiffusionPipeline -import torch - +## Share your pipeline -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() +Open a Pull Request on the 🧨 Diffusers [repository](https://github.com/huggingface/diffusers) to add your awesome pipeline in `one_step_unet.py` to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder. 
- self.register_modules(unet=unet, scheduler=scheduler) +Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipeline magically 🪄 by specifying it in the `custom_pipeline` argument: - def __call__(self): - image = torch.randn( - (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), - ) - timestep = 1 - - model_output = self.unet(image, timestep).sample - scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample +```python +from diffusers import DiffusionPipeline - return scheduler_output +pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet") +pipe() ``` -Our amazing pipeline got merged here: [#840](https://github.com/huggingface/diffusers/pull/840). -Now everybody that has `diffusers >= 0.4.0` installed can use our pipeline magically 🪄 as follows: +Another way to share your community pipeline is to upload the `one_step_unet.py` file directly to your preferred [model repository](https://huggingface.co/docs/hub/models-uploading) on the Hub. Instead of specifying the `one_step_unet.py` file, pass the model repository id to the `custom_pipeline` argument: ```python from diffusers import DiffusionPipeline -pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet") -pipe() +pipeline = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet") ``` -Another way to upload your custom_pipeline, besides sending a PR, is uploading the code that contains it to the Hugging Face Hub, [as exemplified here](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview#loading-custom-pipelines-from-the-hub). +Take a look at the following table to compare the two sharing workflows to help you decide the best option for you: + +| | GitHub community pipeline | HF Hub community pipeline | +|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------| +| usage | same | same | +| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow | +| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility | -**Try it out now - it works!** + -In general, you will want to create much more sophisticated pipelines, so we recommend looking at existing pipelines here: [https://github.com/huggingface/diffusers/tree/main/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community). +💡 You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` because this is automatically detected. -IMPORTANT: -You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` as this will be automatically detected. + ## How do community pipelines work? 
-A community pipeline is a class that has to inherit from ['DiffusionPipeline']: -and that has been added to `examples/community` [files](https://github.com/huggingface/diffusers/tree/main/examples/community). -The community can load the pipeline code via the custom_pipeline argument from DiffusionPipeline. See docs [here](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.custom_pipeline): -This means: -The model weights and configs of the pipeline should be loaded from the `pretrained_model_name_or_path` [argument](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path): -whereas the code that powers the community pipeline is defined in a file added in [`examples/community`](https://github.com/huggingface/diffusers/tree/main/examples/community). +A community pipeline is a class that inherits from [`DiffusionPipeline`] which means: + +- It can be loaded with the [`custom_pipeline`] argument. +- The model weights and scheduler configuration are loaded from [`pretrained_model_name_or_path`]. +- The code that implements a feature in the community pipeline is defined in a `pipeline.py` file. + +Sometimes you can't load all the pipeline components weights from an official repository. In this case, the other components should be passed directly to the pipeline: -Now, it might very well be that only some of your pipeline components weights can be downloaded from an official repo. -The other components should then be passed directly to init as is the case for the ClIP guidance notebook [here](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb#scrollTo=z9Kglma6hjki). +```python +from diffusers import DiffusionPipeline +from transformers import CLIPFeatureExtractor, CLIPModel + +model_id = "CompVis/stable-diffusion-v1-4" +clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" + +feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id) +clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16) + +pipeline = DiffusionPipeline.from_pretrained( + model_id, + custom_pipeline="clip_guided_stable_diffusion", + clip_model=clip_model, + feature_extractor=feature_extractor, + scheduler=scheduler, + torch_dtype=torch.float16, +) +``` -The magic behind all of this is that we load the code directly from GitHub. You can check it out in more detail if you follow the functionality defined here: +The magic behind community pipelines is contained in the following code. It allows the community pipeline to be loaded from GitHub or the Hub, and it'll be available to all 🧨 Diffusers packages. ```python # 2. Load the pipeline class, if using custom module then load it from the hub @@ -164,6 +179,3 @@ else: diffusers_module = importlib.import_module(cls.__module__.split(".")[0]) pipeline_class = getattr(diffusers_module, config_dict["_class_name"]) ``` - -This is why a community pipeline merged to GitHub will be directly available to all `diffusers` packages. 
- diff --git a/docs/source/en/using-diffusers/custom_pipeline_examples.mdx b/docs/source/en/using-diffusers/custom_pipeline_examples.mdx index 2dfa71f0d33c..93ac6d1f782c 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_examples.mdx +++ b/docs/source/en/using-diffusers/custom_pipeline_examples.mdx @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Custom Pipelines +# Community pipelines > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).** diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.mdx b/docs/source/en/using-diffusers/custom_pipeline_overview.mdx index 934e639983d2..3c5df7c0dd6e 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_overview.mdx +++ b/docs/source/en/using-diffusers/custom_pipeline_overview.mdx @@ -10,19 +10,21 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Loading and Adding Custom Pipelines +# Load community pipelines -Diffusers allows you to conveniently load any custom pipeline from the Hugging Face Hub as well as any [official community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community) -via the [`DiffusionPipeline`] class. +Community pipelines are any [`DiffusionPipeline`] class that are different from the original implementation as specified in their paper (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline. -## Loading custom pipelines from the Hub +There are many cool community pipelines like [Speech to Image](https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image) or [Composable Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#composable-stable-diffusion), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community). -Custom pipelines can be easily loaded from any model repository on the Hub that defines a diffusion pipeline in a `pipeline.py` file. -Let's load a dummy pipeline from [hf-internal-testing/diffusers-dummy-pipeline](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline). +To load any community pipeline on the Hub, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [`hf-internal-testing/diffusers-dummy-pipeline`](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32): -All you need to do is pass the custom pipeline repo id with the `custom_pipeline` argument alongside the repo from where you wish to load the pipeline modules. + -```python +🔒 By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically! 
+ + + +```py from diffusers import DiffusionPipeline pipeline = DiffusionPipeline.from_pretrained( @@ -30,25 +32,9 @@ pipeline = DiffusionPipeline.from_pretrained( ) ``` -This will load the custom pipeline as defined in the [model repository](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py). - - - -By loading a custom pipeline from the Hugging Face Hub, you are trusting that the code you are loading -is safe 🔒. Make sure to check out the code online before loading & running it automatically. - - - -## Loading official community pipelines +Loading an official community pipeline is similar, but you can mix loading weights from an official repository id and pass pipeline components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline, and you can pass the CLIP model components directly to it: -Community pipelines are summarized in the [community examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community). - -Similarly, you need to pass both the *repo id* from where you wish to load the weights as well as the `custom_pipeline` argument. Here the `custom_pipeline` argument should consist simply of the filename of the community pipeline excluding the `.py` suffix, *e.g.* `clip_guided_stable_diffusion`. - -Since community pipelines are often more complex, one can mix loading weights from an official *repo id* -and passing pipeline modules directly. - -```python +```py from diffusers import DiffusionPipeline from transformers import CLIPImageProcessor, CLIPModel @@ -65,59 +51,4 @@ pipeline = DiffusionPipeline.from_pretrained( ) ``` -## Adding custom pipelines to the Hub - -To add a custom pipeline to the Hub, all you need to do is to define a pipeline class that inherits -from [`DiffusionPipeline`] in a `pipeline.py` file. -Make sure that the whole pipeline is encapsulated within a single class and that the `pipeline.py` file -has only one such class. - -Let's quickly define an example pipeline. - - -```python -import torch -from diffusers import DiffusionPipeline - - -class MyPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() - - self.register_modules(unet=unet, scheduler=scheduler) - - @torch.no_grad() - def __call__(self, batch_size: int = 1, num_inference_steps: int = 50): - # Sample gaussian noise to begin loop - image = torch.randn( - (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size) - ) - - image = image.to(self.device) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2. predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image, eta).prev_sample - - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).numpy() - - return image -``` - -Now you can upload this short file under the name `pipeline.py` in your preferred [model repository](https://huggingface.co/docs/hub/models-uploading). 
For Stable Diffusion pipelines, you may also [join the community organisation for shared pipelines](https://huggingface.co/organizations/sd-diffusers-pipelines-library/share/BUPyDUuHcciGTOKaExlqtfFcyCZsVFdrjr) to upload yours. -Finally, we can load the custom pipeline by passing the model repository name, *e.g.* `sd-diffusers-pipelines-library/my_custom_pipeline` alongside the model repository from where we want to load the `unet` and `scheduler` components. - -```python -my_pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="patrickvonplaten/my_custom_pipeline" -) -``` +For more information about community pipelines, take a look at the [Community pipelines](custom_pipeline_examples) guide for how to use them and if you're interested in adding a community pipeline check out the [How to contribute a community pipeline](contribute_pipeline) guide! \ No newline at end of file From 5c9dd0af952a92f19a1e672b2a9471ad5674841d Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 14 Apr 2023 12:07:34 +0900 Subject: [PATCH 10/71] Add support for Guess Mode in StableDiffusionControlNetPipeline (#2998) * add guess mode (WIP) * fix uncond/cond order * support guidance_scale=1.0 and batch != 1 * remove magic coeff * add docstring * add integration test * add document to controlnet.mdx * made the comments a bit more explanatory * fix table --- .../pipelines/stable_diffusion/controlnet.mdx | 36 ++++++++++++++++ src/diffusers/models/controlnet.py | 11 ++++- .../pipeline_stable_diffusion_controlnet.py | 42 +++++++++++++++++-- .../test_stable_diffusion_controlnet.py | 32 ++++++++++++++ 4 files changed, 115 insertions(+), 6 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index 5a4cfa41ca43..af859177c002 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -242,6 +242,42 @@ image.save("./multi_controlnet_output.png") +### Guess Mode + +Guess Mode is [a ControlNet feature that was implemented](https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode) after the publication of [the paper](https://arxiv.org/abs/2302.05543). The description states: + +>In this mode, the ControlNet encoder will try best to recognize the content of the input control map, like depth map, edge map, scribbles, etc, even if you remove all prompts. + +#### The core implementation: + +It adjusts the scale of the output residuals from ControlNet by a fixed ratio depending on the block depth. The shallowest DownBlock corresponds to `0.1`. As the blocks get deeper, the scale increases exponentially, and the scale for the output of the MidBlock becomes `1.0`. + +Since the core implementation is just this, **it does not have any impact on prompt conditioning**. While it is common to use it without specifying any prompts, it is also possible to provide prompts if desired. + +#### Usage: + +Just specify `guess_mode=True` in the pipe() function. A `guidance_scale` between 3.0 and 5.0 is [recommended](https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode).
+```py +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel +import torch + +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") +pipe = StableDiffusionControlNetPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", controlnet=controlnet).to( + "cuda" +) +image = pipe("", image=canny_image, guess_mode=True, guidance_scale=3.0).images[0] +image.save("guess_mode_generated.png") +``` + +#### Output image comparison: +Canny Control Example + +|no guess_mode with prompt|guess_mode without prompt| +|---|---| +||| + + + ## Available checkpoints ControlNet requires a *control image* in addition to the text-to-image *prompt*. diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index bb608ad82a7a..4f1ffe604578 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -456,6 +456,7 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, return_dict: bool = True, ) -> Union[ControlNetOutput, Tuple]: # check channel order @@ -556,8 +557,14 @@ def forward( mid_block_res_sample = self.controlnet_mid_block(sample) # 6. scaling - down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] - mid_block_res_sample *= conditioning_scale + if guess_mode: + scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0 + scales *= conditioning_scale + down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)] + mid_block_res_sample *= scales[-1] # last one + else: + down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] + mid_block_res_sample *= conditioning_scale if not return_dict: return (down_block_res_samples, mid_block_res_sample) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 12d21afbfeda..1ebd469f76b3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -118,6 +118,7 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, return_dict: bool = True, ) -> Union[ControlNetOutput, Tuple]: for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): @@ -131,6 +132,7 @@ def forward( timestep_cond, attention_mask, cross_attention_kwargs, + guess_mode, return_dict, ) @@ -627,7 +629,16 @@ def check_image(self, image, prompt, prompt_embeds): ) def prepare_image( - self, image, width, height, batch_size, num_images_per_prompt, device, dtype, do_classifier_free_guidance + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance, + guess_mode, ): if not isinstance(image, torch.Tensor): if isinstance(image, PIL.Image.Image): @@ -664,7 +675,7 @@ def prepare_image( image = image.to(device=device, dtype=dtype) - if do_classifier_free_guidance: + if do_classifier_free_guidance and not guess_mode: image = torch.cat([image] * 2) return image @@ -747,6 +758,7 @@ def __call__( callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = 
None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, ): r""" Function invoked when calling the pipeline for generation. @@ -819,6 +831,10 @@ def __call__( The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added to the residual in the original unet. If multiple ControlNets are specified in init, you can set the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + In this mode, the ControlNet encoder will try best to recognize the content of the input image even if + you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. + Examples: Returns: @@ -883,6 +899,7 @@ def __call__( device=device, dtype=self.controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, ) elif isinstance(self.controlnet, MultiControlNetModel): images = [] @@ -897,6 +914,7 @@ def __call__( device=device, dtype=self.controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, ) images.append(image_) @@ -934,15 +952,31 @@ def __call__( latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # controlnet(s) inference + if guess_mode and do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + controlnet_latent_model_input = latents + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + controlnet_latent_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + down_block_res_samples, mid_block_res_sample = self.controlnet( - latent_model_input, + controlnet_latent_model_input, t, - encoder_hidden_states=prompt_embeds, + encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=image, conditioning_scale=controlnet_conditioning_scale, + guess_mode=guess_mode, return_dict=False, ) + if guess_mode and do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. 
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + # predict the noise residual noise_pred = self.unet( latent_model_input, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index d7c5e2b0323a..70b3652fce77 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -553,6 +553,38 @@ def test_sequential_cpu_offloading(self): # make sure that less than 7 GB is allocated assert mem_bytes < 4 * 10**9 + def test_canny_guess_mode(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ) + + output = pipe( + prompt, + image, + generator=generator, + output_type="np", + num_inference_steps=3, + guidance_scale=3.0, + guess_mode=True, + ) + + image = output.images[0] + assert image.shape == (768, 512, 3) + + image_slice = image[-3:, -3:, -1] + expected_slice = np.array([0.2724, 0.2846, 0.2724, 0.3843, 0.3682, 0.2736, 0.4675, 0.3862, 0.2887]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + @slow @require_torch_gpu From eb2ef316068620ab2f44b6a7d6b13a0cb146088e Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 13 Apr 2023 17:54:54 -1000 Subject: [PATCH 11/71] fix default value for attend-and-excite (#3099) * fix default --- .../pipeline_stable_diffusion_attend_and_excite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index c81ed5b54f94..fba2a4e32f88 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -724,7 +724,7 @@ def __call__( max_iter_to_alter: int = 25, thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, scale_factor: int = 20, - attn_res: Optional[Tuple[int]] = None, + attn_res: Optional[Tuple[int]] = (16, 16), ): r""" Function invoked when calling the pipeline for generation. 
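As a side note on the Guess Mode change above: the residual scaling in `ControlNetModel.forward` comes from `torch.logspace(-1, 0, len(down_block_res_samples) + 1)`, so the scales grow exponentially from `0.1` at the shallowest DownBlock to `1.0` at the MidBlock output. The short sketch below only illustrates that scaling in isolation; it is not part of any patch, and the residual count of 12 is an assumed value for a standard Stable Diffusion UNet.

```py
import torch

# Assumed residual count: 12 down-block residuals plus one mid-block output,
# roughly matching a standard Stable Diffusion UNet (illustrative, not taken from the patch).
down_block_res_samples = [torch.zeros(1) for _ in range(12)]

scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1)
# 13 scales spaced exponentially from 0.1 (shallowest DownBlock) to 1.0 (MidBlock output)
print(scales)
```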
From 1bd4c9e93dcbb31135aa8594aaf28f7b6efd39ab Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 14 Apr 2023 06:39:25 -1000 Subject: [PATCH 12/71] remove one line as requested by gc team (#3077) remove one line --- examples/text_to_image/train_text_to_image_flax.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index 41a02d68f2b1..d44731896c1d 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -340,11 +340,10 @@ def preprocess_train(examples): return examples - if jax.process_index() == 0: - if args.max_train_samples is not None: - dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + if args.max_train_samples is not None: + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) # Set the training transforms - train_dataset = dataset["train"].with_transform(preprocess_train) + train_dataset = dataset["train"].with_transform(preprocess_train) def collate_fn(examples): pixel_values = torch.stack([example["pixel_values"] for example in examples]) From b811964a7b7f3c4cd50dc25a58789a0fed351e09 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 14 Apr 2023 12:39:38 -0700 Subject: [PATCH 13/71] ddpm custom timesteps (#3007) add custom timesteps test add custom timesteps descending order check docs timesteps -> custom_timesteps can only pass one of num_inference_steps and timesteps --- src/diffusers/schedulers/scheduling_ddpm.py | 79 ++++++++++++++++----- tests/schedulers/test_scheduler_ddpm.py | 56 +++++++++++++++ 2 files changed, 119 insertions(+), 16 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index eaaf497f9c1d..2bc34bb8b444 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -162,6 +162,7 @@ def __init__( self.init_noise_sigma = 1.0 # setable values + self.custom_timesteps = False self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) @@ -191,31 +192,62 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = """ return sample - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): """ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. + num_inference_steps (`Optional[int]`): + the number of diffusion steps used when generating samples with a pre-trained model. If passed, then + `timesteps` must be `None`. + device (`str` or `torch.device`, optional): + the device to which the timesteps are moved. + custom_timesteps (`List[int]`, optional): + custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps is used. If passed, `num_inference_steps` + must be `None`.
+ """ + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + + if timesteps is not None: + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`custom_timesteps` must be in descending order.") + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.custom_timesteps = True + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) - if num_inference_steps > self.config.num_train_timesteps: - raise ValueError( - f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps." - ) + self.num_inference_steps = num_inference_steps - self.num_inference_steps = num_inference_steps + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + self.custom_timesteps = False - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = torch.from_numpy(timesteps).to(device) def _get_variance(self, t, predicted_variance=None, variance_type=None): - num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - prev_t = t - self.config.num_train_timesteps // num_inference_steps + prev_t = self.previous_timestep(t) + alpha_prod_t = self.alphas_cumprod[t] alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev @@ -314,8 +346,8 @@ def step( """ t = timestep - num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + prev_t = self.previous_timestep(t) if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) @@ -428,3 +460,18 @@ def get_velocity( def __len__(self): return self.config.num_train_timesteps + + def previous_timestep(self, timestep): + if self.custom_timesteps: + index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] + if index == self.timesteps.shape[0] - 1: + prev_t = torch.tensor(-1) + else: + prev_t = self.timesteps[index + 1] + else: + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) + prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + return prev_t diff --git a/tests/schedulers/test_scheduler_ddpm.py b/tests/schedulers/test_scheduler_ddpm.py index b55a39ee2e79..c44ded43e67e 100644 --- a/tests/schedulers/test_scheduler_ddpm.py +++ 
b/tests/schedulers/test_scheduler_ddpm.py @@ -129,3 +129,59 @@ def test_full_loop_with_v_prediction(self): assert abs(result_sum.item() - 202.0296) < 1e-2 assert abs(result_mean.item() - 0.2631) < 1e-3 + + def test_custom_timesteps(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 1, 0] + + scheduler.set_timesteps(timesteps=timesteps) + + scheduler_timesteps = scheduler.timesteps + + for i, timestep in enumerate(scheduler_timesteps): + if i == len(timesteps) - 1: + expected_prev_t = -1 + else: + expected_prev_t = timesteps[i + 1] + + prev_t = scheduler.previous_timestep(timestep) + prev_t = prev_t.item() + + self.assertEqual(prev_t, expected_prev_t) + + def test_custom_timesteps_increasing_order(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 51, 0] + + with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."): + scheduler.set_timesteps(timesteps=timesteps) + + def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 1, 0] + num_inference_steps = len(timesteps) + + with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `custom_timesteps`."): + scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps) + + def test_custom_timesteps_too_large(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [scheduler.config.num_train_timesteps] + + with self.assertRaises( + ValueError, + msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", + ): + scheduler.set_timesteps(timesteps=timesteps) From 807f69b32879a0ea74aa4e58ee007988507d6df8 Mon Sep 17 00:00:00 2001 From: Tommaso De Rossi Date: Sun, 16 Apr 2023 19:04:11 +0200 Subject: [PATCH 14/71] Fix breaking change in `pipeline_stable_diffusion_controlnet.py` (#3118) fix breaking change --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 1ebd469f76b3..3b8889d92b55 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -637,8 +637,8 @@ def prepare_image( num_images_per_prompt, device, dtype, - do_classifier_free_guidance, - guess_mode, + do_classifier_free_guidance=False, + guess_mode=False, ): if not isinstance(image, torch.Tensor): if isinstance(image, PIL.Image.Image): From cfc99adf0f2e45afbddc117671e4faa59ca83ae2 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sun, 16 Apr 2023 19:07:23 +0200 Subject: [PATCH 15/71] Add global pooling to controlnet (#3121) --- src/diffusers/models/controlnet.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 4f1ffe604578..3ffbb04eb222 100644 
--- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -119,6 +119,7 @@ def __init__( projection_class_embeddings_input_dim: Optional[int] = None, controlnet_conditioning_channel_order: str = "rgb", conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), + global_pool_conditions: bool = False, ): super().__init__() @@ -566,6 +567,12 @@ def forward( down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] mid_block_res_sample *= conditioning_scale + if self.config.global_pool_conditions: + down_block_res_samples = [ + torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples + ] + mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True) + if not return_dict: return (down_block_res_samples, mid_block_res_sample) From beb848e2b6cc888bd5039e6f6cac7c932c6c3225 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 17 Apr 2023 11:53:04 +0200 Subject: [PATCH 16/71] [Bug fix] Fix img2img processor with safety checker (#3127) Fix img2img processor with safety checker --- .../pipelines/stable_diffusion/safety_checker.py | 5 ++++- .../test_stable_diffusion_img2img.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 84b8aeb7bcde..38c7b22d08d4 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -85,7 +85,10 @@ def forward(self, clip_input, images): for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): if has_nsfw_concept: - images[idx] = np.zeros(images[idx].shape) # black image + if torch.is_tensor(images) or torch.is_tensor(images[0]): + images[idx] = torch.zeros_like(images[idx]) # black image + else: + images[idx] = np.zeros(images[idx].shape) # black image if any(has_nsfw_concepts): logger.warning( diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 127b1c216549..0e2c4acb5484 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -453,6 +453,20 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 + def test_img2img_safety_checker_works(self): + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 20 + # make sure the safety checker is activated + inputs["prompt"] = "naked, sex, porn" + out = sd_pipe(**inputs) + + assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}" + assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros + @nightly @require_torch_gpu From ca783a0f1f4ce8b0a16e6b96a8890edc47489e3a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 17 Apr 2023 12:52:40 +0200 Subject: [PATCH 17/71] [Bug fix] Make sure correct timesteps are chosen for img2img (#3128) Make sure correct timesteps are chosen for img2img --- .../pipeline_alt_diffusion_img2img.py | 2 +- .../pipeline_cycle_diffusion.py | 2 +- .../pipeline_stable_diffusion_depth2img.py | 2 +- 
.../pipeline_stable_diffusion_img2img.py | 2 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 2 +- .../test_stable_diffusion_img2img.py | 28 +++++++++++++++++++ 6 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index bb8116f2f5d5..86fc47f424e9 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -503,7 +503,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index dd8e4f16dfc0..e2accb6d2d2a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -528,7 +528,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 54f00ebc23f2..4fe117ba120b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -390,7 +390,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index a0befdae73c4..5860a53ad528 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -511,7 +511,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index b7a0c942bbe2..6d9cbaf67a07 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ 
b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -507,7 +507,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 0e2c4acb5484..4262114c78eb 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -25,6 +25,7 @@ AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, + HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionImg2ImgPipeline, @@ -416,6 +417,33 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): for module in pipe.text_encoder, pipe.unet, pipe.vae: assert module.device == torch.device("cpu") + def test_img2img_2nd_order(self): + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = HeunDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 10 + inputs["strength"] = 0.75 + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/img2img_heun.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 5e-2 + + inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 11 + inputs["strength"] = 0.75 + image_other = sd_pipe(**inputs).images[0] + + mean_diff = np.abs(image - image_other).mean() + + # images should be very similar + assert mean_diff < 5e-2 + def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" From ed8fd38337c0f75259cae86c4013c6125fe96a61 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 17 Apr 2023 17:19:11 +0200 Subject: [PATCH 18/71] Improve deprecation warnings (#3131) --- src/diffusers/pipelines/pipeline_utils.py | 10 +++++----- tests/models/test_lora_layers.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index c095da1665de..d531d967c3d1 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -201,24 +201,24 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi # .bin, .safetensors, ... 
weight_suffixs = [w.split(".")[-1] for w in weight_names] # -00001-of-00002 - transformers_index_format = "\d{5}-of-\d{5}" + transformers_index_format = r"\d{5}-of-\d{5}" if variant is not None: # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetenstors` variant_file_re = re.compile( - f"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$" + rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$" ) # `text_encoder/pytorch_model.bin.index.fp16.json` variant_index_re = re.compile( - f"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$" + rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$" ) # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetenstors` non_variant_file_re = re.compile( - f"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" + rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" ) # `text_encoder/pytorch_model.bin.index.json` - non_variant_index_re = re.compile(f"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json") + non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json") if variant is not None: variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None} diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 9bcdc5d93301..6f75902d388f 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -71,6 +71,7 @@ def get_dummy_components(self): beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, + steps_offset=1, ) torch.manual_seed(0) vae = AutoencoderKL( From 703307efcc49fbc3f1362344dc5d577e4c4595c8 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 17 Apr 2023 18:16:28 +0200 Subject: [PATCH 19/71] Fix config deprecation (#3129) * Better deprecation message * Better deprecation message * Better doc string * Fixes * fix more * fix more * Improve __getattr__ * correct more * fix more * fix * Improve more * more improvements * fix more * Apply suggestions from code review Co-authored-by: Pedro Cuenca * make style * Fix all rest & add tests & remove old deprecation fns --------- Co-authored-by: Pedro Cuenca --- .../community/unclip_image_interpolation.py | 12 ++-- .../community/unclip_text_interpolation.py | 12 ++-- src/diffusers/configuration_utils.py | 18 +++++ src/diffusers/models/autoencoder_kl.py | 12 +--- src/diffusers/models/modeling_utils.py | 21 +++++- src/diffusers/models/unet_1d.py | 12 +--- src/diffusers/models/unet_2d.py | 12 +--- src/diffusers/models/unet_2d_condition.py | 12 +--- src/diffusers/pipelines/pipeline_utils.py | 69 +++++++++---------- .../pipeline_text_to_video_zero.py | 2 +- .../pipelines/unclip/pipeline_unclip.py | 12 ++-- .../unclip/pipeline_unclip_image_variation.py | 12 ++-- .../versatile_diffusion/modeling_text_unet.py | 15 +--- ...ipeline_versatile_diffusion_dual_guided.py | 2 +- ...ine_versatile_diffusion_image_variation.py | 2 +- ...eline_versatile_diffusion_text_to_image.py | 2 +- src/diffusers/schedulers/scheduling_ddpm.py | 12 +--- src/diffusers/utils/deprecation_utils.py | 4 +- tests/models/test_modeling_common.py | 47 ++++++++++++- tests/pipelines/unclip/test_unclip.py | 8 +-- .../unclip/test_unclip_image_variation.py | 13 ++-- 
tests/schedulers/test_schedulers.py | 44 ++++++++++++ 22 files changed, 209 insertions(+), 146 deletions(-) diff --git a/examples/community/unclip_image_interpolation.py b/examples/community/unclip_image_interpolation.py index d0b54125b688..453ac07af7c6 100644 --- a/examples/community/unclip_image_interpolation.py +++ b/examples/community/unclip_image_interpolation.py @@ -372,9 +372,9 @@ def __call__( self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) decoder_timesteps_tensor = self.decoder_scheduler.timesteps - num_channels_latents = self.decoder.in_channels - height = self.decoder.sample_size - width = self.decoder.sample_size + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + width = self.decoder.config.sample_size decoder_latents = self.prepare_latents( (batch_size, num_channels_latents, height, width), @@ -425,9 +425,9 @@ def __call__( self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) super_res_timesteps_tensor = self.super_res_scheduler.timesteps - channels = self.super_res_first.in_channels // 2 - height = self.super_res_first.sample_size - width = self.super_res_first.sample_size + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size super_res_latents = self.prepare_latents( (batch_size, channels, height, width), diff --git a/examples/community/unclip_text_interpolation.py b/examples/community/unclip_text_interpolation.py index ac6b73d974b6..290f45317004 100644 --- a/examples/community/unclip_text_interpolation.py +++ b/examples/community/unclip_text_interpolation.py @@ -452,9 +452,9 @@ def __call__( self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) decoder_timesteps_tensor = self.decoder_scheduler.timesteps - num_channels_latents = self.decoder.in_channels - height = self.decoder.sample_size - width = self.decoder.sample_size + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + width = self.decoder.config.sample_size decoder_latents = self.prepare_latents( (batch_size, num_channels_latents, height, width), @@ -505,9 +505,9 @@ def __call__( self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) super_res_timesteps_tensor = self.super_res_scheduler.timesteps - channels = self.super_res_first.in_channels // 2 - height = self.super_res_first.sample_size - width = self.super_res_first.sample_size + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size super_res_latents = self.prepare_latents( (batch_size, channels, height, width), diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 45930431351a..772e119fbe97 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -118,6 +118,24 @@ def register_to_config(self, **kwargs): self._internal_dict = FrozenDict(internal_dict) + def __getattr__(self, name: str) -> Any: + """The only reason we overwrite `getattr` here is to gracefully deprecate accessing + config attributes directly. 
See https://github.com/huggingface/diffusers/pull/3129 + + Tihs funtion is mostly copied from PyTorch's __getattr__ overwrite: + https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + """ + + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) + is_attribute = name in self.__dict__ + + if is_in_config and not is_attribute: + deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'scheduler.config.{name}'." + deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) + return self._internal_dict[name] + + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index 5d1c54a9af25..1a8a204d80ce 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -18,7 +18,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, apply_forward_hook, deprecate +from ..utils import BaseOutput, apply_forward_hook from .modeling_utils import ModelMixin from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder @@ -123,16 +123,6 @@ def __init__( self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) self.tile_overlap_factor = 0.25 - @property - def block_out_channels(self): - deprecate( - "block_out_channels", - "1.0.0", - "Accessing `block_out_channels` directly via vae.block_out_channels is deprecated. Please use `vae.config.block_out_channels instead`", - standard_warn=False, - ) - return self.config.block_out_channels - def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (Encoder, Decoder)): module.gradient_checkpointing = value diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 6a849f6f0e45..5363e6330623 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -17,7 +17,7 @@ import inspect import os from functools import partial -from typing import Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union import torch from torch import Tensor, device @@ -32,6 +32,7 @@ WEIGHTS_NAME, _add_variant, _get_model_file, + deprecate, is_accelerate_available, is_safetensors_available, is_torch_version, @@ -156,6 +157,24 @@ class ModelMixin(torch.nn.Module): def __init__(self): super().__init__() + def __getattr__(self, name: str) -> Any: + """The only reason we overwrite `getattr` here is to gracefully deprecate accessing + config attributes directly. 
See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite + __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__': + https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + """ + + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) + is_attribute = name in self.__dict__ + + if is_in_config and not is_attribute: + deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'." + deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3) + return self._internal_dict[name] + + # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + return super().__getattr__(name) + @property def is_gradient_checkpointing(self) -> bool: """ diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py index c7755bb3ed45..34a1d2b5160e 100644 --- a/src/diffusers/models/unet_1d.py +++ b/src/diffusers/models/unet_1d.py @@ -19,7 +19,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, deprecate +from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block @@ -190,16 +190,6 @@ def __init__( fc_dim=block_out_channels[-1] // 4, ) - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - "Accessing `in_channels` directly via unet.in_channels is deprecated. Please use `unet.config.in_channels` instead", - standard_warn=False, - ) - return self.config.in_channels - def forward( self, sample: torch.FloatTensor, diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index a83e4917c143..2a6a1b9de5f2 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -18,7 +18,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, deprecate +from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block @@ -216,16 +216,6 @@ def __init__( self.conv_act = nn.SiLU() self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1) - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - "Accessing `in_channels` directly via unet.in_channels is deprecated. 
Please use `unet.config.in_channels` instead", - standard_warn=False, - ) - return self.config.in_channels - def forward( self, sample: torch.FloatTensor, diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 1b982aedc5de..b2814356939b 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -21,7 +21,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin -from ..utils import BaseOutput, deprecate, logging +from ..utils import BaseOutput, logging from .attention_processor import AttentionProcessor, AttnProcessor from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin @@ -447,16 +447,6 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding ) - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - "Accessing `in_channels` directly via unet.in_channels is deprecated. Please use `unet.config.in_channels` instead", - standard_warn=False, - ) - return self.config.in_channels - @property def attn_processors(self) -> Dict[str, AttentionProcessor]: r""" diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index d531d967c3d1..2d61f1a3700f 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -508,7 +508,7 @@ def register_modules(self, **kwargs): setattr(self, name, module) def __setattr__(self, name: str, value: Any): - if hasattr(self, name) and hasattr(self.config, name): + if name in self.__dict__ and hasattr(self.config, name): # We need to overwrite the config if name exists in config if isinstance(getattr(self.config, name), (tuple, list)): if value is not None and self.config[name][0] is not None: @@ -648,26 +648,25 @@ def module_is_offloaded(module): ) module_names, _ = self._get_signature_keys(self) - module_names = [m for m in module_names if hasattr(self, m)] + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] is_offloaded = pipeline_is_offloaded or pipeline_is_sequentially_offloaded - for name in module_names: - module = getattr(self, name) - if isinstance(module, torch.nn.Module): - module.to(torch_device, torch_dtype) - if ( - module.dtype == torch.float16 - and str(torch_device) in ["cpu"] - and not silence_dtype_warnings - and not is_offloaded - ): - logger.warning( - "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It" - " is not recommended to move them to `cpu` as running them will fail. Please make" - " sure to use an accelerator to run the pipeline in inference, due to the lack of" - " support for`float16` operations on this device in PyTorch. Please, remove the" - " `torch_dtype=torch.float16` argument, or use another device for inference." - ) + for module in modules: + module.to(torch_device, torch_dtype) + if ( + module.dtype == torch.float16 + and str(torch_device) in ["cpu"] + and not silence_dtype_warnings + and not is_offloaded + ): + logger.warning( + "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It" + " is not recommended to move them to `cpu` as running them will fail. Please make" + " sure to use an accelerator to run the pipeline in inference, due to the lack of" + " support for`float16` operations on this device in PyTorch. 
Please, remove the" + " `torch_dtype=torch.float16` argument, or use another device for inference." + ) return self @property @@ -677,12 +676,12 @@ def device(self) -> torch.device: `torch.device`: The torch device on which the pipeline is located. """ module_names, _ = self._get_signature_keys(self) - module_names = [m for m in module_names if hasattr(self, m)] + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] + + for module in modules: + return module.device - for name in module_names: - module = getattr(self, name) - if isinstance(module, torch.nn.Module): - return module.device return torch.device("cpu") @classmethod @@ -1451,13 +1450,12 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): for child in module.children(): fn_recursive_set_mem_eff(child) - module_names, _, _ = self.extract_init_dict(dict(self.config)) - module_names = [m for m in module_names if hasattr(self, m)] + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] - for module_name in module_names: - module = getattr(self, module_name) - if isinstance(module, torch.nn.Module): - fn_recursive_set_mem_eff(module) + for module in modules: + fn_recursive_set_mem_eff(module) def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" @@ -1484,10 +1482,9 @@ def disable_attention_slicing(self): self.enable_attention_slicing(None) def set_attention_slice(self, slice_size: Optional[int]): - module_names, _, _ = self.extract_init_dict(dict(self.config)) - module_names = [m for m in module_names if hasattr(self, m)] + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module) and hasattr(m, "set_attention_slice")] - for module_name in module_names: - module = getattr(self, module_name) - if isinstance(module, torch.nn.Module) and hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size) + for module in modules: + module.set_attention_slice(slice_size) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index cf5e6e399a77..5b163bbbc8f5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -441,7 +441,7 @@ def __call__( timesteps = self.scheduler.timesteps # Prepare latent variables - num_channels_latents = self.unet.in_channels + num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( batch_size * num_videos_per_prompt, num_channels_latents, diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py index 3aac39b3a3b0..abbb48ce8f46 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -413,9 +413,9 @@ def __call__( self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) decoder_timesteps_tensor = self.decoder_scheduler.timesteps - num_channels_latents = self.decoder.in_channels - height = self.decoder.sample_size - width = self.decoder.sample_size + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + 
width = self.decoder.config.sample_size decoder_latents = self.prepare_latents( (batch_size, num_channels_latents, height, width), @@ -466,9 +466,9 @@ def __call__( self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) super_res_timesteps_tensor = self.super_res_scheduler.timesteps - channels = self.super_res_first.in_channels // 2 - height = self.super_res_first.sample_size - width = self.super_res_first.sample_size + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size super_res_latents = self.prepare_latents( (batch_size, channels, height, width), diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py index 56d522354d9a..30d74cd36bb0 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -339,9 +339,9 @@ def __call__( self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) decoder_timesteps_tensor = self.decoder_scheduler.timesteps - num_channels_latents = self.decoder.in_channels - height = self.decoder.sample_size - width = self.decoder.sample_size + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + width = self.decoder.config.sample_size if decoder_latents is None: decoder_latents = self.prepare_latents( @@ -393,9 +393,9 @@ def __call__( self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) super_res_timesteps_tensor = self.super_res_scheduler.timesteps - channels = self.super_res_first.in_channels // 2 - height = self.super_res_first.sample_size - width = self.super_res_first.sample_size + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size if super_res_latents is None: super_res_latents = self.prepare_latents( diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 35ddfcadc3cb..4377be1181a8 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -18,7 +18,7 @@ from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from ...models.transformer_2d import Transformer2DModel from ...models.unet_2d_condition import UNet2DConditionOutput -from ...utils import deprecate, logging +from ...utils import logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -544,19 +544,6 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding ) - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - ( - "Accessing `in_channels` directly via unet.in_channels is deprecated. 
Please use" - " `unet.config.in_channels` instead" - ), - standard_warn=False, - ) - return self.config.in_channels - @property def attn_processors(self) -> Dict[str, AttentionProcessor]: r""" diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index 0f385ed6612c..661a1bd3cf73 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -533,7 +533,7 @@ def __call__( timesteps = self.scheduler.timesteps # 5. Prepare latent variables - num_channels_latents = self.image_unet.in_channels + num_channels_latents = self.image_unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 2b47184d7773..e3a2ee370362 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -378,7 +378,7 @@ def __call__( timesteps = self.scheduler.timesteps # 5. Prepare latent variables - num_channels_latents = self.image_unet.in_channels + num_channels_latents = self.image_unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index fdca625fd99d..26b9be2bfa76 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -452,7 +452,7 @@ def __call__( timesteps = self.scheduler.timesteps # 5. Prepare latent variables - num_channels_latents = self.image_unet.in_channels + num_channels_latents = self.image_unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 2bc34bb8b444..a8a71fe420aa 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -22,7 +22,7 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, deprecate, randn_tensor +from ..utils import BaseOutput, randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin @@ -168,16 +168,6 @@ def __init__( self.variance_type = variance_type - @property - def num_train_timesteps(self): - deprecate( - "num_train_timesteps", - "1.0.0", - "Accessing `num_train_timesteps` directly via scheduler.num_train_timesteps is deprecated. 
Please use `scheduler.config.num_train_timesteps instead`", - standard_warn=False, - ) - return self.config.num_train_timesteps - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the diff --git a/src/diffusers/utils/deprecation_utils.py b/src/diffusers/utils/deprecation_utils.py index 6bdda664e102..f482deddd2f4 100644 --- a/src/diffusers/utils/deprecation_utils.py +++ b/src/diffusers/utils/deprecation_utils.py @@ -5,7 +5,7 @@ from packaging import version -def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn=True): +def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn=True, stacklevel=2): from .. import __version__ deprecated_kwargs = take_from @@ -32,7 +32,7 @@ def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn if warning is not None: warning = warning + " " if standard_warn else "" - warnings.warn(warning + message, FutureWarning, stacklevel=2) + warnings.warn(warning + message, FutureWarning, stacklevel=stacklevel) if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0: call_frame = inspect.getouterframes(inspect.currentframe())[1] diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 40aba3b24967..4a94a77fcabb 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -26,8 +26,8 @@ from diffusers.models import UNet2DConditionModel from diffusers.training_utils import EMAModel -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils import logging, torch_device +from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu class ModelUtilsTest(unittest.TestCase): @@ -155,6 +155,49 @@ def test_from_save_pretrained(self): max_diff = (image - new_image).abs().sum().item() self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") + def test_getattr_is_correct(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + + # save some things to test + model.dummy_attribute = 5 + model.register_to_config(test_attribute=5) + + logger = logging.get_logger("diffusers.models.modeling_utils") + # 30 for warning + logger.setLevel(30) + with CaptureLogger(logger) as cap_logger: + assert hasattr(model, "dummy_attribute") + assert getattr(model, "dummy_attribute") == 5 + assert model.dummy_attribute == 5 + + # no warning should be thrown + assert cap_logger.out == "" + + logger = logging.get_logger("diffusers.models.modeling_utils") + # 30 for warning + logger.setLevel(30) + with CaptureLogger(logger) as cap_logger: + assert hasattr(model, "save_pretrained") + fn = model.save_pretrained + fn_1 = getattr(model, "save_pretrained") + + assert fn == fn_1 + # no warning should be thrown + assert cap_logger.out == "" + + # warning should be thrown + with self.assertWarns(FutureWarning): + assert model.test_attribute == 5 + + with self.assertWarns(FutureWarning): + assert getattr(model, "test_attribute") == 5 + + with self.assertRaises(AttributeError) as error: + model.does_not_exist + + assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'" + def test_from_save_pretrained_variant(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() diff 
--git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index 4df3e4d3828b..d2c699ea501d 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -293,16 +293,16 @@ class DummyScheduler: prior_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() ) - shape = (batch_size, decoder.in_channels, decoder.sample_size, decoder.sample_size) + shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size) decoder_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() ) shape = ( batch_size, - super_res_first.in_channels // 2, - super_res_first.sample_size, - super_res_first.sample_size, + super_res_first.config.in_channels // 2, + super_res_first.config.sample_size, + super_res_first.config.sample_size, ) super_res_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py index 57d15559cc75..c1b8be9cd49e 100644 --- a/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/tests/pipelines/unclip/test_unclip_image_variation.py @@ -379,16 +379,21 @@ class DummyScheduler: dtype = pipe.decoder.dtype batch_size = 1 - shape = (batch_size, pipe.decoder.in_channels, pipe.decoder.sample_size, pipe.decoder.sample_size) + shape = ( + batch_size, + pipe.decoder.config.in_channels, + pipe.decoder.config.sample_size, + pipe.decoder.config.sample_size, + ) decoder_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() ) shape = ( batch_size, - pipe.super_res_first.in_channels // 2, - pipe.super_res_first.sample_size, - pipe.super_res_first.sample_size, + pipe.super_res_first.config.in_channels // 2, + pipe.super_res_first.config.sample_size, + pipe.super_res_first.config.sample_size, ) super_res_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() diff --git a/tests/schedulers/test_schedulers.py b/tests/schedulers/test_schedulers.py index bfbf5cbc798f..69cddb36dde2 100755 --- a/tests/schedulers/test_schedulers.py +++ b/tests/schedulers/test_schedulers.py @@ -596,3 +596,47 @@ def test_trained_betas(self): new_scheduler = scheduler_class.from_pretrained(tmpdirname) assert scheduler.betas.tolist() == new_scheduler.betas.tolist() + + def test_getattr_is_correct(self): + for scheduler_class in self.scheduler_classes: + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + # save some things to test + scheduler.dummy_attribute = 5 + scheduler.register_to_config(test_attribute=5) + + logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) + with CaptureLogger(logger) as cap_logger: + assert hasattr(scheduler, "dummy_attribute") + assert getattr(scheduler, "dummy_attribute") == 5 + assert scheduler.dummy_attribute == 5 + + # no warning should be thrown + assert cap_logger.out == "" + + logger = logging.get_logger("diffusers.schedulers.schedulering_utils") + # 30 for warning + logger.setLevel(30) + with CaptureLogger(logger) as cap_logger: + assert hasattr(scheduler, "save_pretrained") + fn = scheduler.save_pretrained + fn_1 = 
getattr(scheduler, "save_pretrained") + + assert fn == fn_1 + # no warning should be thrown + assert cap_logger.out == "" + + # warning should be thrown + with self.assertWarns(FutureWarning): + assert scheduler.test_attribute == 5 + + with self.assertWarns(FutureWarning): + assert getattr(scheduler, "test_attribute") == 5 + + with self.assertRaises(AttributeError) as error: + scheduler.does_not_exist + + assert str(error.exception) == f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'" From 3b641eabe9876e7c48977b35331fda54ce972b4a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 18 Apr 2023 08:36:13 +0530 Subject: [PATCH 20/71] feat: verfication of multi-gpu support for select examples. (#3126) * feat: verfication of multi-gpu support for select examples. * add: multi-gpu training sections to the relvant doc pages. --- docs/source/en/training/controlnet.mdx | 23 +++++++++++++++++ docs/source/en/training/instructpix2pix.mdx | 21 ++++++++++++++++ docs/source/en/training/text2image.mdx | 25 +++++++++++++++++++ .../en/training/unconditional_training.mdx | 20 +++++++++++++++ examples/controlnet/README.md | 23 +++++++++++++++++ examples/instruct_pix2pix/README.md | 21 ++++++++++++++++ examples/text_to_image/README.md | 25 +++++++++++++++++++ examples/text_to_image/train_text_to_image.py | 4 +-- .../unconditional_image_generation/README.md | 23 ++++++++++++++++- 9 files changed, 182 insertions(+), 3 deletions(-) diff --git a/docs/source/en/training/controlnet.mdx b/docs/source/en/training/controlnet.mdx index 6b7539b89b07..7a5454107b83 100644 --- a/docs/source/en/training/controlnet.mdx +++ b/docs/source/en/training/controlnet.mdx @@ -113,6 +113,29 @@ accelerate launch train_controlnet.py \ --gradient_accumulation_steps=4 ``` +## Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +export MODEL_DIR="runwayml/stable-diffusion-v1-5" +export OUTPUT_DIR="path to save model" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_controlnet.py \ + --pretrained_model_name_or_path=$MODEL_DIR \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=fusing/fill50k \ + --resolution=512 \ + --learning_rate=1e-5 \ + --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --train_batch_size=4 \ + --mixed_precision="fp16" \ + --tracker_project_name="controlnet-demo" \ + --report_to=wandb +``` + ## Example results #### After 300 steps with batch size 8 diff --git a/docs/source/en/training/instructpix2pix.mdx b/docs/source/en/training/instructpix2pix.mdx index e6f050b34acf..c485db6d6b20 100644 --- a/docs/source/en/training/instructpix2pix.mdx +++ b/docs/source/en/training/instructpix2pix.mdx @@ -126,6 +126,27 @@ accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \ ***Note: In the original paper, the authors observed that even when the model is trained with an image resolution of 256x256, it generalizes well to bigger resolutions such as 512x512. This is likely because of the larger dataset they used during training.*** + ## Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. 
Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +accelerate launch --mixed_precision="fp16" --multi_gpu train_instruct_pix2pix.py \ + --pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5 \ + --dataset_name=sayakpaul/instructpix2pix-1000-samples \ + --use_ema \ + --enable_xformers_memory_efficient_attention \ + --resolution=512 --random_flip \ + --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \ + --max_train_steps=15000 \ + --checkpointing_steps=5000 --checkpoints_total_limit=1 \ + --learning_rate=5e-05 --lr_warmup_steps=0 \ + --conditioning_dropout_prob=0.05 \ + --mixed_precision=fp16 \ + --seed=42 +``` + ## Inference Once training is complete, we can perform inference: diff --git a/docs/source/en/training/text2image.mdx b/docs/source/en/training/text2image.mdx index 4f57ccf94de0..70f8c003a787 100644 --- a/docs/source/en/training/text2image.mdx +++ b/docs/source/en/training/text2image.mdx @@ -106,6 +106,31 @@ accelerate launch train_text_to_image.py \ --lr_scheduler="constant" --lr_warmup_steps=0 \ --output_dir=${OUTPUT_DIR} ``` + +#### Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export dataset_name="lambdalabs/pokemon-blip-captions" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$dataset_name \ + --use_ema \ + --resolution=512 --center_crop --random_flip \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --max_train_steps=15000 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model" +``` + With Flax, it's possible to train a Stable Diffusion model faster on TPUs and GPUs thanks to [@duongna211](https://github.com/duongna21). This is very efficient on TPU hardware but works great on GPUs too. The Flax training script doesn't support features like gradient checkpointing or gradient accumulation yet, so you'll need a GPU with at least 30GB of memory or a TPU v3. diff --git a/docs/source/en/training/unconditional_training.mdx b/docs/source/en/training/unconditional_training.mdx index 26517fd1fcf8..514932d4b22d 100644 --- a/docs/source/en/training/unconditional_training.mdx +++ b/docs/source/en/training/unconditional_training.mdx @@ -122,6 +122,26 @@ accelerate launch train_unconditional.py \ +### Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. 
Here is an example command: + +```bash +accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \ + --dataset_name="huggan/pokemon" \ + --resolution=64 --center_crop --random_flip \ + --output_dir="ddpm-ema-pokemon-64" \ + --train_batch_size=16 \ + --num_epochs=100 \ + --gradient_accumulation_steps=1 \ + --use_ema \ + --learning_rate=1e-4 \ + --lr_warmup_steps=500 \ + --mixed_precision="fp16" \ + --logger="wandb" +``` + ## Finetuning with your own data There are two ways to finetune a model on your own dataset: diff --git a/examples/controlnet/README.md b/examples/controlnet/README.md index 387755624729..571e9e708cf2 100644 --- a/examples/controlnet/README.md +++ b/examples/controlnet/README.md @@ -96,6 +96,29 @@ accelerate launch train_controlnet.py \ --gradient_accumulation_steps=4 ``` +## Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +export MODEL_DIR="runwayml/stable-diffusion-v1-5" +export OUTPUT_DIR="path to save model" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_controlnet.py \ + --pretrained_model_name_or_path=$MODEL_DIR \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=fusing/fill50k \ + --resolution=512 \ + --learning_rate=1e-5 \ + --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --train_batch_size=4 \ + --mixed_precision="fp16" \ + --tracker_project_name="controlnet-demo" \ + --report_to=wandb +``` + ## Example results #### After 300 steps with batch size 8 diff --git a/examples/instruct_pix2pix/README.md b/examples/instruct_pix2pix/README.md index 02f0fed04299..94a7bd2a98f6 100644 --- a/examples/instruct_pix2pix/README.md +++ b/examples/instruct_pix2pix/README.md @@ -113,6 +113,27 @@ accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \ ***Note: In the original paper, the authors observed that even when the model is trained with an image resolution of 256x256, it generalizes well to bigger resolutions such as 512x512. This is likely because of the larger dataset they used during training.*** + ## Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. 
Here is an example command: + +```bash +accelerate launch --mixed_precision="fp16" --multi_gpu train_instruct_pix2pix.py \ + --pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5 \ + --dataset_name=sayakpaul/instructpix2pix-1000-samples \ + --use_ema \ + --enable_xformers_memory_efficient_attention \ + --resolution=512 --random_flip \ + --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \ + --max_train_steps=15000 \ + --checkpointing_steps=5000 --checkpoints_total_limit=1 \ + --learning_rate=5e-05 --lr_warmup_steps=0 \ + --conditioning_dropout_prob=0.05 \ + --mixed_precision=fp16 \ + --seed=42 +``` + ## Inference Once training is complete, we can perform inference: diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index c84db0ceee64..406a64b3759f 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -111,6 +111,31 @@ image = pipe(prompt="yoda").images[0] image.save("yoda-pokemon.png") ``` +#### Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export dataset_name="lambdalabs/pokemon-blip-captions" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$dataset_name \ + --use_ema \ + --resolution=512 --center_crop --random_flip \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --max_train_steps=15000 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model" +``` + + #### Training with Min-SNR weighting We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 4bbf4706f01c..67724698c099 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -64,8 +64,8 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight pipeline = StableDiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, - vae=vae, - text_encoder=text_encoder, + vae=accelerator.unwrap_model(vae), + text_encoder=accelerator.unwrap_model(text_encoder), tokenizer=tokenizer, unet=accelerator.unwrap_model(unet), safety_checker=None, diff --git a/examples/unconditional_image_generation/README.md b/examples/unconditional_image_generation/README.md index db06d9011681..d83dc928c7a1 100644 --- a/examples/unconditional_image_generation/README.md +++ b/examples/unconditional_image_generation/README.md @@ -1,4 +1,4 @@ -## Training examples +## Training an unconditional diffusion model Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets). @@ -76,6 +76,27 @@ A full training run takes 2 hours on 4xV100 GPUs. +### Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. 
Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \ + --dataset_name="huggan/pokemon" \ + --resolution=64 --center_crop --random_flip \ + --output_dir="ddpm-ema-pokemon-64" \ + --train_batch_size=16 \ + --num_epochs=100 \ + --gradient_accumulation_steps=1 \ + --use_ema \ + --learning_rate=1e-4 \ + --lr_warmup_steps=500 \ + --mixed_precision="fp16" \ + --logger="wandb" +``` + +To be able to use Weights and Biases (`wandb`) as a logger you need to install the library: `pip install wandb`. ### Using your own data From cd8b7507c2c674046be921a3954f64a9d1e83d0f Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 18 Apr 2023 02:02:25 -1000 Subject: [PATCH 21/71] speed up attend-and-excite fast tests (#3079) --- .../test_stable_diffusion_attend_and_excite.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index f153ae08cbb6..846e251f3ce2 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -44,7 +44,7 @@ def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( block_out_channels=(32, 64), - layers_per_block=2, + layers_per_block=1, sample_size=32, in_channels=4, out_channels=4, @@ -111,7 +111,7 @@ def get_dummy_inputs(self, device, seed=0): "prompt": "a cat and a frog", "token_indices": [2, 5], "generator": generator, - "num_inference_steps": 2, + "num_inference_steps": 1, "guidance_scale": 6.0, "output_type": "numpy", "max_iter_to_alter": 2, @@ -132,13 +132,18 @@ def test_inference(self): image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 64, 64, 3)) - expected_slice = np.array([0.5743, 0.6081, 0.4975, 0.5021, 0.5441, 0.4699, 0.4988, 0.4841, 0.4851]) + expected_slice = np.array( + [0.63905364, 0.62897307, 0.48599017, 0.5133624, 0.5550048, 0.45769516, 0.50326973, 0.5023139, 0.45384496] + ) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 1e-3) def test_inference_batch_consistent(self): # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches - self._test_inference_batch_consistent(batch_sizes=[2, 4]) + self._test_inference_batch_consistent(batch_sizes=[1, 2]) + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(batch_size=2) @require_torch_gpu From 8ecdd3ef657b168a8058a99772871cae91a21b63 Mon Sep 17 00:00:00 2001 From: Cristian Garcia Date: Tue, 18 Apr 2023 07:03:00 -0500 Subject: [PATCH 22/71] Optimize log_validation in train_controlnet_flax (#3110) extract pipeline from log_validation --- examples/controlnet/train_controlnet_flax.py | 35 +++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index 0b413ace09d2..24b32e7f4301 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -76,20 +76,11 @@ def image_grid(imgs, rows, cols): return grid -def log_validation(controlnet, controlnet_params, tokenizer, args, rng, weight_dtype): - 
logger.info("Running validation... ") +def log_validation(pipeline, pipeline_params, controlnet_params, tokenizer, args, rng, weight_dtype): + logger.info("Running validation...") - pipeline, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( - args.pretrained_model_name_or_path, - tokenizer=tokenizer, - controlnet=controlnet, - safety_checker=None, - dtype=weight_dtype, - revision=args.revision, - from_pt=args.from_pt, - ) - params = jax_utils.replicate(params) - params["controlnet"] = controlnet_params + pipeline_params = pipeline_params.copy() + pipeline_params["controlnet"] = controlnet_params num_samples = jax.device_count() prng_seed = jax.random.split(rng, jax.device_count()) @@ -121,7 +112,7 @@ def log_validation(controlnet, controlnet_params, tokenizer, args, rng, weight_d images = pipeline( prompt_ids=prompt_ids, image=processed_image, - params=params, + params=pipeline_params, prng_seed=prng_seed, num_inference_steps=50, jit=True, @@ -176,6 +167,7 @@ def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=N - text-to-image - diffusers - controlnet +- jax-diffusers-event inference: true --- """ @@ -800,6 +792,17 @@ def main(): ]: controlnet_params[key] = unet_params[key] + pipeline, pipeline_params = FlaxStableDiffusionControlNetPipeline.from_pretrained( + args.pretrained_model_name_or_path, + tokenizer=tokenizer, + controlnet=controlnet, + safety_checker=None, + dtype=weight_dtype, + revision=args.revision, + from_pt=args.from_pt, + ) + pipeline_params = jax_utils.replicate(pipeline_params) + # Optimization if args.scale_lr: args.learning_rate = args.learning_rate * total_train_batch_size @@ -1073,7 +1076,7 @@ def l2(xs): and global_step % args.validation_steps == 0 and jax.process_index() == 0 ): - _ = log_validation(controlnet, state.params, tokenizer, args, validation_rng, weight_dtype) + _ = log_validation(pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype) if global_step % args.logging_steps == 0 and jax.process_index() == 0: if args.report_to == "wandb": @@ -1105,7 +1108,7 @@ def l2(xs): if args.validation_prompt is not None: if args.profile_validation: jax.profiler.start_trace(args.output_dir) - image_logs = log_validation(controlnet, state.params, tokenizer, args, validation_rng, weight_dtype) + image_logs = log_validation(pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype) if args.profile_validation: jax.profiler.stop_trace() else: From f2df39fa0e6246d13aea03364366b2d53a4ab5f9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 18 Apr 2023 14:03:17 +0200 Subject: [PATCH 23/71] make style --- examples/controlnet/train_controlnet_flax.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index 24b32e7f4301..b25f9325403f 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -1076,7 +1076,9 @@ def l2(xs): and global_step % args.validation_steps == 0 and jax.process_index() == 0 ): - _ = log_validation(pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype) + _ = log_validation( + pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype + ) if global_step % args.logging_steps == 0 and jax.process_index() == 0: if args.report_to == "wandb": @@ -1108,7 +1110,9 @@ def l2(xs): if args.validation_prompt is not None: if 
args.profile_validation: jax.profiler.start_trace(args.output_dir) - image_logs = log_validation(pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype) + image_logs = log_validation( + pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype + ) if args.profile_validation: jax.profiler.stop_trace() else: From 4bc157ffa90a2a967247952a82ea76bea5c5d990 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 18 Apr 2023 17:35:12 +0200 Subject: [PATCH 24/71] Correct textual inversion readme (#3145) * Update README.md * Apply suggestions from code review --- examples/textual_inversion/README.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/textual_inversion/README.md b/examples/textual_inversion/README.md index 3a7c96be69fb..4d420b284f38 100644 --- a/examples/textual_inversion/README.md +++ b/examples/textual_inversion/README.md @@ -39,29 +39,31 @@ accelerate config ### Cat toy example -You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree. - -You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens). - -Run the following command to authenticate your token +First, let's login so that we can upload the checkpoint to the Hub during training: ```bash huggingface-cli login ``` -If you have already cloned the repo, then you won't need to go through these steps. +Now let's get our dataset. For this example we will use some cat images: https://huggingface.co/datasets/diffusers/cat_toy_example . -
+Let's first download it locally: -Now let's get our dataset.Download 3-4 images from [here](https://drive.google.com/drive/folders/1fmJMs25nxS_rSNqS5hTcRdLem_YQXbq5) and save them in a directory. This will be our training data. +```py +from huggingface_hub import snapshot_download -And launch the training using +local_dir = "./cat" +snapshot_download("diffusers/cat_toy_example", local_dir=local_dir, repo_type="dataset", ignore_patterns=".gitattributes") +``` + +This will be our training data. +Now we can launch the training using **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export DATA_DIR="path-to-dir-containing-images" +export DATA_DIR="./cat" accelerate launch textual_inversion.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -75,6 +77,7 @@ accelerate launch textual_inversion.py \ --learning_rate=5.0e-04 --scale_lr \ --lr_scheduler="constant" \ --lr_warmup_steps=0 \ + --push_to_hub \ --output_dir="textual_inversion_cat" ``` From f0c74e9a756daf5295105444470655aacce5cd9c Mon Sep 17 00:00:00 2001 From: Will Berman Date: Tue, 18 Apr 2023 14:13:16 -0700 Subject: [PATCH 25/71] Add unet act fn to other model components (#3136) Adding act fn config to the unet timestep class embedding and conv activation. The custom activation defaults to silu which is the default activation function for both the conv act and the timestep class embeddings so default behavior is not changed. The only unet which use the custom activation is the stable diffusion latent upscaler https://huggingface.co/stabilityai/sd-x2-latent-upscaler/blob/main/unet/config.json (I ran a script against the hub to confirm). The latent upscaler does not use the conv activation nor the timestep class embeddings so we don't change its behavior. 
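For reference, a minimal standalone sketch of the activation lookup that the hunks below introduce. The helper name `resolve_conv_act` is mine and not part of the patch; it only covers the four names handled in the diff, and `"silu"` remains the default, so existing configs keep their behavior:

```py
import torch
import torch.nn as nn
import torch.nn.functional as F


def resolve_conv_act(act_fn: str):
    # Mirrors the if/elif chain added to the UNet constructors in the hunks below.
    if act_fn == "swish":
        return lambda x: F.silu(x)
    elif act_fn == "mish":
        return nn.Mish()
    elif act_fn == "silu":
        return nn.SiLU()
    elif act_fn == "gelu":
        return nn.GELU()
    raise ValueError(f"Unsupported activation function: {act_fn}")


conv_act = resolve_conv_act("silu")
print(conv_act(torch.randn(2, 4)).shape)  # torch.Size([2, 4])
```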
--- src/diffusers/models/unet_2d_condition.py | 15 +++++++++++++-- .../versatile_diffusion/modeling_text_unet.py | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index b2814356939b..29de8734d4e7 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -248,7 +248,7 @@ def __init__( if class_embed_type is None and num_class_embeds is not None: self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -437,7 +437,18 @@ def __init__( self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps ) - self.conv_act = nn.SiLU() + + if act_fn == "swish": + self.conv_act = lambda x: F.silu(x) + elif act_fn == "mish": + self.conv_act = nn.Mish() + elif act_fn == "silu": + self.conv_act = nn.SiLU() + elif act_fn == "gelu": + self.conv_act = nn.GELU() + else: + raise ValueError(f"Unsupported activation function: {act_fn}") + else: self.conv_norm_out = None self.conv_act = None diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 4377be1181a8..b20f18c485d0 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -345,7 +345,7 @@ def __init__( if class_embed_type is None and num_class_embeds is not None: self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -534,7 +534,18 @@ def __init__( self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps ) - self.conv_act = nn.SiLU() + + if act_fn == "swish": + self.conv_act = lambda x: F.silu(x) + elif act_fn == "mish": + self.conv_act = nn.Mish() + elif act_fn == "silu": + self.conv_act = nn.SiLU() + elif act_fn == "gelu": + self.conv_act = nn.GELU() + else: + raise ValueError(f"Unsupported activation function: {act_fn}") + else: self.conv_norm_out = None self.conv_act = None From fc1883918ff73564e088a7c655a96f52ff915045 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Tue, 18 Apr 2023 15:05:41 -0700 Subject: [PATCH 26/71] class labels timestep embeddings projection dtype cast (#3137) This mimics the dtype cast for the standard time embeddings --- src/diffusers/models/unet_2d_condition.py | 6 +++++- .../pipelines/versatile_diffusion/modeling_text_unet.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 29de8734d4e7..b4997a257643 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -659,7 +659,7 @@ def 
forward( t_emb = self.time_proj(timesteps) - # timesteps does not contain any weights and will always return f32 tensors + # `Timesteps` does not contain any weights and will always return f32 tensors # but time_embedding might actually be running in fp16. so we need to cast here. # there might be better ways to encapsulate this. t_emb = t_emb.to(dtype=self.dtype) @@ -673,6 +673,10 @@ def forward( if self.config.class_embed_type == "timestep": class_labels = self.time_proj(class_labels) + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. + class_labels = class_labels.to(dtype=sample.dtype) + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) if self.config.class_embeddings_concat: diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index b20f18c485d0..2a7b80d01da7 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -756,7 +756,7 @@ def forward( t_emb = self.time_proj(timesteps) - # timesteps does not contain any weights and will always return f32 tensors + # `Timesteps` does not contain any weights and will always return f32 tensors # but time_embedding might actually be running in fp16. so we need to cast here. # there might be better ways to encapsulate this. t_emb = t_emb.to(dtype=self.dtype) @@ -770,6 +770,10 @@ def forward( if self.config.class_embed_type == "timestep": class_labels = self.time_proj(class_labels) + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. + class_labels = class_labels.to(dtype=sample.dtype) + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) if self.config.class_embeddings_concat: From bdeff4d64a57e556c2b62f887da03a2c37c54d54 Mon Sep 17 00:00:00 2001 From: cmdr2 Date: Wed, 19 Apr 2023 18:07:07 +0530 Subject: [PATCH 27/71] [ckpt loader] Allow loading the Inpaint and Img2Img pipelines, while loading a ckpt model (#2705) * [ckpt loader] Allow loading the Inpaint and Img2Img pipelines, while loading a ckpt model * Address review comment from PR * PyLint formatting * Some more pylint fixes, unrelated to our change * Another pylint fix * Styling fix --- .../stable_diffusion/convert_from_ckpt.py | 97 +++++++++++++++---- 1 file changed, 78 insertions(+), 19 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index a16213639526..dbc1b27e88be 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -45,6 +45,8 @@ PNDMScheduler, PriorTransformer, StableDiffusionControlNetPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, @@ -979,6 +981,7 @@ def download_from_original_stable_diffusion_ckpt( image_size: int = 512, prediction_type: str = None, model_type: str = None, + is_img2img: bool = False, extract_ema: bool = False, scheduler_type: str = "pndm", num_in_channels: Optional[int] = None, @@ -1018,6 +1021,8 @@ def download_from_original_stable_diffusion_ckpt( model_type (`str`, *optional*, defaults to `None`): The pipeline type. 
`None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder", "FrozenCLIPEmbedder", "PaintByExample"]`. + is_img2img (`bool`, *optional*, defaults to `False`): + Whether the model should be loaded as an img2img pipeline. extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for @@ -1193,16 +1198,44 @@ def download_from_original_stable_diffusion_ckpt( requires_safety_checker=False, ) else: - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) + if ( + hasattr(original_config, "model") + and hasattr(original_config.model, "target") + and "LatentInpaintDiffusion" in original_config.model.target + ): + pipe = StableDiffusionInpaintPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + else: + if is_img2img: + pipe = StableDiffusionImg2ImgPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + else: + pipe = StableDiffusionPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) else: image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components( original_config, clip_stats_path=clip_stats_path, device=device @@ -1293,15 +1326,41 @@ def download_from_original_stable_diffusion_ckpt( feature_extractor=feature_extractor, ) else: - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) + if ( + hasattr(original_config, "model") + and hasattr(original_config.model, "target") + and "LatentInpaintDiffusion" in original_config.model.target + ): + pipe = StableDiffusionInpaintPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + else: + if is_img2img: + pipe = StableDiffusionImg2ImgPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + else: + pipe = StableDiffusionPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) From 86ecd4b795f865b5b615b8c54991c177bb3dbef5 Mon Sep 17 00:00:00 2001 From: 1lint <105617163+1lint@users.noreply.github.com> Date: Wed, 19 Apr 2023 11:07:36 -0500 Subject: [PATCH 28/71] add from_ckpt method as Mixin (#2318) * add mixin class for pipeline from original sd ckpt * Improve * make style * merge main into * Improve more * fix more * up * Apply suggestions from code review * finish docs * rename * make style 
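For orientation, a short usage sketch of the entry point this commit adds. The checkpoint link is taken from the docstring example added later in this patch; running it requires network access and a CUDA device, so treat it as illustrative rather than a test:

```py
import torch

from diffusers import StableDiffusionPipeline

# Build a full pipeline directly from an original CompVis-style checkpoint on the Hub.
pipe = StableDiffusionPipeline.from_ckpt(
    "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
    torch_dtype=torch.float16,
)
pipe.to("cuda")
```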
--------- Co-authored-by: Patrick von Platen --- docs/source/en/api/loaders.mdx | 4 + .../pipelines/stable_diffusion/controlnet.mdx | 1 + .../pipelines/stable_diffusion/depth2img.mdx | 5 +- .../pipelines/stable_diffusion/img2img.mdx | 6 +- .../pipelines/stable_diffusion/inpaint.mdx | 5 +- .../pipelines/stable_diffusion/pix2pix.mdx | 3 + .../pipelines/stable_diffusion/text2img.mdx | 4 + src/diffusers/__init__.py | 1 - src/diffusers/loaders.py | 198 ++++++++++++++++++ .../alt_diffusion/pipeline_alt_diffusion.py | 8 + .../pipeline_alt_diffusion_img2img.py | 8 + .../stable_diffusion/convert_from_ckpt.py | 146 +++++-------- .../pipeline_stable_diffusion.py | 12 +- .../pipeline_stable_diffusion_controlnet.py | 3 + .../pipeline_stable_diffusion_depth2img.py | 11 +- .../pipeline_stable_diffusion_img2img.py | 12 +- .../pipeline_stable_diffusion_inpaint.py | 11 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 14 +- ...eline_stable_diffusion_instruct_pix2pix.py | 11 +- .../dummy_torch_and_transformers_objects.py | 15 -- .../stable_diffusion/test_stable_diffusion.py | 57 +++++ 21 files changed, 410 insertions(+), 125 deletions(-) diff --git a/docs/source/en/api/loaders.mdx b/docs/source/en/api/loaders.mdx index 8cbf21b8e0cf..20134a0afe66 100644 --- a/docs/source/en/api/loaders.mdx +++ b/docs/source/en/api/loaders.mdx @@ -36,3 +36,7 @@ API to load such adapter neural networks via the [`loaders.py` module](https://g ### LoraLoaderMixin [[autodoc]] loaders.LoraLoaderMixin + +### FromCkptMixin + +[[autodoc]] loaders.FromCkptMixin diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index af859177c002..dabd3ded31ce 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -308,6 +308,7 @@ All checkpoints can be found under the authors' namespace [lllyasviel](https://h - disable_vae_slicing - enable_xformers_memory_efficient_attention - disable_xformers_memory_efficient_attention + - load_textual_inversion ## FlaxStableDiffusionControlNetPipeline [[autodoc]] FlaxStableDiffusionControlNetPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx b/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx index c46576ff2887..a91167bac58c 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx @@ -30,4 +30,7 @@ Available Checkpoints are: - enable_attention_slicing - disable_attention_slicing - enable_xformers_memory_efficient_attention - - disable_xformers_memory_efficient_attention \ No newline at end of file + - disable_xformers_memory_efficient_attention + - load_textual_inversion + - load_lora_weights + - save_lora_weights diff --git a/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx b/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx index 09bfb853f9c9..7959c588608b 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx @@ -30,7 +30,11 @@ proposed by Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan - disable_attention_slicing - enable_xformers_memory_efficient_attention - disable_xformers_memory_efficient_attention + - load_textual_inversion + - from_ckpt + - load_lora_weights + - save_lora_weights [[autodoc]] FlaxStableDiffusionImg2ImgPipeline - all - - __call__ \ No newline at end of file + - __call__ diff --git 
a/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx b/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx index 33e84a63261f..39e5ae0fd37d 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx @@ -31,7 +31,10 @@ Available checkpoints are: - disable_attention_slicing - enable_xformers_memory_efficient_attention - disable_xformers_memory_efficient_attention + - load_textual_inversion + - load_lora_weights + - save_lora_weights [[autodoc]] FlaxStableDiffusionInpaintPipeline - all - - __call__ \ No newline at end of file + - __call__ diff --git a/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx b/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx index 42cd4b896b2e..d01f1df23385 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx @@ -68,3 +68,6 @@ images[0].save("snowy_mountains.png") [[autodoc]] StableDiffusionInstructPix2PixPipeline - __call__ - all + - load_textual_inversion + - load_lora_weights + - save_lora_weights diff --git a/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx b/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx index 6b8d53bf6510..ce78434fdbaa 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx @@ -39,6 +39,10 @@ Available Checkpoints are: - disable_xformers_memory_efficient_attention - enable_vae_tiling - disable_vae_tiling + - load_textual_inversion + - from_ckpt + - load_lora_weights + - save_lora_weights [[autodoc]] FlaxStableDiffusionPipeline - all diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 07c17100e0e0..40029fcecfd1 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -109,7 +109,6 @@ except OptionalDependencyNotAvailable: from .utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .loaders import TextualInversionLoaderMixin from .pipelines import ( AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index e814981a85c9..3133da117390 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -13,9 +13,11 @@ # limitations under the License. 
import os from collections import defaultdict +from pathlib import Path from typing import Callable, Dict, List, Optional, Union import torch +from huggingface_hub import hf_hub_download from .models.attention_processor import LoRAAttnProcessor from .utils import ( @@ -431,6 +433,7 @@ def load_textual_inversion( Example: To load a textual inversion embedding vector in `diffusers` format: + ```py from diffusers import StableDiffusionPipeline import torch @@ -463,6 +466,7 @@ def load_textual_inversion( image = pipe(prompt, num_inference_steps=50).images[0] image.save("character.png") ``` + """ if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PreTrainedTokenizer): raise ValueError( @@ -1051,3 +1055,197 @@ def save_function(weights, filename): save_function(state_dict, os.path.join(save_directory, weight_name)) logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") + + +class FromCkptMixin: + """This helper class allows to directly load .ckpt stable diffusion file_extension + into the respective classes.""" + + @classmethod + def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): + r""" + Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights saved in the original .ckpt format. + + The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). + + Parameters: + pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + - A link to the .ckpt file on the Hub. Should be in the format + `"https://huggingface.co//blob/main/"` + - A path to a *file* containing all pipeline weights. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype + will be automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + use_safetensors (`bool`, *optional* ): + If set to `True`, the pipeline will be loaded from `safetensors` weights. If set to `None` (the + default). 
The pipeline will load using `safetensors` if the safetensors weights are available *and* if + `safetensors` is installed. If the to `False` the pipeline will *not* use `safetensors`. + extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for + checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults + to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for + inference. Non-EMA weights are usually better to continue fine-tuning. + upcast_attention (`bool`, *optional*, defaults to `None`): + Whether the attention computation should always be upcasted. This is necessary when running stable + image_size (`int`, *optional*, defaults to 512): + The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2 + Base. Use 768 for Stable Diffusion v2. + prediction_type (`str`, *optional*): + The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable + Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2. + num_in_channels (`int`, *optional*, defaults to None): + The number of input channels. If `None`, it will be automatically inferred. + scheduler_type (`str`, *optional*, defaults to 'pndm'): + Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", + "ddim"]`. + load_safety_checker (`bool`, *optional*, defaults to `True`): + Whether to load the safety checker or not. Defaults to `True`. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the + specific pipeline class. The overwritten components are then directly passed to the pipelines + `__init__` method. See example below for more information. + + Examples: + + ```py + >>> from diffusers import StableDiffusionPipeline + + >>> # Download pipeline from huggingface.co and cache. + >>> pipeline = StableDiffusionPipeline.from_ckpt( + ... "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors" + ... ) + + >>> # Download pipeline from local file + >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt + >>> pipeline = StableDiffusionPipeline.from_ckpt("./v1-5-pruned-emaonly") + + >>> # Enable float16 and move to GPU + >>> pipeline = StableDiffusionPipeline.from_ckpt( + ... "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt", + ... torch_dtype=torch.float16, + ... 
) + >>> pipeline.to("cuda") + ``` + """ + # import here to avoid circular dependency + from .pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt + + cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + extract_ema = kwargs.pop("extract_ema", False) + image_size = kwargs.pop("image_size", 512) + scheduler_type = kwargs.pop("scheduler_type", "pndm") + num_in_channels = kwargs.pop("num_in_channels", None) + upcast_attention = kwargs.pop("upcast_attention", None) + load_safety_checker = kwargs.pop("load_safety_checker", True) + prediction_type = kwargs.pop("prediction_type", None) + + torch_dtype = kwargs.pop("torch_dtype", None) + + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) + + pipeline_name = cls.__name__ + file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1] + from_safetensors = file_extension == "safetensors" + + if from_safetensors and use_safetensors is True: + raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.") + + # TODO: For now we only support stable diffusion + stable_unclip = None + controlnet = False + + if pipeline_name == "StableDiffusionControlNetPipeline": + model_type = "FrozenCLIPEmbedder" + controlnet = True + elif "StableDiffusion" in pipeline_name: + model_type = "FrozenCLIPEmbedder" + elif pipeline_name == "StableUnCLIPPipeline": + model_type == "FrozenOpenCLIPEmbedder" + stable_unclip = "txt2img" + elif pipeline_name == "StableUnCLIPImg2ImgPipeline": + model_type == "FrozenOpenCLIPEmbedder" + stable_unclip = "img2img" + elif pipeline_name == "PaintByExamplePipeline": + model_type == "PaintByExample" + elif pipeline_name == "LDMTextToImagePipeline": + model_type == "LDMTextToImage" + else: + raise ValueError(f"Unhandled pipeline class: {pipeline_name}") + + # remove huggingface url + for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]: + if pretrained_model_link_or_path.startswith(prefix): + pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :] + + # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained + ckpt_path = Path(pretrained_model_link_or_path) + if not ckpt_path.is_file(): + # get repo_id and (potentially nested) file path of ckpt in repo + repo_id = str(Path().joinpath(*ckpt_path.parts[:2])) + file_path = str(Path().joinpath(*ckpt_path.parts[2:])) + + if file_path.startswith("blob/"): + file_path = file_path[len("blob/") :] + + if file_path.startswith("main/"): + file_path = file_path[len("main/") :] + + pretrained_model_link_or_path = hf_hub_download( + repo_id, + filename=file_path, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + ) + + pipe = download_from_original_stable_diffusion_ckpt( + pretrained_model_link_or_path, + pipeline_class=cls, + model_type=model_type, + stable_unclip=stable_unclip, + controlnet=controlnet, + from_safetensors=from_safetensors, + extract_ema=extract_ema, + image_size=image_size, + scheduler_type=scheduler_type, + 
num_in_channels=num_in_channels, + upcast_attention=upcast_attention, + load_safety_checker=load_safety_checker, + prediction_type=prediction_type, + ) + + if torch_dtype is not None: + pipe.to(torch_dtype=torch_dtype) + + return pipe diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index bf314b91116e..ff9474ffd43a 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -57,6 +57,14 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 86fc47f424e9..dee4a91924f7 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -96,6 +96,14 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
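As a side note, the string handling in the `from_ckpt` body above can be checked in isolation. This is a pure-Python sketch with no network access; the link uses the same format as the docstring example, with the Hub prefix (`https://huggingface.co/`) assumed to be stripped already, as `from_ckpt` does:

```py
from pathlib import Path

link = "runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"

ckpt_path = Path(link)
repo_id = str(Path().joinpath(*ckpt_path.parts[:2]))    # "runwayml/stable-diffusion-v1-5"
file_path = str(Path().joinpath(*ckpt_path.parts[2:]))  # "blob/main/v1-5-pruned-emaonly.ckpt"

# Drop the web-UI path segments so only the repo-relative filename is left.
if file_path.startswith("blob/"):
    file_path = file_path[len("blob/") :]
if file_path.startswith("main/"):
    file_path = file_path[len("main/") :]

print(repo_id, file_path)  # runwayml/stable-diffusion-v1-5 v1-5-pruned-emaonly.ckpt
```

The resulting `repo_id` and `file_path` are what get passed to `hf_hub_download` in the mixin.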
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index dbc1b27e88be..5961636dd197 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -31,35 +31,30 @@ CLIPVisionModelWithProjection, ) -from diffusers import ( +from ...models import ( AutoencoderKL, ControlNetModel, + PriorTransformer, + UNet2DConditionModel, +) +from ...schedulers import ( DDIMScheduler, DDPMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, HeunDiscreteScheduler, - LDMTextToImagePipeline, LMSDiscreteScheduler, PNDMScheduler, - PriorTransformer, - StableDiffusionControlNetPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionPipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, UnCLIPScheduler, - UNet2DConditionModel, ) -from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel -from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline -from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer - from ...utils import is_omegaconf_available, is_safetensors_available, logging from ...utils.import_utils import BACKENDS_MAPPING +from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel +from ..paint_by_example import PaintByExampleImageEncoder +from ..pipeline_utils import DiffusionPipeline +from .safety_checker import StableDiffusionSafetyChecker +from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -981,7 +976,6 @@ def download_from_original_stable_diffusion_ckpt( image_size: int = 512, prediction_type: str = None, model_type: str = None, - is_img2img: bool = False, extract_ema: bool = False, scheduler_type: str = "pndm", num_in_channels: Optional[int] = None, @@ -993,7 +987,8 @@ def download_from_original_stable_diffusion_ckpt( clip_stats_path: Optional[str] = None, controlnet: Optional[bool] = None, load_safety_checker: bool = True, -) -> StableDiffusionPipeline: + pipeline_class: DiffusionPipeline = None, +) -> DiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file. @@ -1031,12 +1026,29 @@ def download_from_original_stable_diffusion_ckpt( Whether the attention computation should always be upcasted. This is necessary when running stable diffusion 2.1. device (`str`, *optional*, defaults to `None`): - The device to use. Pass `None` to determine automatically. :param from_safetensors: If `checkpoint_path` is - in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A - StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. + The device to use. Pass `None` to determine automatically. + from_safetensors (`str`, *optional*, defaults to `False`): + If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. load_safety_checker (`bool`, *optional*, defaults to `True`): Whether to load the safety checker or not. Defaults to `True`. + pipeline_class (`str`, *optional*, defaults to `None`): + The pipeline class to use. 
Pass `None` to determine automatically. + return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ + + # import pipelines here to avoid circular import error when using from_ckpt method + from diffusers import ( + LDMTextToImagePipeline, + PaintByExamplePipeline, + StableDiffusionControlNetPipeline, + StableDiffusionPipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) + + if pipeline_class is None: + pipeline_class = StableDiffusionPipeline + if prediction_type == "v-prediction": prediction_type = "v_prediction" @@ -1198,44 +1210,16 @@ def download_from_original_stable_diffusion_ckpt( requires_safety_checker=False, ) else: - if ( - hasattr(original_config, "model") - and hasattr(original_config.model, "target") - and "LatentInpaintDiffusion" in original_config.model.target - ): - pipe = StableDiffusionInpaintPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - else: - if is_img2img: - pipe = StableDiffusionImg2ImgPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - else: - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) else: image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components( original_config, clip_stats_path=clip_stats_path, device=device @@ -1326,41 +1310,15 @@ def download_from_original_stable_diffusion_ckpt( feature_extractor=feature_extractor, ) else: - if ( - hasattr(original_config, "model") - and hasattr(original_config.model, "target") - and "LatentInpaintDiffusion" in original_config.model.target - ): - pipe = StableDiffusionInpaintPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - else: - if is_img2img: - pipe = StableDiffusionImg2ImgPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - else: - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) @@ -1379,7 +1337,7 @@ def download_controlnet_from_original_ckpt( upcast_attention: Optional[bool] = None, device: str = None, from_safetensors: bool = False, -) -> StableDiffusionPipeline: +) -> DiffusionPipeline: if not is_omegaconf_available(): raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 689febe3e891..7347d70c4023 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -20,7 +20,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -53,13 +53,21 @@ """ -class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 3b8889d92b55..322f2232fc8a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -156,6 +156,9 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
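To make the loader mixins listed in these docstrings concrete, here is an illustrative combination; the textual-inversion repo id and the LoRA path are placeholders chosen for the example, not part of this patch:

```py
import torch

from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe.load_textual_inversion("sd-concepts-library/cat-toy")  # TextualInversionLoaderMixin
pipe.load_lora_weights("path/to/lora-weights")              # LoraLoaderMixin (placeholder path)
```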
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 4fe117ba120b..c4f9ae59a4e9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -23,7 +23,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation from ...configuration_utils import FrozenDict -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, logging, randn_tensor @@ -55,13 +55,20 @@ def preprocess(image): return image -class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 5860a53ad528..c26ddf06cadc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -23,7 +23,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -92,13 +92,21 @@ def preprocess(image): return image -class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 8e0ea5a8d079..fb2e5dc424e3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor @@ -138,13 +138,20 @@ def prepare_mask_and_masked_image(image, mask): return mask, masked_image -class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 6d9cbaf67a07..1c8377c7e54e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import TextualInversionLoaderMixin +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -82,13 +82,23 @@ def preprocess_mask(mask, scale_factor=8): return mask -class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionInpaintPipelineLegacy( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin +): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index f7999a08dc9b..49944cdcd636 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -20,7 +20,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -61,13 +61,20 @@ def preprocess(image): return image -class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 8a521457f2e3..bda56d2ae8ae 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -2,21 +2,6 @@ from ..utils import DummyObject, requires_backends -class TextualInversionLoaderMixin(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class AltDiffusionImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 14421a64b9e8..fcfcd84c5d48 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -36,6 +36,7 @@ UNet2DConditionModel, logging, ) +from diffusers.models.attention_processor import AttnProcessor from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu @@ -865,6 +866,62 @@ def test_stable_diffusion_textual_inversion(self): assert max_diff < 5e-2 +@slow +@require_torch_gpu +class StableDiffusionPipelineCkptTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_download_from_hub(self): + ckpt_paths = [ + "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt", + "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix_base.ckpt", + ] + + for ckpt_path in ckpt_paths: + pipe = StableDiffusionPipeline.from_ckpt(ckpt_path, torch_dtype=torch.float16) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.to("cuda") + + image_out = pipe("test", num_inference_steps=1, output_type="np").images[0] + + assert image_out.shape == (512, 512, 3) + + def test_download_local(self): + filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.ckpt") + + pipe = StableDiffusionPipeline.from_ckpt(filename, torch_dtype=torch.float16) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.to("cuda") + + image_out = pipe("test", num_inference_steps=1, output_type="np").images[0] + + assert image_out.shape == (512, 512, 3) + + def test_download_ckpt_diff_format_is_same(self): + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt" + + pipe = StableDiffusionPipeline.from_ckpt(ckpt_path) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.unet.set_attn_processor(AttnProcessor()) + pipe.to("cuda") + + generator = 
torch.Generator(device="cpu").manual_seed(0) + image_ckpt = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0] + + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.unet.set_attn_processor(AttnProcessor()) + pipe.to("cuda") + + generator = torch.Generator(device="cpu").manual_seed(0) + image = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0] + + assert np.max(np.abs(image - image_ckpt)) < 1e-4 + + @nightly @require_torch_gpu class StableDiffusionPipelineNightlyTests(unittest.TestCase): From bba1c1de151bf0ff0b47a7b81a8251c3bed1db1f Mon Sep 17 00:00:00 2001 From: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Date: Wed, 19 Apr 2023 09:51:03 -0700 Subject: [PATCH 29/71] Add TensorRT SD/txt2img Community Pipeline to diffusers along with TensorRT utils (#2974) * Add SD/txt2img Community Pipeline to diffusers along with TensorRT utils Signed-off-by: Asfiya Baig * update installation command Signed-off-by: Asfiya Baig * update tensorrt installation Signed-off-by: Asfiya Baig * changes 1. Update setting of cache directory 2. Address comments: merge utils and pipeline code. 3. Address comments: Add section in README Signed-off-by: Asfiya Baig * apply make style Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig Co-authored-by: Patrick von Platen --- examples/community/README.md | 33 +- .../stable_diffusion_tensorrt_txt2img.py | 926 ++++++++++++++++++ 2 files changed, 958 insertions(+), 1 deletion(-) create mode 100644 examples/community/stable_diffusion_tensorrt_txt2img.py diff --git a/examples/community/README.md b/examples/community/README.md index 11da90764579..8b5b1743203d 100644 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -31,7 +31,7 @@ MagicMix | Diffusion Pipeline for semantic mixing of an image and a text prompt | UnCLIP Image Interpolation Pipeline | Diffusion Pipeline that allows passing two images/image_embeddings and produces images while interpolating between their image-embeddings | [UnCLIP Image Interpolation Pipeline](#unclip-image-interpolation-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) | | DDIM Noise Comparative Analysis Pipeline | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227)) | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline) | - |[Aengus (Duc-Anh)](https://github.com/aengusng8) | | CLIP Guided Img2Img Stable Diffusion Pipeline | Doing CLIP guidance for image to image generation with Stable Diffusion | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion) | - | [Nipun Jindal](https://github.com/nipunjindal/) | - +| TensorRT Stable Diffusion Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - |[Asfiya Baig](https://github.com/asfiyab-nvidia) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. 
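In practice, loading any of the community pipelines listed in the table above comes down to a single `from_pretrained` call with the `custom_pipeline` argument. A minimal sketch follows; the base model and the community pipeline names used here are only examples, not requirements of this patch.

```python
from diffusers import DiffusionPipeline

# `custom_pipeline` names a file in diffusers/examples/community
# (both identifiers below are illustrative).
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="lpw_stable_diffusion",
)
pipe = pipe.to("cuda")
```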
@@ -1130,3 +1130,34 @@ Init Image Output Image ![img2img_clip_guidance](https://huggingface.co/datasets/njindal/images/resolve/main/clip_guided_img2img.jpg) + +### TensorRT Text2Image Stable Diffusion Pipeline + +The TensorRT Pipeline can be used to accelerate the Text2Image Stable Diffusion Inference run. + +NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes. + +```python +import torch +from diffusers import DDIMScheduler +from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline + +# Use the DDIMScheduler scheduler here instead +scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1", + subfolder="scheduler") + +pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", + custom_pipeline="stable_diffusion_tensorrt_txt2img", + revision='fp16', + torch_dtype=torch.float16, + scheduler=scheduler,) + +# re-use cached folder to save ONNX models and TensorRT Engines +pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", revision='fp16',) + +pipe = pipe.to("cuda") + +prompt = "a beautiful photograph of Mt. Fuji during cherry blossom" +image = pipe(prompt).images[0] +image.save('tensorrt_mt_fuji.png') +``` diff --git a/examples/community/stable_diffusion_tensorrt_txt2img.py b/examples/community/stable_diffusion_tensorrt_txt2img.py new file mode 100644 index 000000000000..7aef2bec743f --- /dev/null +++ b/examples/community/stable_diffusion_tensorrt_txt2img.py @@ -0,0 +1,926 @@ +# +# Copyright 2023 The HuggingFace Inc. team. +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import os +from collections import OrderedDict +from copy import copy +from typing import List, Optional, Union + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +import tensorrt as trt +import torch +from huggingface_hub import snapshot_download +from onnx import shape_inference +from polygraphy import cuda +from polygraphy.backend.common import bytes_from_path +from polygraphy.backend.onnx.loader import fold_constants +from polygraphy.backend.trt import ( + CreateConfig, + Profile, + engine_from_bytes, + engine_from_network, + network_from_onnx_path, + save_engine, +) +from polygraphy.backend.trt import util as trt_util +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.stable_diffusion import ( + StableDiffusionPipeline, + StableDiffusionPipelineOutput, + StableDiffusionSafetyChecker, +) +from diffusers.schedulers import DDIMScheduler +from diffusers.utils import DIFFUSERS_CACHE, logging + + +""" +Installation instructions +python3 -m pip install --upgrade tensorrt +python3 -m pip install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com +python3 -m pip install onnxruntime +""" + +TRT_LOGGER = trt.Logger(trt.Logger.ERROR) +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Map of numpy dtype -> torch dtype +numpy_to_torch_dtype_dict = { + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.complex64: torch.complex64, + np.complex128: torch.complex128, +} +if np.version.full_version >= "1.24.0": + numpy_to_torch_dtype_dict[np.bool_] = torch.bool +else: + numpy_to_torch_dtype_dict[np.bool] = torch.bool + +# Map of torch dtype -> numpy dtype +torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()} + + +def device_view(t): + return cuda.DeviceView(ptr=t.data_ptr(), shape=t.shape, dtype=torch_to_numpy_dtype_dict[t.dtype]) + + +class Engine: + def __init__(self, engine_path): + self.engine_path = engine_path + self.engine = None + self.context = None + self.buffers = OrderedDict() + self.tensors = OrderedDict() + + def __del__(self): + [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)] + del self.engine + del self.context + del self.buffers + del self.tensors + + def build( + self, + onnx_path, + fp16, + input_profile=None, + enable_preview=False, + enable_all_tactics=False, + timing_cache=None, + workspace_size=0, + ): + logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}") + p = Profile() + if input_profile: + for name, dims in input_profile.items(): + assert len(dims) == 3 + p.add(name, min=dims[0], opt=dims[1], max=dims[2]) + + config_kwargs = {} + + config_kwargs["preview_features"] = [trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805] + if enable_preview: + # Faster dynamic shapes made optional since it increases engine build time. 
+ config_kwargs["preview_features"].append(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805) + if workspace_size > 0: + config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size} + if not enable_all_tactics: + config_kwargs["tactic_sources"] = [] + + engine = engine_from_network( + network_from_onnx_path(onnx_path), + config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs), + save_timing_cache=timing_cache, + ) + save_engine(engine, path=self.engine_path) + + def load(self): + logger.warning(f"Loading TensorRT engine: {self.engine_path}") + self.engine = engine_from_bytes(bytes_from_path(self.engine_path)) + + def activate(self): + self.context = self.engine.create_execution_context() + + def allocate_buffers(self, shape_dict=None, device="cuda"): + for idx in range(trt_util.get_bindings_per_profile(self.engine)): + binding = self.engine[idx] + if shape_dict and binding in shape_dict: + shape = shape_dict[binding] + else: + shape = self.engine.get_binding_shape(binding) + dtype = trt.nptype(self.engine.get_binding_dtype(binding)) + if self.engine.binding_is_input(binding): + self.context.set_binding_shape(idx, shape) + tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device) + self.tensors[binding] = tensor + self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype) + + def infer(self, feed_dict, stream): + start_binding, end_binding = trt_util.get_active_profile_bindings(self.context) + # shallow copy of ordered dict + device_buffers = copy(self.buffers) + for name, buf in feed_dict.items(): + assert isinstance(buf, cuda.DeviceView) + device_buffers[name] = buf + bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()] + noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr) + if not noerror: + raise ValueError("ERROR: inference failed.") + + return self.tensors + + +class Optimizer: + def __init__(self, onnx_graph): + self.graph = gs.import_onnx(onnx_graph) + + def cleanup(self, return_onnx=False): + self.graph.cleanup().toposort() + if return_onnx: + return gs.export_onnx(self.graph) + + def select_outputs(self, keep, names=None): + self.graph.outputs = [self.graph.outputs[o] for o in keep] + if names: + for i, name in enumerate(names): + self.graph.outputs[i].name = name + + def fold_constants(self, return_onnx=False): + onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True) + self.graph = gs.import_onnx(onnx_graph) + if return_onnx: + return onnx_graph + + def infer_shapes(self, return_onnx=False): + onnx_graph = gs.export_onnx(self.graph) + if onnx_graph.ByteSize() > 2147483648: + raise TypeError("ERROR: model size exceeds supported 2GB limit") + else: + onnx_graph = shape_inference.infer_shapes(onnx_graph) + + self.graph = gs.import_onnx(onnx_graph) + if return_onnx: + return onnx_graph + + +class BaseModel: + def __init__(self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77): + self.model = model + self.name = "SD Model" + self.fp16 = fp16 + self.device = device + + self.min_batch = 1 + self.max_batch = max_batch_size + self.min_image_shape = 256 # min image resolution: 256x256 + self.max_image_shape = 1024 # max image resolution: 1024x1024 + self.min_latent_shape = self.min_image_shape // 8 + self.max_latent_shape = self.max_image_shape // 8 + + self.embedding_dim = embedding_dim + self.text_maxlen = text_maxlen + + def 
get_model(self): + return self.model + + def get_input_names(self): + pass + + def get_output_names(self): + pass + + def get_dynamic_axes(self): + return None + + def get_sample_input(self, batch_size, image_height, image_width): + pass + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + return None + + def get_shape_dict(self, batch_size, image_height, image_width): + return None + + def optimize(self, onnx_graph): + opt = Optimizer(onnx_graph) + opt.cleanup() + opt.fold_constants() + opt.infer_shapes() + onnx_opt_graph = opt.cleanup(return_onnx=True) + return onnx_opt_graph + + def check_dims(self, batch_size, image_height, image_width): + assert batch_size >= self.min_batch and batch_size <= self.max_batch + assert image_height % 8 == 0 or image_width % 8 == 0 + latent_height = image_height // 8 + latent_width = image_width // 8 + assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape + assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape + return (latent_height, latent_width) + + def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape): + min_batch = batch_size if static_batch else self.min_batch + max_batch = batch_size if static_batch else self.max_batch + latent_height = image_height // 8 + latent_width = image_width // 8 + min_image_height = image_height if static_shape else self.min_image_shape + max_image_height = image_height if static_shape else self.max_image_shape + min_image_width = image_width if static_shape else self.min_image_shape + max_image_width = image_width if static_shape else self.max_image_shape + min_latent_height = latent_height if static_shape else self.min_latent_shape + max_latent_height = latent_height if static_shape else self.max_latent_shape + min_latent_width = latent_width if static_shape else self.min_latent_shape + max_latent_width = latent_width if static_shape else self.max_latent_shape + return ( + min_batch, + max_batch, + min_image_height, + max_image_height, + min_image_width, + max_image_width, + min_latent_height, + max_latent_height, + min_latent_width, + max_latent_width, + ) + + +def getOnnxPath(model_name, onnx_dir, opt=True): + return os.path.join(onnx_dir, model_name + (".opt" if opt else "") + ".onnx") + + +def getEnginePath(model_name, engine_dir): + return os.path.join(engine_dir, model_name + ".plan") + + +def build_engines( + models: dict, + engine_dir, + onnx_dir, + onnx_opset, + opt_image_height, + opt_image_width, + opt_batch_size=1, + force_engine_rebuild=False, + static_batch=False, + static_shape=True, + enable_preview=False, + enable_all_tactics=False, + timing_cache=None, + max_workspace_size=0, +): + built_engines = {} + if not os.path.isdir(onnx_dir): + os.makedirs(onnx_dir) + if not os.path.isdir(engine_dir): + os.makedirs(engine_dir) + + # Export models to ONNX + for model_name, model_obj in models.items(): + engine_path = getEnginePath(model_name, engine_dir) + if force_engine_rebuild or not os.path.exists(engine_path): + logger.warning("Building Engines...") + logger.warning("Engine build can take a while to complete") + onnx_path = getOnnxPath(model_name, onnx_dir, opt=False) + onnx_opt_path = getOnnxPath(model_name, onnx_dir) + if force_engine_rebuild or not os.path.exists(onnx_opt_path): + if force_engine_rebuild or not os.path.exists(onnx_path): + logger.warning(f"Exporting model: {onnx_path}") + model = model_obj.get_model() + with torch.inference_mode(), 
torch.autocast("cuda"): + inputs = model_obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width) + torch.onnx.export( + model, + inputs, + onnx_path, + export_params=True, + opset_version=onnx_opset, + do_constant_folding=True, + input_names=model_obj.get_input_names(), + output_names=model_obj.get_output_names(), + dynamic_axes=model_obj.get_dynamic_axes(), + ) + del model + torch.cuda.empty_cache() + gc.collect() + else: + logger.warning(f"Found cached model: {onnx_path}") + + # Optimize onnx + if force_engine_rebuild or not os.path.exists(onnx_opt_path): + logger.warning(f"Generating optimizing model: {onnx_opt_path}") + onnx_opt_graph = model_obj.optimize(onnx.load(onnx_path)) + onnx.save(onnx_opt_graph, onnx_opt_path) + else: + logger.warning(f"Found cached optimized model: {onnx_opt_path} ") + + # Build TensorRT engines + for model_name, model_obj in models.items(): + engine_path = getEnginePath(model_name, engine_dir) + engine = Engine(engine_path) + onnx_path = getOnnxPath(model_name, onnx_dir, opt=False) + onnx_opt_path = getOnnxPath(model_name, onnx_dir) + + if force_engine_rebuild or not os.path.exists(engine.engine_path): + engine.build( + onnx_opt_path, + fp16=True, + input_profile=model_obj.get_input_profile( + opt_batch_size, + opt_image_height, + opt_image_width, + static_batch=static_batch, + static_shape=static_shape, + ), + enable_preview=enable_preview, + timing_cache=timing_cache, + workspace_size=max_workspace_size, + ) + built_engines[model_name] = engine + + # Load and activate TensorRT engines + for model_name, model_obj in models.items(): + engine = built_engines[model_name] + engine.load() + engine.activate() + + return built_engines + + +def runEngine(engine, feed_dict, stream): + return engine.infer(feed_dict, stream) + + +class CLIP(BaseModel): + def __init__(self, model, device, max_batch_size, embedding_dim): + super(CLIP, self).__init__( + model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim + ) + self.name = "CLIP" + + def get_input_names(self): + return ["input_ids"] + + def get_output_names(self): + return ["text_embeddings", "pooler_output"] + + def get_dynamic_axes(self): + return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}} + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + self.check_dims(batch_size, image_height, image_width) + min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims( + batch_size, image_height, image_width, static_batch, static_shape + ) + return { + "input_ids": [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)] + } + + def get_shape_dict(self, batch_size, image_height, image_width): + self.check_dims(batch_size, image_height, image_width) + return { + "input_ids": (batch_size, self.text_maxlen), + "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim), + } + + def get_sample_input(self, batch_size, image_height, image_width): + self.check_dims(batch_size, image_height, image_width) + return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device) + + def optimize(self, onnx_graph): + opt = Optimizer(onnx_graph) + opt.select_outputs([0]) # delete graph output#1 + opt.cleanup() + opt.fold_constants() + opt.infer_shapes() + opt.select_outputs([0], names=["text_embeddings"]) # rename network output + opt_onnx_graph = opt.cleanup(return_onnx=True) + return opt_onnx_graph + + +def make_CLIP(model, device, max_batch_size, embedding_dim, 
inpaint=False): + return CLIP(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim) + + +class UNet(BaseModel): + def __init__( + self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77, unet_dim=4 + ): + super(UNet, self).__init__( + model=model, + fp16=fp16, + device=device, + max_batch_size=max_batch_size, + embedding_dim=embedding_dim, + text_maxlen=text_maxlen, + ) + self.unet_dim = unet_dim + self.name = "UNet" + + def get_input_names(self): + return ["sample", "timestep", "encoder_hidden_states"] + + def get_output_names(self): + return ["latent"] + + def get_dynamic_axes(self): + return { + "sample": {0: "2B", 2: "H", 3: "W"}, + "encoder_hidden_states": {0: "2B"}, + "latent": {0: "2B", 2: "H", 3: "W"}, + } + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + ( + min_batch, + max_batch, + _, + _, + _, + _, + min_latent_height, + max_latent_height, + min_latent_width, + max_latent_width, + ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + return { + "sample": [ + (2 * min_batch, self.unet_dim, min_latent_height, min_latent_width), + (2 * batch_size, self.unet_dim, latent_height, latent_width), + (2 * max_batch, self.unet_dim, max_latent_height, max_latent_width), + ], + "encoder_hidden_states": [ + (2 * min_batch, self.text_maxlen, self.embedding_dim), + (2 * batch_size, self.text_maxlen, self.embedding_dim), + (2 * max_batch, self.text_maxlen, self.embedding_dim), + ], + } + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return { + "sample": (2 * batch_size, self.unet_dim, latent_height, latent_width), + "encoder_hidden_states": (2 * batch_size, self.text_maxlen, self.embedding_dim), + "latent": (2 * batch_size, 4, latent_height, latent_width), + } + + def get_sample_input(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + dtype = torch.float16 if self.fp16 else torch.float32 + return ( + torch.randn( + 2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device + ), + torch.tensor([1.0], dtype=torch.float32, device=self.device), + torch.randn(2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device), + ) + + +def make_UNet(model, device, max_batch_size, embedding_dim, inpaint=False): + return UNet( + model, + fp16=True, + device=device, + max_batch_size=max_batch_size, + embedding_dim=embedding_dim, + unet_dim=(9 if inpaint else 4), + ) + + +class VAE(BaseModel): + def __init__(self, model, device, max_batch_size, embedding_dim): + super(VAE, self).__init__( + model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim + ) + self.name = "VAE decoder" + + def get_input_names(self): + return ["latent"] + + def get_output_names(self): + return ["images"] + + def get_dynamic_axes(self): + return {"latent": {0: "B", 2: "H", 3: "W"}, "images": {0: "B", 2: "8H", 3: "8W"}} + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + ( + min_batch, + max_batch, + _, + _, + _, + _, + min_latent_height, + max_latent_height, + min_latent_width, 
+ max_latent_width, + ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + return { + "latent": [ + (min_batch, 4, min_latent_height, min_latent_width), + (batch_size, 4, latent_height, latent_width), + (max_batch, 4, max_latent_height, max_latent_width), + ] + } + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return { + "latent": (batch_size, 4, latent_height, latent_width), + "images": (batch_size, 3, image_height, image_width), + } + + def get_sample_input(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device) + + +def make_VAE(model, device, max_batch_size, embedding_dim, inpaint=False): + return VAE(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim) + + +class TensorRTStableDiffusionPipeline(StableDiffusionPipeline): + r""" + Pipeline for text-to-image generation using TensorRT accelerated Stable Diffusion. + + This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
+ """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + stages=["clip", "unet", "vae"], + image_height: int = 768, + image_width: int = 768, + max_batch_size: int = 16, + # ONNX export parameters + onnx_opset: int = 17, + onnx_dir: str = "onnx", + # TensorRT engine build parameters + engine_dir: str = "engine", + force_engine_rebuild: bool = False, + timing_cache: str = "timing_cache", + ): + super().__init__( + vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker + ) + + self.vae.forward = self.vae.decode + + self.stages = stages + self.image_height, self.image_width = image_height, image_width + self.inpaint = False + self.onnx_opset = onnx_opset + self.onnx_dir = onnx_dir + self.engine_dir = engine_dir + self.force_engine_rebuild = force_engine_rebuild + self.timing_cache = timing_cache + self.build_static_batch = False + self.build_dynamic_shape = False + self.build_preview_features = False + + self.max_batch_size = max_batch_size + # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation. + if self.build_dynamic_shape or self.image_height > 512 or self.image_width > 512: + self.max_batch_size = 4 + + self.stream = None # loaded in loadResources() + self.models = {} # loaded in __loadModels() + self.engine = {} # loaded in build_engines() + + def __loadModels(self): + # Load pipeline models + self.embedding_dim = self.text_encoder.config.hidden_size + models_args = { + "device": self.torch_device, + "max_batch_size": self.max_batch_size, + "embedding_dim": self.embedding_dim, + "inpaint": self.inpaint, + } + if "clip" in self.stages: + self.models["clip"] = make_CLIP(self.text_encoder, **models_args) + if "unet" in self.stages: + self.models["unet"] = make_UNet(self.unet, **models_args) + if "vae" in self.stages: + self.models["vae"] = make_VAE(self.vae, **models_args) + + @classmethod + def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + + cls.cached_folder = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else snapshot_download( + pretrained_model_name_or_path, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + ) + ) + + def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False): + super().to(torch_device, silence_dtype_warnings) + + self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir) + self.engine_dir = os.path.join(self.cached_folder, self.engine_dir) + self.timing_cache = os.path.join(self.cached_folder, self.timing_cache) + + # set device + self.torch_device = self._execution_device + logger.warning(f"Running inference on device: {self.torch_device}") + + # load models + self.__loadModels() + + # build engines + self.engine = build_engines( + self.models, + self.engine_dir, + 
self.onnx_dir, + self.onnx_opset, + opt_image_height=self.image_height, + opt_image_width=self.image_width, + force_engine_rebuild=self.force_engine_rebuild, + static_batch=self.build_static_batch, + static_shape=not self.build_dynamic_shape, + enable_preview=self.build_preview_features, + timing_cache=self.timing_cache, + ) + + return self + + def __encode_prompt(self, prompt, negative_prompt): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + """ + # Tokenize prompt + text_input_ids = ( + self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + .input_ids.type(torch.int32) + .to(self.torch_device) + ) + + text_input_ids_inp = device_view(text_input_ids) + # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt + text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids_inp}, self.stream)[ + "text_embeddings" + ].clone() + + # Tokenize negative prompt + uncond_input_ids = ( + self.tokenizer( + negative_prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + .input_ids.type(torch.int32) + .to(self.torch_device) + ) + uncond_input_ids_inp = device_view(uncond_input_ids) + uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids_inp}, self.stream)[ + "text_embeddings" + ] + + # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype=torch.float16) + + return text_embeddings + + def __denoise_latent( + self, latents, text_embeddings, timesteps=None, step_offset=0, mask=None, masked_image_latents=None + ): + if not isinstance(timesteps, torch.Tensor): + timesteps = self.scheduler.timesteps + for step_index, timestep in enumerate(timesteps): + # Expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) + if isinstance(mask, torch.Tensor): + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + + # Predict the noise residual + timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep + + sample_inp = device_view(latent_model_input) + timestep_inp = device_view(timestep_float) + embeddings_inp = device_view(text_embeddings) + noise_pred = runEngine( + self.engine["unet"], + {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp}, + self.stream, + )["latent"] + + # Perform guidance + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample + + latents = 1.0 / 0.18215 * latents + return latents + + def __decode_latent(self, latents): + images = runEngine(self.engine["vae"], 
{"latent": device_view(latents)}, self.stream)["images"] + images = (images / 2 + 0.5).clamp(0, 1) + return images.cpu().permute(0, 2, 3, 1).float().numpy() + + def __loadResources(self, image_height, image_width, batch_size): + self.stream = cuda.Stream() + + # Allocate buffers for TensorRT engine bindings + for model_name, obj in self.models.items(): + self.engine[model_name].allocate_buffers( + shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.torch_device + ) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + + """ + self.generator = generator + self.denoising_steps = num_inference_steps + self.guidance_scale = guidance_scale + + # Pre-compute latent input scales and linear multistep coefficients + self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device) + + # Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + prompt = [prompt] + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}") + + if negative_prompt is None: + negative_prompt = [""] * batch_size + + if negative_prompt is not None and isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + assert len(prompt) == len(negative_prompt) + + if batch_size > self.max_batch_size: + raise ValueError( + f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. 
If dynamic shape is used, then maximum batch size is 4" + ) + + # load resources + self.__loadResources(self.image_height, self.image_width, batch_size) + + with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER): + # CLIP text encoder + text_embeddings = self.__encode_prompt(prompt, negative_prompt) + + # Pre-initialize latents + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size, + num_channels_latents, + self.image_height, + self.image_width, + torch.float32, + self.torch_device, + generator, + ) + + # UNet denoiser + latents = self.__denoise_latent(latents, text_embeddings) + + # VAE decode latent + images = self.__decode_latent(latents) + + images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype) + images = self.numpy_to_pil(images) + return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) From c8fdfe457229d647d6019e449f3eb6fafb4b6e92 Mon Sep 17 00:00:00 2001 From: Chanchana Sornsoontorn Date: Wed, 19 Apr 2023 23:51:58 +0700 Subject: [PATCH 30/71] Correct `Transformer2DModel.forward` docstring (#3074) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ⚙️chore(transformer_2d) update function signature for encoder_hidden_states --- src/diffusers/models/transformer_2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 23364bfa1d16..fde1014bd2e7 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -225,7 +225,7 @@ def forward( hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input hidden_states - encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. 
timestep ( `torch.long`, *optional*): From 3becd368b14d74ca361eada8408627234996e4d1 Mon Sep 17 00:00:00 2001 From: hwuebben Date: Wed, 19 Apr 2023 18:58:13 +0200 Subject: [PATCH 31/71] Update pipeline_stable_diffusion_inpaint_legacy.py (#2903) * Update pipeline_stable_diffusion_inpaint_legacy.py * fix preprocessing of Pil images with adequate batch size * revert map * add tests * reformat * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * next try to fix the style * wth is this * Update testing_utils.py * Update testing_utils.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py --------- Co-authored-by: Patrick von Platen --- ...ipeline_stable_diffusion_inpaint_legacy.py | 20 ++-- src/diffusers/utils/testing_utils.py | 10 ++ .../test_stable_diffusion_inpaint_legacy.py | 93 ++++++++++++++++++- 3 files changed, 108 insertions(+), 15 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 1c8377c7e54e..3ad1d5e92273 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -41,17 +41,17 @@ logger = logging.get_logger(__name__) -def preprocess_image(image): +def preprocess_image(image, batch_size): w, h = image.size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) image = np.array(image).astype(np.float32) / 255.0 - image = image[None].transpose(0, 3, 1, 2) + image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size) image = torch.from_numpy(image) return 2.0 * image - 1.0 -def preprocess_mask(mask, scale_factor=8): +def preprocess_mask(mask, batch_size, scale_factor=8): if not isinstance(mask, torch.FloatTensor): mask = mask.convert("L") w, h = mask.size @@ -59,7 +59,7 @@ def preprocess_mask(mask, scale_factor=8): mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) - mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? 
+ mask = np.vstack([mask[None]] * batch_size) mask = 1 - mask # repaint white, keep black mask = torch.from_numpy(mask) return mask @@ -521,14 +521,14 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps, num_inference_steps - t_start - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator): + def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, device, generator): image = image.to(device=self.device, dtype=dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = self.vae.config.scaling_factor * init_latents # Expand init_latents for batch_size and num_images_per_prompt - init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0) + init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0) init_latents_orig = init_latents # add noise to latents using the timesteps @@ -659,9 +659,9 @@ def __call__( # 4. Preprocess image and mask if not isinstance(image, torch.FloatTensor): - image = preprocess_image(image) + image = preprocess_image(image, batch_size) - mask_image = preprocess_mask(mask_image, self.vae_scale_factor) + mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -671,12 +671,12 @@ def __call__( # 6. Prepare latent variables # encode the init image into latents and scale the latents latents, init_latents_orig, noise = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + image, latent_timestep, num_images_per_prompt, prompt_embeds.dtype, device, generator ) # 7. Prepare mask latent mask = mask_image.to(device=self.device, dtype=latents.dtype) - mask = torch.cat([mask] * batch_size * num_images_per_prompt) + mask = torch.cat([mask] * num_images_per_prompt) # 8. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index afea0540b765..d8fed5dec1c8 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -279,6 +279,16 @@ def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image: return image +def preprocess_image(image: PIL.Image, batch_size: int): + w, h = image.size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = image.resize((w, h), resample=PIL.Image.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str: if is_opencv_available(): import cv2 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py index 15d94414ea2f..f56fa31a9601 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py @@ -34,7 +34,7 @@ VQModel, ) from diffusers.utils import floats_tensor, load_image, nightly, slow, torch_device -from diffusers.utils.testing_utils import load_numpy, require_torch_gpu +from diffusers.utils.testing_utils import load_numpy, preprocess_image, require_torch_gpu torch.backends.cuda.matmul.allow_tf32 = False @@ -217,6 +217,55 @@ def test_stable_diffusion_inpaint_legacy(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_inpaint_legacy_batched(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + unet = self.dummy_cond_unet + scheduler = PNDMScheduler(skip_prk_steps=True) + vae = self.dummy_vae + bert = self.dummy_text_encoder + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB") + init_images_tens = preprocess_image(init_image, batch_size=2) + init_masks_tens = init_images_tens + 4 + + # make sure here that pndm scheduler skips prk + sd_pipe = StableDiffusionInpaintPipelineLegacy( + unet=unet, + scheduler=scheduler, + vae=vae, + text_encoder=bert, + tokenizer=tokenizer, + safety_checker=None, + feature_extractor=self.dummy_extractor, + ) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "A painting of a squirrel eating a burger" + generator = torch.Generator(device=device).manual_seed(0) + images = sd_pipe( + [prompt] * 2, + generator=generator, + guidance_scale=6.0, + num_inference_steps=2, + output_type="np", + image=init_images_tens, + mask_image=init_masks_tens, + ).images + + assert images.shape == (2, 32, 32, 3) + + image_slice_0 = images[0, -3:, -3:, -1].flatten() + image_slice_1 = images[1, -3:, -3:, -1].flatten() + + expected_slice_0 = np.array([0.4697, 0.3770, 0.4096, 0.4653, 0.4497, 0.4183, 0.3950, 0.4668, 0.4672]) + expected_slice_1 = np.array([0.4105, 0.4987, 0.5771, 0.4921, 0.4237, 0.5684, 0.5496, 0.4645, 0.5272]) + + assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-2 + assert np.abs(expected_slice_1 
- image_slice_1).max() < 1e-2 + def test_stable_diffusion_inpaint_legacy_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator unet = self.dummy_cond_unet @@ -349,7 +398,7 @@ def tearDown(self): gc.collect() torch.cuda.empty_cache() - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + def get_inputs(self, generator_device="cpu", seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) init_image = load_image( "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" @@ -379,7 +428,7 @@ def test_stable_diffusion_inpaint_legacy_pndm(self): pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device) + inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() @@ -388,6 +437,40 @@ def test_stable_diffusion_inpaint_legacy_pndm(self): assert np.abs(expected_slice - image_slice).max() < 1e-4 + def test_stable_diffusion_inpaint_legacy_batched(self): + pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs() + inputs["prompt"] = [inputs["prompt"]] * 2 + inputs["image"] = preprocess_image(inputs["image"], batch_size=2) + + mask = inputs["mask_image"].convert("L") + mask = np.array(mask).astype(np.float32) / 255.0 + mask = torch.from_numpy(1 - mask) + masks = torch.vstack([mask[None][None]] * 2) + inputs["mask_image"] = masks + + image = pipe(**inputs).images + assert image.shape == (2, 512, 512, 3) + + image_slice_0 = image[0, 253:256, 253:256, -1].flatten() + image_slice_1 = image[1, 253:256, 253:256, -1].flatten() + + expected_slice_0 = np.array( + [0.52093095, 0.4176447, 0.32752383, 0.6175223, 0.50563973, 0.36470804, 0.65460044, 0.5775188, 0.44332123] + ) + expected_slice_1 = np.array( + [0.3592432, 0.4233033, 0.3914635, 0.31014425, 0.3702293, 0.39412856, 0.17526966, 0.2642669, 0.37480092] + ) + + assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-4 + assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-4 + def test_stable_diffusion_inpaint_legacy_k_lms(self): pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None @@ -397,7 +480,7 @@ def test_stable_diffusion_inpaint_legacy_k_lms(self): pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device) + inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() @@ -437,7 +520,7 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) + inputs = self.get_inputs() pipe(**inputs, callback=callback_fn, callback_steps=1) assert callback_fn.has_been_called assert number_of_steps == 2 From a4c91be73b871e2b1b0e934d893001978415e547 Mon Sep 17 00:00:00 2001 From: superhero-7 <57797766+superhero-7@users.noreply.github.com> Date: Thu, 20 Apr 2023 01:00:29 +0800 Subject: [PATCH 32/71] Modified altdiffusion pipline to support altdiffusion-m18 (#2993) * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support 
altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 --------- Co-authored-by: root --- .../alt_diffusion/modeling_roberta_series.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py b/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py index 637d6dd18698..f73ef15d7de7 100644 --- a/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py +++ b/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py @@ -56,7 +56,7 @@ def __init__( class RobertaSeriesModelWithTransformation(RobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_unexpected = [r"pooler", r"logit_scale"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] base_model_prefix = "roberta" config_class = RobertaSeriesConfig @@ -65,6 +65,10 @@ def __init__(self, config): super().__init__(config) self.roberta = XLMRobertaModel(config) self.transformation = nn.Linear(config.hidden_size, config.project_dim) + self.has_pre_transformation = getattr(config, "has_pre_transformation", False) + if self.has_pre_transformation: + self.transformation_pre = nn.Linear(config.hidden_size, config.project_dim) + self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.post_init() def forward( @@ -95,15 +99,26 @@ def forward( encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_hidden_states=True if self.has_pre_transformation else output_hidden_states, return_dict=return_dict, ) - projection_state = self.transformation(outputs.last_hidden_state) - - return TransformationModelOutput( - projection_state=projection_state, - last_hidden_state=outputs.last_hidden_state, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) + if self.has_pre_transformation: + sequence_output2 = outputs["hidden_states"][-2] + sequence_output2 = self.pre_LN(sequence_output2) + projection_state2 = self.transformation_pre(sequence_output2) + + return TransformationModelOutput( + projection_state=projection_state2, + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + else: + projection_state = self.transformation(outputs.last_hidden_state) + return TransformationModelOutput( + projection_state=projection_state, + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) From 7e6886f5e93ca9bb1e6d4beece46fe1e43b819c2 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 19 Apr 2023 10:46:51 -0700 Subject: [PATCH 33/71] controlnet training resize inputs to multiple of 8 (#3135) controlnet training center crop input images to multiple of 8 The pipeline code resizes inputs to multiples of 8. Not doing this resizing in the training script is causing the encoded image to have different height/width dimensions than the encoded conditioning image (which uses a separate encoder that's part of the controlnet model). We resize and center crop the inputs to make sure they're the same size (as well as all other images in the batch). 
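In torchvision terms, the change amounts to roughly the following sketch (a minimal illustration rather than the exact training-script code; the example `resolution` value is an assumption standing in for the script's `--resolution` argument):

```python
# Minimal sketch of the preprocessing described above (not the exact script code).
# Both the target image and the conditioning image get the same resize + center
# crop, so every encoded image in a batch ends up with identical height/width.
# `resolution` must also be a multiple of 8 (see the check added below).
from torchvision import transforms

resolution = 512  # assumed example value for the script's `--resolution` argument

image_transforms = transforms.Compose(
    [
        transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.CenterCrop(resolution),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

conditioning_image_transforms = transforms.Compose(
    [
        transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.CenterCrop(resolution),
        transforms.ToTensor(),
    ]
)
```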
We also check that the initial resolution is a multiple of 8. --- examples/controlnet/train_controlnet.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index c0b52291fc9b..d52e610ca52d 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -525,6 +525,11 @@ def parse_args(input_args=None): " or the same number of `--validation_prompt`s and `--validation_image`s" ) + if args.resolution % 8 != 0: + raise ValueError( + "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder." + ) + return args @@ -607,6 +612,7 @@ def tokenize_captions(examples, is_train=True): image_transforms = transforms.Compose( [ transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution), transforms.ToTensor(), transforms.Normalize([0.5], [0.5]), ] @@ -615,6 +621,7 @@ def tokenize_captions(examples, is_train=True): conditioning_image_transforms = transforms.Compose( [ transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution), transforms.ToTensor(), ] ) From 3979aac996213fe48e03bd95384b9858dd69a2f0 Mon Sep 17 00:00:00 2001 From: nupurkmr9 Date: Thu, 20 Apr 2023 03:31:42 -0400 Subject: [PATCH 34/71] adding custom diffusion training to diffusers examples (#3031) * diffusers==0.14.0 update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion * custom diffusion * custom diffusion * custom diffusion * custom diffusion * apply formatting and get rid of bare except. * refactor readme and other minor changes. * misc refactor. * fix: repo_id issue and loaders logging bug. * fix: save_model_card. * fix: save_model_card. * fix: save_model_card. * add: doc entry. * refactor doc,. * custom diffusion * custom diffusion * custom diffusion * apply style. * remove tralining whitespace. * fix: toctree entry. * remove unnecessary print. 
* custom diffusion * custom diffusion * custom diffusion test * custom diffusion xformer update * custom diffusion xformer update * custom diffusion xformer update --------- Co-authored-by: Nupur Kumari Co-authored-by: Sayak Paul Co-authored-by: Patrick von Platen Co-authored-by: Nupur Kumari --- docs/source/en/_toctree.yml | 2 + docs/source/en/training/custom_diffusion.mdx | 287 ++++ docs/source/en/training/overview.mdx | 4 + examples/custom_diffusion/README.md | 280 ++++ examples/custom_diffusion/requirements.txt | 6 + examples/custom_diffusion/retrieve.py | 87 ++ .../train_custom_diffusion.py | 1289 +++++++++++++++++ examples/test_examples.py | 24 + src/diffusers/loaders.py | 70 +- src/diffusers/models/attention_processor.py | 189 +++ tests/models/test_models_unet_2d_condition.py | 141 +- 11 files changed, 2369 insertions(+), 10 deletions(-) create mode 100644 docs/source/en/training/custom_diffusion.mdx create mode 100644 examples/custom_diffusion/README.md create mode 100644 examples/custom_diffusion/requirements.txt create mode 100644 examples/custom_diffusion/retrieve.py create mode 100644 examples/custom_diffusion/train_custom_diffusion.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index df41854a9fe7..de33ba616d0a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -74,6 +74,8 @@ title: ControlNet - local: training/instructpix2pix title: InstructPix2Pix Training + - local: training/custom_diffusion + title: Custom Diffusion title: Training - sections: - local: using-diffusers/rl diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx new file mode 100644 index 000000000000..1e1958e1c946 --- /dev/null +++ b/docs/source/en/training/custom_diffusion.mdx @@ -0,0 +1,287 @@ + + +# Custom Diffusion training example + +[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject. +The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion. + +## Running locally with PyTorch + +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: + +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install -e . +``` + +Then cd in the example folder and run + +```bash +pip install -r requirements.txt +pip install clip-retrieval +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Or for a default accelerate configuration without answering questions about your environment + +```bash +accelerate config default +``` + +Or if your environment doesn't support an interactive shell e.g. a notebook + +```python +from accelerate.utils import write_basic_config + +write_basic_config() +``` +### Cat example 😺 + +Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. 
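For reference, a purely illustrative way to do the download and extraction from Python with only the standard library; the target directory and the `data/` layout inside the archive are assumptions based on the `INSTANCE_DIR="./data/cat"` path used below:

```python
# Illustrative sketch only: fetch the example archive and unpack it locally.
import io
import urllib.request
import zipfile

url = "https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip"
with urllib.request.urlopen(url) as response:
    archive = zipfile.ZipFile(io.BytesIO(response.read()))
archive.extractall(".")  # assumed to produce e.g. ./data/cat
```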
+ +We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`. +The `class_prompt` should be the category name same as target image. The collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images use this command first before training. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --num_class_images 200 +``` + +**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" +export INSTANCE_DIR="./data/cat" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_cat/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="cat" --num_class_images=200 \ + --instance_prompt="photo of a cat" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=250 \ + --scale_lr --hflip \ + --modifier_token "" +``` + +**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.** + +To track your experiments using Weights and Biases (`wandb`) and to save intermediate results (whcih we HIGHLY recommend), follow these steps: + +* Install `wandb`: `pip install wandb`. +* Authorize: `wandb login`. +* Then specify a `validation_prompt` and set `report_to` to `wandb` while launching training. You can also configure the following related arguments: + * `num_validation_images` + * `validation_steps` + +Here is an example command: + +```bash +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_cat/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="cat" --num_class_images=200 \ + --instance_prompt="photo of a cat" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=250 \ + --scale_lr --hflip \ + --modifier_token "" \ + --validation_prompt=" cat sitting in a bucket" \ + --report_to="wandb" +``` + +Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/26ghrcau) where you can check out the intermediate results along with other training details. + +If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub. Here is an [example repository](https://huggingface.co/sayakpaul/custom-diffusion-cat). 
+ +### Training on multiple concepts 🐱🪵 + +Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py). + +To collect the real images run this command for each concept in the json file. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt {} --class_data_dir {} --num_class_images 200 +``` + +And then we're ready to start training! + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --output_dir=$OUTPUT_DIR \ + --concepts_list=./concept_list.json \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=500 \ + --num_class_images=200 \ + --scale_lr --hflip \ + --modifier_token "+" +``` + +Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/3990tzkg) where you can check out the intermediate results along with other training details. + +### Training on human faces + +For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images. + +To collect the real images use this command first before training. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt person --class_data_dir real_reg/samples_person --num_class_images 200 +``` + +Then start training! + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" +export INSTANCE_DIR="path-to-images" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_person/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="person" --num_class_images=200 \ + --instance_prompt="photo of a person" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=5e-6 \ + --lr_warmup_steps=0 \ + --max_train_steps=1000 \ + --scale_lr --hflip --noaug \ + --freeze_model crossattn \ + --modifier_token "" \ + --enable_xformers_memory_efficient_attention +``` + +## Inference + +Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \ in above example) in your prompt. 
+ +```python +import torch +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda") +pipe.unet.load_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion("path-to-save-model", weight_name=".bin") + +image = pipe( + " cat sitting in a bucket", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("cat.png") +``` + +It's possible to directly load these parameters from a Hub repository: + +```python +import torch +from huggingface_hub.repocard import RepoCard +from diffusers import DiffusionPipeline + +model_id = "sayakpaul/custom-diffusion-cat" +card = RepoCard.load(model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda") +pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") + +image = pipe( + " cat sitting in a bucket", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("cat.png") +``` + +Here is an example of performing inference with multiple concepts: + +```python +import torch +from huggingface_hub.repocard import RepoCard +from diffusers import DiffusionPipeline + +model_id = "sayakpaul/custom-diffusion-cat-wooden-pot" +card = RepoCard.load(model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda") +pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") + +image = pipe( + "the cat sculpture in the style of a wooden pot", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("multi-subject.png") +``` + +Here, `cat` and `wooden pot` refer to the multiple concepts. + +### Inference from a training checkpoint + +You can also perform inference from one of the complete checkpoint saved during the training process, if you used the `--checkpointing_steps` argument. + +TODO. + +## Set grads to none +To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument. + +More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html + +## Experimental results +You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. diff --git a/docs/source/en/training/overview.mdx b/docs/source/en/training/overview.mdx index 5ad3a1f06cc1..c5cea3bb0a96 100644 --- a/docs/source/en/training/overview.mdx +++ b/docs/source/en/training/overview.mdx @@ -39,6 +39,8 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie - [Dreambooth](./dreambooth) - [LoRA Support](./lora) - [ControlNet](./controlnet) +- [InstructPix2Pix](./instructpix2pix) +- [Custom Diffusion](./custom_diffusion) If possible, please [install xFormers](../optimization/xformers) for memory efficient attention. This could help make your training faster and less memory intensive. 
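As an aside on the xFormers recommendation above: enabling it on an already loaded pipeline is a one-liner. A rough sketch, assuming the `xformers` package is installed and a CUDA GPU is available (the checkpoint id is only an example):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
).to("cuda")
# Replace the default attention processors with xFormers' memory-efficient ones.
pipe.enable_xformers_memory_efficient_attention()
```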
@@ -50,6 +52,8 @@ If possible, please [install xFormers](../optimization/xformers) for memory effi | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb) | [**Training with LoRA**](./lora) | ✅ | - | - | | [**ControlNet**](./controlnet) | ✅ | ✅ | - | +| [**InstructPix2Pix**](./instructpix2pix) | ✅ | ✅ | - | +| [**Custom Diffusion**](./custom_diffusion) | ✅ | ✅ | - | ## Community diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md new file mode 100644 index 000000000000..ecd972737bc3 --- /dev/null +++ b/examples/custom_diffusion/README.md @@ -0,0 +1,280 @@ +# Custom Diffusion training example + +[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject. +The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion. + +## Running locally with PyTorch + +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: + +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install -e . +``` + +Then cd in the example folder and run + +```bash +pip install -r requirements.txt +pip install clip-retrieval +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Or for a default accelerate configuration without answering questions about your environment + +```bash +accelerate config default +``` + +Or if your environment doesn't support an interactive shell e.g. a notebook + +```python +from accelerate.utils import write_basic_config +write_basic_config() +``` +### Cat example 😺 + +Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. + +We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`. +The `class_prompt` should be the category name same as target image. The collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images use this command first before training. 
+ +```bash +pip install clip-retrieval +python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --num_class_images 200 +``` + +**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" +export INSTANCE_DIR="./data/cat" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_cat/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="cat" --num_class_images=200 \ + --instance_prompt="photo of a cat" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=250 \ + --scale_lr --hflip \ + --modifier_token "" +``` + +**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.** + +To track your experiments using Weights and Biases (`wandb`) and to save intermediate results (whcih we HIGHLY recommend), follow these steps: + +* Install `wandb`: `pip install wandb`. +* Authorize: `wandb login`. +* Then specify a `validation_prompt` and set `report_to` to `wandb` while launching training. You can also configure the following related arguments: + * `num_validation_images` + * `validation_steps` + +Here is an example command: + +```bash +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_cat/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="cat" --num_class_images=200 \ + --instance_prompt="photo of a cat" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=250 \ + --scale_lr --hflip \ + --modifier_token "" \ + --validation_prompt=" cat sitting in a bucket" \ + --report_to="wandb" +``` + +Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/26ghrcau) where you can check out the intermediate results along with other training details. + +If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub. Here is an [example repository](https://huggingface.co/sayakpaul/custom-diffusion-cat). + +### Training on multiple concepts 🐱🪵 + +Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py). + +To collect the real images run this command for each concept in the json file. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt {} --class_data_dir {} --num_class_images 200 +``` + +And then we're ready to start training! 
+ +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --output_dir=$OUTPUT_DIR \ + --concepts_list=./concept_list.json \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=500 \ + --num_class_images=200 \ + --scale_lr --hflip \ + --modifier_token "+" +``` + +Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/3990tzkg) where you can check out the intermediate results along with other training details. + +### Training on human faces + +For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images. + +To collect the real images use this command first before training. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt person --class_data_dir real_reg/samples_person --num_class_images 200 +``` + +Then start training! + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" +export INSTANCE_DIR="path-to-images" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_person/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="person" --num_class_images=200 \ + --instance_prompt="photo of a person" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=5e-6 \ + --lr_warmup_steps=0 \ + --max_train_steps=1000 \ + --scale_lr --hflip --noaug \ + --freeze_model crossattn \ + --modifier_token "" \ + --enable_xformers_memory_efficient_attention +``` + +## Inference + +Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \ in above example) in your prompt. 
+ +```python +import torch +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 +).to("cuda") +pipe.unet.load_attn_procs( + "path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin" +) +pipe.load_textual_inversion("path-to-save-model", weight_name=".bin") + +image = pipe( + " cat sitting in a bucket", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("cat.png") +``` + +It's possible to directly load these parameters from a Hub repository: + +```python +import torch +from huggingface_hub.repocard import RepoCard +from diffusers import DiffusionPipeline + +model_id = "sayakpaul/custom-diffusion-cat" +card = RepoCard.load(model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to( +"cuda") +pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") + +image = pipe( + " cat sitting in a bucket", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("cat.png") +``` + +Here is an example of performing inference with multiple concepts: + +```python +import torch +from huggingface_hub.repocard import RepoCard +from diffusers import DiffusionPipeline + +model_id = "sayakpaul/custom-diffusion-cat-wooden-pot" +card = RepoCard.load(model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to( +"cuda") +pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") + +image = pipe( + "the cat sculpture in the style of a wooden pot", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("multi-subject.png") +``` + +Here, `cat` and `wooden pot` refer to the multiple concepts. + +### Inference from a training checkpoint + +You can also perform inference from one of the complete checkpoint saved during the training process, if you used the `--checkpointing_steps` argument. + +TODO. + +## Set grads to none +To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument. + +More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html + +## Experimental results +You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. \ No newline at end of file diff --git a/examples/custom_diffusion/requirements.txt b/examples/custom_diffusion/requirements.txt new file mode 100644 index 000000000000..7d93f3d03bd8 --- /dev/null +++ b/examples/custom_diffusion/requirements.txt @@ -0,0 +1,6 @@ +accelerate +torchvision +transformers>=4.25.1 +ftfy +tensorboard +Jinja2 diff --git a/examples/custom_diffusion/retrieve.py b/examples/custom_diffusion/retrieve.py new file mode 100644 index 000000000000..7b7635c1887d --- /dev/null +++ b/examples/custom_diffusion/retrieve.py @@ -0,0 +1,87 @@ +# Copyright 2023 Custom Diffusion authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from io import BytesIO +from pathlib import Path + +import requests +from clip_retrieval.clip_client import ClipClient +from PIL import Image +from tqdm import tqdm + + +def retrieve(class_prompt, class_data_dir, num_class_images): + factor = 1.5 + num_images = int(factor * num_class_images) + client = ClipClient( + url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1 + ) + + os.makedirs(f"{class_data_dir}/images", exist_ok=True) + if len(list(Path(f"{class_data_dir}/images").iterdir())) >= num_class_images: + return + + while True: + class_images = client.query(text=class_prompt) + if len(class_images) >= factor * num_class_images or num_images > 1e4: + break + else: + num_images = int(factor * num_images) + client = ClipClient( + url="https://knn.laion.ai/knn-service", + indice_name="laion_400m", + num_images=num_images, + aesthetic_weight=0.1, + ) + + count = 0 + total = 0 + pbar = tqdm(desc="downloading real regularization images", total=num_class_images) + + with open(f"{class_data_dir}/caption.txt", "w") as f1, open(f"{class_data_dir}/urls.txt", "w") as f2, open( + f"{class_data_dir}/images.txt", "w" + ) as f3: + while total < num_class_images: + images = class_images[count] + count += 1 + try: + img = requests.get(images["url"]) + if img.status_code == 200: + _ = Image.open(BytesIO(img.content)) + with open(f"{class_data_dir}/images/{total}.jpg", "wb") as f: + f.write(img.content) + f1.write(images["caption"] + "\n") + f2.write(images["url"] + "\n") + f3.write(f"{class_data_dir}/images/{total}.jpg" + "\n") + total += 1 + pbar.update(1) + else: + continue + except Exception: + continue + return + + +def parse_args(): + parser = argparse.ArgumentParser("", add_help=False) + parser.add_argument("--class_prompt", help="text prompt to retrieve images", required=True, type=str) + parser.add_argument("--class_data_dir", help="path to save images", required=True, type=str) + parser.add_argument("--num_class_images", help="number of images to download", default=200, type=int) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + retrieve(args.class_prompt, args.class_data_dir, args.num_class_images) diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py new file mode 100644 index 000000000000..49b05e6b5db3 --- /dev/null +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -0,0 +1,1289 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 Custom Diffusion authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import hashlib +import itertools +import json +import logging +import math +import os +import random +import warnings +from pathlib import Path + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from huggingface_hub import HfApi, create_repo +from packaging import version +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +import diffusers +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.loaders import AttnProcsLayers +from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.import_utils import is_xformers_available + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.15.0.dev0") + +logger = get_logger(__name__) + + +def freeze_params(params): + for param in params: + param.requires_grad = False + + +def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_folder=None): + img_str = "" + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + yaml = f""" +--- +license: creativeml-openrail-m +base_model: {base_model} +instance_prompt: {prompt} +tags: +- stable-diffusion +- stable-diffusion-diffusers +- text-to-image +- diffusers +- custom-diffusion +inference: true +--- + """ + model_card = f""" +# Custom Diffusion - {repo_id} + +These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n +{img_str} + +\nFor more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
+""" + with open(os.path.join(repo_folder, "README.md"), "w") as f: + f.write(yaml + model_card) + + +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, + subfolder="text_encoder", + revision=revision, + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "RobertaSeriesModelWithTransformation": + from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation + + return RobertaSeriesModelWithTransformation + else: + raise ValueError(f"{model_class} is not supported.") + + +def collate_fn(examples, with_prior_preservation): + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + mask = [example["mask"] for example in examples] + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. + if with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + mask += [example["class_mask"] for example in examples] + + input_ids = torch.cat(input_ids, dim=0) + pixel_values = torch.stack(pixel_values) + mask = torch.stack(mask) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + mask = mask.to(memory_format=torch.contiguous_format).float() + + batch = {"input_ids": input_ids, "pixel_values": pixel_values, "mask": mask.unsqueeze(1)} + return batch + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +class CustomDiffusionDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. 
+ """ + + def __init__( + self, + concepts_list, + tokenizer, + size=512, + mask_size=64, + center_crop=False, + with_prior_preservation=False, + num_class_images=200, + hflip=False, + aug=True, + ): + self.size = size + self.mask_size = mask_size + self.center_crop = center_crop + self.tokenizer = tokenizer + self.interpolation = Image.BILINEAR + self.aug = aug + + self.instance_images_path = [] + self.class_images_path = [] + self.with_prior_preservation = with_prior_preservation + for concept in concepts_list: + inst_img_path = [ + (x, concept["instance_prompt"]) for x in Path(concept["instance_data_dir"]).iterdir() if x.is_file() + ] + self.instance_images_path.extend(inst_img_path) + + if with_prior_preservation: + class_data_root = Path(concept["class_data_dir"]) + if os.path.isdir(class_data_root): + class_images_path = list(class_data_root.iterdir()) + class_prompt = [concept["class_prompt"] for _ in range(len(class_images_path))] + else: + with open(class_data_root, "r") as f: + class_images_path = f.read().splitlines() + with open(concept["class_prompt"], "r") as f: + class_prompt = f.read().splitlines() + + class_img_path = [(x, y) for (x, y) in zip(class_images_path, class_prompt)] + self.class_images_path.extend(class_img_path[:num_class_images]) + + random.shuffle(self.instance_images_path) + self.num_instance_images = len(self.instance_images_path) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.flip = transforms.RandomHorizontalFlip(0.5 * hflip) + + self.image_transforms = transforms.Compose( + [ + self.flip, + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def preprocess(self, image, scale, resample): + outer, inner = self.size, scale + factor = self.size // self.mask_size + if scale > self.size: + outer, inner = scale, self.size + top, left = np.random.randint(0, outer - inner + 1), np.random.randint(0, outer - inner + 1) + image = image.resize((scale, scale), resample=resample) + image = np.array(image).astype(np.uint8) + image = (image / 127.5 - 1.0).astype(np.float32) + instance_image = np.zeros((self.size, self.size, 3), dtype=np.float32) + mask = np.zeros((self.size // factor, self.size // factor)) + if scale > self.size: + instance_image = image[top : top + inner, left : left + inner, :] + mask = np.ones((self.size // factor, self.size // factor)) + else: + instance_image[top : top + inner, left : left + inner, :] = image + mask[ + top // factor + 1 : (top + scale) // factor - 1, left // factor + 1 : (left + scale) // factor - 1 + ] = 1.0 + return instance_image, mask + + def __getitem__(self, index): + example = {} + instance_image, instance_prompt = self.instance_images_path[index % self.num_instance_images] + instance_image = Image.open(instance_image) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + instance_image = self.flip(instance_image) + + # apply resize augmentation and create a valid image region mask + random_scale = self.size + if self.aug: + random_scale = ( + np.random.randint(self.size // 3, self.size + 1) + if np.random.uniform() < 0.66 + else np.random.randint(int(1.2 * self.size), int(1.4 * self.size)) + ) + instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation) + + 
if random_scale < 0.6 * self.size: + instance_prompt = np.random.choice(["a far away ", "very small "]) + instance_prompt + elif random_scale > self.size: + instance_prompt = np.random.choice(["zoomed in ", "close up "]) + instance_prompt + + example["instance_images"] = torch.from_numpy(instance_image).permute(2, 0, 1) + example["mask"] = torch.from_numpy(mask) + example["instance_prompt_ids"] = self.tokenizer( + instance_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + if self.with_prior_preservation: + class_image, class_prompt = self.class_images_path[index % self.num_class_images] + class_image = Image.open(class_image) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_mask"] = torch.ones_like(example["mask"]) + example["class_prompt_ids"] = self.tokenizer( + class_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + return example + + +def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_dir): + """Saves the new token embeddings from the text encoder.""" + logger.info("Saving embeddings") + learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight + for x, y in zip(modifier_token_id, args.modifier_token): + learned_embeds_dict = {} + learned_embeds_dict[y] = learned_embeds[x] + torch.save(learned_embeds_dict, f"{output_dir}/{y}.bin") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Custom Diffusion training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=2, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=50, + help=( + "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." 
+ ), + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument( + "--real_prior", + default=False, + action="store_true", + help="real images as prior.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=200, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="custom-diffusion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=250, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' 
+ ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=2, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument( + "--freeze_model", + type=str, + default="crossattn_kv", + choices=["crossattn_kv", "crossattn"], + help="crossattn to enable fine-tuning of all params in the cross attention", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). 
Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32." + ), + ) + parser.add_argument( + "--concepts_list", + type=str, + default=None, + help="Path to json containing multiple concepts, will overwrite parameters like instance_prompt, class_prompt, etc.", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." + ) + parser.add_argument( + "--set_grads_to_none", + action="store_true", + help=( + "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain" + " behaviors, so disable this argument if it causes any problems. More info:" + " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" + ), + ) + parser.add_argument( + "--modifier_token", + type=str, + default=None, + help="A token to use as a modifier for the concept.", + ) + parser.add_argument( + "--initializer_token", type=str, default="ktn+pll+ucd", help="A token to use as initializer word." + ) + parser.add_argument("--hflip", action="store_true", help="Apply horizontal flip data augmentation.") + parser.add_argument( + "--noaug", + action="store_true", + help="Dont apply augmentation during data augmentation when this flag is enabled.", + ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.with_prior_preservation: + if args.concepts_list is None: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + # logger is not available yet + if args.class_data_dir is not None: + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") + if args.class_prompt is not None: + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + + return args + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + logging_dir=logging_dir, + project_config=accelerator_project_config, + ) + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + import wandb + + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. 
For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("custom-diffusion", config=vars(args)) + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + if args.concepts_list is None: + args.concepts_list = [ + { + "instance_prompt": args.instance_prompt, + "class_prompt": args.class_prompt, + "instance_data_dir": args.instance_data_dir, + "class_data_dir": args.class_data_dir, + } + ] + else: + with open(args.concepts_list, "r") as f: + args.concepts_list = json.load(f) + + # Generate class images if prior preservation is enabled. + if args.with_prior_preservation: + for i, concept in enumerate(args.concepts_list): + class_images_dir = Path(concept["class_data_dir"]) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True, exist_ok=True) + if args.real_prior: + assert ( + class_images_dir / "images" + ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}" + assert ( + len(list((class_images_dir / "images").iterdir())) == args.num_class_images + ), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}" + assert ( + class_images_dir / "caption.txt" + ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}" + assert ( + class_images_dir / "images.txt" + ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}" + concept["class_prompt"] = os.path.join(class_images_dir, "caption.txt") + concept["class_data_dir"] = os.path.join(class_images_dir, "images.txt") + args.concepts_list[i] = concept + accelerator.wait_for_everyone() + else: + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of 
class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, + desc="Generating class images", + disable=not accelerator.is_local_main_process, + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = ( + class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + ) + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name, + revision=args.revision, + use_fast=False, + ) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + # Adding a modifier token which is optimized #### + # Code taken from https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py + modifier_token_id = [] + initializer_token_id = [] + if args.modifier_token is not None: + args.modifier_token = args.modifier_token.split("+") + args.initializer_token = args.initializer_token.split("+") + if len(args.modifier_token) > len(args.initializer_token): + raise ValueError("You must specify + separated initializer token for each modifier token.") + for modifier_token, initializer_token in zip( + args.modifier_token, args.initializer_token[: len(args.modifier_token)] + ): + # Add the placeholder token in tokenizer + num_added_tokens = tokenizer.add_tokens(modifier_token) + if num_added_tokens == 0: + raise ValueError( + f"The tokenizer already contains the token {modifier_token}. Please pass a different" + " `modifier_token` that is not already in the tokenizer." 
+ ) + + # Convert the initializer_token, placeholder_token to ids + token_ids = tokenizer.encode([initializer_token], add_special_tokens=False) + print(token_ids) + # Check if initializer_token is a single token or a sequence of tokens + if len(token_ids) > 1: + raise ValueError("The initializer token must be a single token.") + + initializer_token_id.append(token_ids[0]) + modifier_token_id.append(tokenizer.convert_tokens_to_ids(modifier_token)) + + # Resize the token embeddings as we are adding new special tokens to the tokenizer + text_encoder.resize_token_embeddings(len(tokenizer)) + + # Initialise the newly added placeholder token with the embeddings of the initializer token + token_embeds = text_encoder.get_input_embeddings().weight.data + for x, y in zip(modifier_token_id, initializer_token_id): + token_embeds[x] = token_embeds[y] + + # Freeze all parameters except for the token embeddings in text encoder + params_to_freeze = itertools.chain( + text_encoder.text_model.encoder.parameters(), + text_encoder.text_model.final_layer_norm.parameters(), + text_encoder.text_model.embeddings.position_embedding.parameters(), + ) + freeze_params(params_to_freeze) + ######################################################## + ######################################################## + + vae.requires_grad_(False) + if args.modifier_token is None: + text_encoder.requires_grad_(False) + unet.requires_grad_(False) + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move unet, vae and text_encoder to device and cast to weight_dtype + if accelerator.mixed_precision != "fp16" and args.modifier_token is not None: + text_encoder.to(accelerator.device, dtype=weight_dtype) + unet.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=weight_dtype) + + attention_class = CustomDiffusionAttnProcessor + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warn( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) + attention_class = CustomDiffusionXFormersAttnProcessor + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + # now we will add new Custom Diffusion weights to the attention layers + # It's important to realize here how many attention weights will be added and of which sizes + # The sizes of the attention layers consist only of two different variables: + # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`. + # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`. + + # Let's first see how many attention processors we will have to set. 
+ # For Stable Diffusion, it should be equal to: + # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12 + # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2 + # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18 + # => 32 layers + + # Only train key, value projection layers if freeze_model = 'crossattn_kv' else train all params in the cross attention layer + train_kv = True + train_q_out = False if args.freeze_model == "crossattn_kv" else True + custom_diffusion_attn_procs = {} + + st = unet.state_dict() + for name, _ in unet.attn_processors.items(): + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + layer_name = name.split(".processor")[0] + weights = { + "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"], + "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"], + } + if train_q_out: + weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"] + weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"] + weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"] + if cross_attention_dim is not None: + custom_diffusion_attn_procs[name] = attention_class( + train_kv=train_kv, + train_q_out=train_q_out, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ).to(unet.device) + custom_diffusion_attn_procs[name].load_state_dict(weights) + else: + custom_diffusion_attn_procs[name] = attention_class( + train_kv=False, + train_q_out=False, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ) + del st + unet.set_attn_processor(custom_diffusion_attn_procs) + custom_diffusion_layers = AttnProcsLayers(unet.attn_processors) + + accelerator.register_for_checkpointing(custom_diffusion_layers) + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + if args.modifier_token is not None: + text_encoder.gradient_checkpointing_enable() + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + if args.with_prior_preservation: + args.learning_rate = args.learning_rate * 2.0 + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
+ ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + optimizer = optimizer_class( + itertools.chain(text_encoder.get_input_embeddings().parameters(), custom_diffusion_layers.parameters()) + if args.modifier_token is not None + else custom_diffusion_layers.parameters(), + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Dataset and DataLoaders creation: + train_dataset = CustomDiffusionDataset( + concepts_list=args.concepts_list, + tokenizer=tokenizer, + with_prior_preservation=args.with_prior_preservation, + size=args.resolution, + mask_size=vae.encode( + torch.randn(1, 3, args.resolution, args.resolution).to(dtype=weight_dtype).to(accelerator.device) + ) + .latent_dist.sample() + .size()[-1], + center_crop=args.center_crop, + num_class_images=args.num_class_images, + hflip=args.hflip, + aug=not args.noaug, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + num_workers=args.dataloader_num_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + if args.modifier_token is not None: + custom_diffusion_layers, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + custom_diffusion_layers, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + custom_diffusion_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + custom_diffusion_layers, optimizer, train_dataloader, lr_scheduler + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + + if path is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + args.resume_from_checkpoint = None + else: + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = global_step // num_update_steps_per_epoch + resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps) + + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + if args.modifier_token is not None: + text_encoder.train() + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + continue + + with accelerator.accumulate(unet), accelerator.accumulate(text_encoder): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * vae.config.scaling_factor + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. 
+ model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + mask = torch.chunk(batch["mask"], 2, dim=0)[0] + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean() + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + mask = batch["mask"] + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean() + accelerator.backward(loss) + # Zero out the gradients for all token embeddings except the newly added + # embeddings for the concept, as we only want to optimize the concept embeddings + if args.modifier_token is not None: + if accelerator.num_processes > 1: + grads_text_encoder = text_encoder.module.get_input_embeddings().weight.grad + else: + grads_text_encoder = text_encoder.get_input_embeddings().weight.grad + # Get the index for tokens that we want to zero the grads for + index_grads_to_zero = torch.arange(len(tokenizer)) != modifier_token_id[0] + for i in range(len(modifier_token_id[1:])): + index_grads_to_zero = index_grads_to_zero & ( + torch.arange(len(tokenizer)) != modifier_token_id[i] + ) + grads_text_encoder.data[index_grads_to_zero, :] = grads_text_encoder.data[ + index_grads_to_zero, : + ].fill_(0) + + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(text_encoder.parameters(), custom_diffusion_layers.parameters()) + if args.modifier_token is not None + else custom_diffusion_layers.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=args.set_grads_to_none) + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + if global_step % args.checkpointing_steps == 0: + if accelerator.is_main_process: + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + accelerator.save_state(save_path) + logger.info(f"Saved state to {save_path}") + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if global_step >= args.max_train_steps: + break + + if accelerator.is_main_process: + if args.validation_prompt is not None and global_step % args.validation_steps == 0: + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." 
+ ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + tokenizer=tokenizer, + revision=args.revision, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + images = [ + pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.0).images[0] + for _ in range(args.num_validation_images) + ] + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + + # Save the custom diffusion layers + accelerator.wait_for_everyone() + if accelerator.is_main_process: + unet = unet.to(torch.float32) + unet.save_attn_procs(args.output_dir) + save_new_embed(text_encoder, modifier_token_id, accelerator, args, args.output_dir) + + # Final inference + # Load previous pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + + # load attention processors + pipeline.unet.load_attn_procs(args.output_dir, weight_name="pytorch_custom_diffusion_weights.bin") + for token in args.modifier_token: + pipeline.load_textual_inversion(args.output_dir, weight_name=f"{token}.bin") + + # run inference + if args.validation_prompt and args.num_validation_images > 0: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + images = [ + pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.0).images[0] + for _ in range(args.num_validation_images) + ] + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "test": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + if args.push_to_hub: + save_model_card( + repo_id, + images=images, + base_model=args.pretrained_model_name_or_path, + prompt=args.instance_prompt, + repo_folder=args.output_dir, + ) + api = HfApi(token=args.hub_token) + api.upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) + + accelerator.end_training() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/test_examples.py b/examples/test_examples.py index d9a1f86e53aa..a77fa4c7da23 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -221,6 +221,30 @@ def test_dreambooth_checkpointing(self): self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4"))) self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6"))) + 
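As a condensed illustration of the final-inference block in the training script earlier in this patch, the sketch below shows roughly how the saved Custom Diffusion attention processors and the learned modifier-token embedding get loaded back into a pipeline. The base model id, output path, and the `<new1>` token are placeholder assumptions for illustration, not values taken from this patch.

```python
# Illustrative sketch only: base model, paths, and the <new1> token are placeholders.
# It mirrors the loading calls used in the training script's final-inference block.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
).to("cuda")

# Attention processors written by `unet.save_attn_procs(args.output_dir)` during training.
pipe.unet.load_attn_procs("path/to/output_dir", weight_name="pytorch_custom_diffusion_weights.bin")
# Embedding of the new modifier token written by `save_new_embed(...)`.
pipe.load_textual_inversion("path/to/output_dir", weight_name="<new1>.bin")

image = pipe("<new1> cat sitting on a park bench", num_inference_steps=25, eta=1.0).images[0]
image.save("custom_diffusion_sample.png")
```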
def test_custom_diffusion(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/custom_diffusion/train_custom_diffusion.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir docs/source/en/imgs + --instance_prompt + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 1.0e-05 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --modifier_token + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_custom_diffusion_weights.bin"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdir, ".bin"))) + def test_text_to_image(self): with tempfile.TemporaryDirectory() as tmpdir: test_args = f""" diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 3133da117390..82c1ac61ca9e 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -19,7 +19,11 @@ import torch from huggingface_hub import hf_hub_download -from .models.attention_processor import LoRAAttnProcessor +from .models.attention_processor import ( + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + LoRAAttnProcessor, +) from .utils import ( DIFFUSERS_CACHE, HF_HUB_OFFLINE, @@ -48,6 +52,9 @@ TEXT_INVERSION_NAME = "learned_embeds.bin" TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors" +CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin" +CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors" + class AttnProcsLayers(torch.nn.Module): def __init__(self, state_dict: Dict[str, torch.Tensor]): @@ -215,6 +222,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict attn_processors = {} is_lora = all("lora" in k for k in state_dict.keys()) + is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys()) if is_lora: lora_grouped_dict = defaultdict(dict) @@ -231,9 +239,38 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank ) attn_processors[key].load_state_dict(value_dict) - + elif is_custom_diffusion: + custom_diffusion_grouped_dict = defaultdict(dict) + for key, value in state_dict.items(): + if len(value) == 0: + custom_diffusion_grouped_dict[key] = {} + else: + if "to_out" in key: + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + else: + attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:]) + custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in custom_diffusion_grouped_dict.items(): + if len(value_dict) == 0: + attn_processors[key] = CustomDiffusionAttnProcessor( + train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None + ) + else: + cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1] + hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0] + train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False + attn_processors[key] = CustomDiffusionAttnProcessor( + train_kv=True, + train_q_out=train_q_out, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ) + attn_processors[key].load_state_dict(value_dict) else: - raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.") + 
raise ValueError( + f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training." + ) # set correct dtype & device attn_processors = {k: v.to(device=self.device, dtype=self.dtype) for k, v in attn_processors.items()} @@ -287,16 +324,31 @@ def save_function(weights, filename): os.makedirs(save_directory, exist_ok=True) - model_to_save = AttnProcsLayers(self.attn_processors) - - # Save the model - state_dict = model_to_save.state_dict() + is_custom_diffusion = any( + isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) + for (_, x) in self.attn_processors.items() + ) + if is_custom_diffusion: + model_to_save = AttnProcsLayers( + { + y: x + for (y, x) in self.attn_processors.items() + if isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) + } + ) + state_dict = model_to_save.state_dict() + for name, attn in self.attn_processors.items(): + if len(attn.state_dict()) == 0: + state_dict[name] = {} + else: + model_to_save = AttnProcsLayers(self.attn_processors) + state_dict = model_to_save.state_dict() if weight_name is None: if safe_serialization: - weight_name = LORA_WEIGHT_NAME_SAFE + weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else LORA_WEIGHT_NAME_SAFE else: - weight_name = LORA_WEIGHT_NAME + weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else LORA_WEIGHT_NAME # Save the model save_function(state_dict, os.path.join(save_directory, weight_name)) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index f2a5a376bf39..b8787aed91f2 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -149,6 +149,9 @@ def set_use_memory_efficient_attention_xformers( is_lora = hasattr(self, "processor") and isinstance( self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor) ) + is_custom_diffusion = hasattr(self, "processor") and isinstance( + self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor) + ) if use_memory_efficient_attention_xformers: if self.added_kv_proj_dim is not None: @@ -192,6 +195,17 @@ def set_use_memory_efficient_attention_xformers( ) processor.load_state_dict(self.processor.state_dict()) processor.to(self.processor.to_q_lora.up.weight.device) + elif is_custom_diffusion: + processor = CustomDiffusionXFormersAttnProcessor( + train_kv=self.processor.train_kv, + train_q_out=self.processor.train_q_out, + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + attention_op=attention_op, + ) + processor.load_state_dict(self.processor.state_dict()) + if hasattr(self.processor, "to_k_custom_diffusion"): + processor.to(self.processor.to_k_custom_diffusion.weight.device) else: processor = XFormersAttnProcessor(attention_op=attention_op) else: @@ -203,6 +217,16 @@ def set_use_memory_efficient_attention_xformers( ) processor.load_state_dict(self.processor.state_dict()) processor.to(self.processor.to_q_lora.up.weight.device) + elif is_custom_diffusion: + processor = CustomDiffusionAttnProcessor( + train_kv=self.processor.train_kv, + train_q_out=self.processor.train_q_out, + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + ) + processor.load_state_dict(self.processor.state_dict()) + if hasattr(self.processor, "to_k_custom_diffusion"): + processor.to(self.processor.to_k_custom_diffusion.weight.device) else: processor = 
AttnProcessor() @@ -459,6 +483,84 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a return hidden_states +class CustomDiffusionAttnProcessor(nn.Module): + def __init__( + self, + train_kv=True, + train_q_out=True, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + + # `_custom_diffusion` id for easy serialization and loading. + if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states) + value = self.to_v_custom_diffusion(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + class AttnAddedKVProcessor: def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): residual = hidden_states @@ -699,6 +801,91 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a return hidden_states +class CustomDiffusionXFormersAttnProcessor(nn.Module): + def __init__( + self, + train_kv=True, + train_q_out=False, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + attention_op: Optional[Callable] = None, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.attention_op = attention_op + + # `_custom_diffusion` id for easy serialization and loading. 
+ if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states) + value = self.to_v_custom_diffusion(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + query = attn.head_to_batch_dim(query).contiguous() + key = attn.head_to_batch_dim(key).contiguous() + value = attn.head_to_batch_dim(value).contiguous() + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + return hidden_states + + class SlicedAttnProcessor: def __init__(self, slice_size): self.slice_size = slice_size @@ -834,4 +1021,6 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, AttnAddedKVProcessor2_0, LoRAAttnProcessor, LoRAXFormersAttnProcessor, + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, ] diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 15f77fb8c106..2576297762a8 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -22,7 +22,7 @@ from parameterized import parameterized from diffusers import UNet2DConditionModel -from diffusers.models.attention_processor import LoRAAttnProcessor +from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, LoRAAttnProcessor from diffusers.utils import ( floats_tensor, load_hf_numpy, @@ -68,6 +68,55 @@ def create_lora_layers(model, mock_weights: bool = True): return lora_attn_procs +def create_custom_diffusion_layers(model, mock_weights: bool = True): + train_kv = True + train_q_out = True + custom_diffusion_attn_procs = {} + + st 
= model.state_dict() + for name, _ in model.attn_processors.items(): + cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = model.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(model.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = model.config.block_out_channels[block_id] + layer_name = name.split(".processor")[0] + weights = { + "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"], + "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"], + } + if train_q_out: + weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"] + weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"] + weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"] + if cross_attention_dim is not None: + custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( + train_kv=train_kv, + train_q_out=train_q_out, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ).to(model.device) + custom_diffusion_attn_procs[name].load_state_dict(weights) + if mock_weights: + # add 1 to weights to mock trained weights + with torch.no_grad(): + custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight += 1 + custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight += 1 + else: + custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( + train_kv=False, + train_q_out=False, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ) + del st + return custom_diffusion_attn_procs + + class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase): model_class = UNet2DConditionModel @@ -569,6 +618,96 @@ def test_lora_xformers_on_off(self): assert (sample - on_sample).abs().max() < 1e-4 assert (sample - off_sample).abs().max() < 1e-4 + def test_custom_diffusion_processors(self): + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["attention_head_dim"] = (8, 16) + + model = self.model_class(**init_dict) + model.to(torch_device) + + with torch.no_grad(): + sample1 = model(**inputs_dict).sample + + custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False) + + # make sure we can set a list of attention processors + model.set_attn_processor(custom_diffusion_attn_procs) + model.to(torch_device) + + # test that attn processors can be set to itself + model.set_attn_processor(model.attn_processors) + + with torch.no_grad(): + sample2 = model(**inputs_dict).sample + + assert (sample1 - sample2).abs().max() < 1e-4 + + def test_custom_diffusion_save_load(self): + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["attention_head_dim"] = (8, 16) + + torch.manual_seed(0) + model = self.model_class(**init_dict) + model.to(torch_device) + + with torch.no_grad(): + old_sample = model(**inputs_dict).sample + + custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False) + model.set_attn_processor(custom_diffusion_attn_procs) + + with torch.no_grad(): + sample = model(**inputs_dict).sample + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_attn_procs(tmpdirname) + 
self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_custom_diffusion_weights.bin"))) + torch.manual_seed(0) + new_model = self.model_class(**init_dict) + new_model.to(torch_device) + new_model.load_attn_procs(tmpdirname, weight_name="pytorch_custom_diffusion_weights.bin") + + with torch.no_grad(): + new_sample = new_model(**inputs_dict).sample + + assert (sample - new_sample).abs().max() < 1e-4 + + # custom diffusion and no custom diffusion should be the same + assert (sample - old_sample).abs().max() < 1e-4 + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_custom_diffusion_xformers_on_off(self): + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["attention_head_dim"] = (8, 16) + + torch.manual_seed(0) + model = self.model_class(**init_dict) + model.to(torch_device) + custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False) + model.set_attn_processor(custom_diffusion_attn_procs) + + # default + with torch.no_grad(): + sample = model(**inputs_dict).sample + + model.enable_xformers_memory_efficient_attention() + on_sample = model(**inputs_dict).sample + + model.disable_xformers_memory_efficient_attention() + off_sample = model(**inputs_dict).sample + + assert (sample - on_sample).abs().max() < 1e-4 + assert (sample - off_sample).abs().max() < 1e-4 + @slow class UNet2DConditionModelIntegrationTests(unittest.TestCase): From a121e05feb2e6e4ca02c7bc51fae7019e3005d18 Mon Sep 17 00:00:00 2001 From: Mishig Date: Thu, 20 Apr 2023 11:04:06 +0200 Subject: [PATCH 35/71] Update custom_diffusion.mdx (#3165) Add missing newlines for rendering the links correctly --- docs/source/en/training/custom_diffusion.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx index 1e1958e1c946..245d434adeda 100644 --- a/docs/source/en/training/custom_diffusion.mdx +++ b/docs/source/en/training/custom_diffusion.mdx @@ -279,9 +279,11 @@ You can also perform inference from one of the complete checkpoint saved during TODO. ## Set grads to none + To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument. More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html ## Experimental results + You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. From a5b242d30de029828b8279cf291774a66b6b6298 Mon Sep 17 00:00:00 2001 From: XinyuYe-Intel Date: Thu, 20 Apr 2023 18:55:42 +0800 Subject: [PATCH 36/71] Added distillation for quantization example on textual inversion. (#2760) * Added distillation for quantization example on textual inversion. Signed-off-by: Ye, Xinyu * refined readme and code style. Signed-off-by: Ye, Xinyu * Update text2images.py * refined code of model load and added compatibility check. Signed-off-by: Ye, Xinyu * fixed code style. 
Signed-off-by: Ye, Xinyu * fix C403 [*] Unnecessary `list` comprehension (rewrite as a `set` comprehension) Signed-off-by: Ye, Xinyu --------- Signed-off-by: Ye, Xinyu --- .../textual_inversion_dfq/README.md | 93 ++ .../textual_inversion_dfq/requirements.txt | 7 + .../textual_inversion_dfq/text2images.py | 112 ++ .../textual_inversion.py | 1018 +++++++++++++++++ 4 files changed, 1230 insertions(+) create mode 100644 examples/research_projects/intel_opts/textual_inversion_dfq/README.md create mode 100644 examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt create mode 100644 examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py create mode 100644 examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/README.md b/examples/research_projects/intel_opts/textual_inversion_dfq/README.md new file mode 100644 index 000000000000..4a227cdb4d63 --- /dev/null +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/README.md @@ -0,0 +1,93 @@ +# Distillation for quantization on Textual Inversion models to personalize text2image + +[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images._By using just 3-5 images new concepts can be taught to Stable Diffusion and the model personalized on your own images_ +The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion. +We have enabled distillation for quantization in `textual_inversion.py` to do quantization aware training as well as distillation on the model generated by Textual Inversion method. + +## Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +```bash +pip install -r requirements.txt +``` + +## Prepare Datasets + +One picture which is from the huggingface datasets [sd-concepts-library/dicoo2](https://huggingface.co/sd-concepts-library/dicoo2) is needed, and save it to the `./dicoo` directory. The picture is shown below: + + + + + +## Get a FP32 Textual Inversion model + +Use the following command to fine-tune the Stable Diffusion model on the above dataset to obtain the FP32 Textual Inversion model. + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export DATA_DIR="./dicoo" + +accelerate launch textual_inversion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --train_data_dir=$DATA_DIR \ + --learnable_property="object" \ + --placeholder_token="" --initializer_token="toy" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --max_train_steps=3000 \ + --learning_rate=5.0e-04 --scale_lr \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="dicoo_model" +``` + +## Do distillation for quantization + +Distillation for quantization is a method that combines [intermediate layer knowledge distillation](https://github.com/intel/neural-compressor/blob/master/docs/source/distillation.md#intermediate-layer-knowledge-distillation) and [quantization aware training](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization.md#quantization-aware-training) in the same training process to improve the performance of the quantized model. 
Provided a FP32 model, the distillation for quantization approach will take this model itself as the teacher model and transfer the knowledges of the specified layers to the student model, i.e. quantized version of the FP32 model, during the quantization aware training process. + +Once you have the FP32 Textual Inversion model, the following command will take the FP32 Textual Inversion model as input to do distillation for quantization and generate the INT8 Textual Inversion model. + +```bash +export FP32_MODEL_NAME="./dicoo_model" +export DATA_DIR="./dicoo" + +accelerate launch textual_inversion.py \ + --pretrained_model_name_or_path=$FP32_MODEL_NAME \ + --train_data_dir=$DATA_DIR \ + --use_ema --learnable_property="object" \ + --placeholder_token="" --initializer_token="toy" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --max_train_steps=300 \ + --learning_rate=5.0e-04 --max_grad_norm=3 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="int8_model" \ + --do_quantization --do_distillation --verify_loading +``` + +After the distillation for quantization process, the quantized UNet would be 4 times smaller (3279MB -> 827MB). + +## Inference + +Once you have trained a INT8 model with the above command, the inference can be done simply using the `text2images.py` script. Make sure to include the `placeholder_token` in your prompt. + +```bash +export INT8_MODEL_NAME="./int8_model" + +python text2images.py \ + --pretrained_model_name_or_path=$INT8_MODEL_NAME \ + --caption "a lovely in red dress and hat, in the snowly and brightly night, with many brighly buildings." \ + --images_num 4 +``` + +Here is the comparison of images generated by the FP32 model (left) and INT8 model (right) respectively: + +

+ *(comparison images: FP32 model output on the left, INT8 model output on the right)*
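To make the distillation-for-quantization recipe described above a bit more concrete, here is a rough, purely conceptual PyTorch sketch of the combined objective (it is not the `neural_compressor` code path that `textual_inversion.py` actually configures): the frozen FP32 UNet serves as the teacher, its quantized copy is the student, and an intermediate-layer MSE term is added to the usual denoising loss during quantization-aware training.

```python
# Conceptual sketch of "distillation for quantization" (assumed structure, not the library API):
# the task loss uses the quantized student's prediction, and an intermediate-layer
# knowledge-distillation term pulls the student's features toward the FP32 teacher's.
import torch
import torch.nn.functional as F


def distill_for_quant_loss(student_pred, target, student_feats, teacher_feats, distill_weight=1.0):
    # Standard denoising (task) loss on the quantized student.
    task_loss = F.mse_loss(student_pred.float(), target.float())

    # Intermediate-layer distillation: match selected hidden features of the student
    # to the detached features of the frozen FP32 teacher.
    kd_loss = sum(
        F.mse_loss(s.float(), t.detach().float()) for s, t in zip(student_feats, teacher_feats)
    ) / max(len(student_feats), 1)

    return task_loss + distill_weight * kd_loss
```

Minimizing this combined loss while fake-quantization observers are attached to the student is what lets the INT8 UNet track both the denoising target and the teacher's feature space.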

+ diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt b/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt new file mode 100644 index 000000000000..cbd4c957be44 --- /dev/null +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt @@ -0,0 +1,7 @@ +accelerate +torchvision +transformers>=4.25.0 +ftfy +tensorboard +modelcards +neural-compressor \ No newline at end of file diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py b/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py new file mode 100644 index 000000000000..a99d727712eb --- /dev/null +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py @@ -0,0 +1,112 @@ +import argparse +import math +import os + +import torch +from neural_compressor.utils.pytorch import load +from PIL import Image +from transformers import CLIPTextModel, CLIPTokenizer + +from diffusers import AutoencoderKL, StableDiffusionPipeline, UNet2DConditionModel + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "-c", + "--caption", + type=str, + default="robotic cat with wings", + help="Text used to generate images.", + ) + parser.add_argument( + "-n", + "--images_num", + type=int, + default=4, + help="How much images to generate.", + ) + parser.add_argument( + "-s", + "--seed", + type=int, + default=42, + help="Seed for random process.", + ) + parser.add_argument( + "-ci", + "--cuda_id", + type=int, + default=0, + help="cuda_id.", + ) + args = parser.parse_args() + return args + + +def image_grid(imgs, rows, cols): + if not len(imgs) == rows * cols: + raise ValueError("The specified number of rows and columns are not correct.") + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + grid_w, grid_h = grid.size + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + +def generate_images( + pipeline, + prompt="robotic cat with wings", + guidance_scale=7.5, + num_inference_steps=50, + num_images_per_prompt=1, + seed=42, +): + generator = torch.Generator(pipeline.device).manual_seed(seed) + images = pipeline( + prompt, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator=generator, + num_images_per_prompt=num_images_per_prompt, + ).images + _rows = int(math.sqrt(num_images_per_prompt)) + grid = image_grid(images, rows=_rows, cols=num_images_per_prompt // _rows) + return grid, images + + +args = parse_args() +# Load models and create wrapper for stable diffusion +tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer") +text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder") +vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") +unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet") + +pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, text_encoder=text_encoder, vae=vae, unet=unet, tokenizer=tokenizer +) +pipeline.safety_checker = lambda images, clip_input: (images, False) +if os.path.exists(os.path.join(args.pretrained_model_name_or_path, "best_model.pt")): + 
unet = load(args.pretrained_model_name_or_path, model=unet) + unet.eval() + setattr(pipeline, "unet", unet) +else: + unet = unet.to(torch.device("cuda", args.cuda_id)) +pipeline = pipeline.to(unet.device) +grid, images = generate_images(pipeline, prompt=args.caption, num_images_per_prompt=args.images_num, seed=args.seed) +grid.save(os.path.join(args.pretrained_model_name_or_path, "{}.png".format("_".join(args.caption.split())))) +dirname = os.path.join(args.pretrained_model_name_or_path, "_".join(args.caption.split())) +os.makedirs(dirname, exist_ok=True) +for idx, image in enumerate(images): + image.save(os.path.join(dirname, "{}.png".format(idx + 1))) diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py b/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py new file mode 100644 index 000000000000..7afb6c67ef8e --- /dev/null +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py @@ -0,0 +1,1018 @@ +import argparse +import itertools +import math +import os +import random +from pathlib import Path +from typing import Iterable, Optional + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from accelerate import Accelerator +from accelerate.utils import set_seed +from huggingface_hub import HfFolder, Repository, whoami +from neural_compressor.utils import logger +from packaging import version +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer + +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from diffusers.optimization import get_scheduler + + +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): + PIL_INTERPOLATION = { + "linear": PIL.Image.Resampling.BILINEAR, + "bilinear": PIL.Image.Resampling.BILINEAR, + "bicubic": PIL.Image.Resampling.BICUBIC, + "lanczos": PIL.Image.Resampling.LANCZOS, + "nearest": PIL.Image.Resampling.NEAREST, + } +else: + PIL_INTERPOLATION = { + "linear": PIL.Image.LINEAR, + "bilinear": PIL.Image.BILINEAR, + "bicubic": PIL.Image.BICUBIC, + "lanczos": PIL.Image.LANCZOS, + "nearest": PIL.Image.NEAREST, + } +# ------------------------------------------------------------------------------ + + +def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path): + logger.info("Saving embeddings") + learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_id] + learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()} + torch.save(learned_embeds_dict, save_path) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Example of distillation for quantization on Textual Inversion.") + parser.add_argument( + "--save_steps", + type=int, + default=500, + help="Save learned_embeds.bin every X updates steps.", + ) + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) 
+ parser.add_argument( + "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data." + ) + parser.add_argument( + "--placeholder_token", + type=str, + default=None, + required=True, + help="A token to use as a placeholder for the concept.", + ) + parser.add_argument( + "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word." + ) + parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'") + parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.") + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument( + "--max_train_steps", + type=int, + default=5000, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default="no", + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose" + "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10." + "and an Nvidia Ampere GPU." + ), + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--do_quantization", action="store_true", help="Whether or not to do quantization.") + parser.add_argument("--do_distillation", action="store_true", help="Whether or not to do distillation.") + parser.add_argument( + "--verify_loading", action="store_true", help="Whether or not to verify the loading of the quantized model." 
+ ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.train_data_dir is None: + raise ValueError("You must specify a train data directory.") + + return args + + +imagenet_templates_small = [ + "a photo of a {}", + "a rendering of a {}", + "a cropped photo of the {}", + "the photo of a {}", + "a photo of a clean {}", + "a photo of a dirty {}", + "a dark photo of the {}", + "a photo of my {}", + "a photo of the cool {}", + "a close-up photo of a {}", + "a bright photo of the {}", + "a cropped photo of a {}", + "a photo of the {}", + "a good photo of the {}", + "a photo of one {}", + "a close-up photo of the {}", + "a rendition of the {}", + "a photo of the clean {}", + "a rendition of a {}", + "a photo of a nice {}", + "a good photo of a {}", + "a photo of the nice {}", + "a photo of the small {}", + "a photo of the weird {}", + "a photo of the large {}", + "a photo of a cool {}", + "a photo of a small {}", +] + +imagenet_style_templates_small = [ + "a painting in the style of {}", + "a rendering in the style of {}", + "a cropped painting in the style of {}", + "the painting in the style of {}", + "a clean painting in the style of {}", + "a dirty painting in the style of {}", + "a dark painting in the style of {}", + "a picture in the style of {}", + "a cool painting in the style of {}", + "a close-up painting in the style of {}", + "a bright painting in the style of {}", + "a cropped painting in the style of {}", + "a good painting in the style of {}", + "a close-up painting in the style of {}", + "a rendition in the style of {}", + "a nice painting in the style of {}", + "a small painting in the style of {}", + "a weird painting in the style of {}", + "a large painting in the style of {}", +] + + +# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14 +class EMAModel: + """ + Exponential Moving Average of models weights + """ + + def __init__(self, parameters: Iterable[torch.nn.Parameter], decay=0.9999): + parameters = list(parameters) + self.shadow_params = [p.clone().detach() for p in parameters] + + self.decay = decay + self.optimization_step = 0 + + def get_decay(self, optimization_step): + """ + Compute the decay factor for the exponential moving average. + """ + value = (1 + optimization_step) / (10 + optimization_step) + return 1 - min(self.decay, value) + + @torch.no_grad() + def step(self, parameters): + parameters = list(parameters) + + self.optimization_step += 1 + self.decay = self.get_decay(self.optimization_step) + + for s_param, param in zip(self.shadow_params, parameters): + if param.requires_grad: + tmp = self.decay * (s_param - param) + s_param.sub_(tmp) + else: + s_param.copy_(param) + + torch.cuda.empty_cache() + + def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None: + """ + Copy current averaged parameters into given collection of parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored moving averages. If `None`, the + parameters with which this `ExponentialMovingAverage` was + initialized will be used. 
+ """ + parameters = list(parameters) + for s_param, param in zip(self.shadow_params, parameters): + param.data.copy_(s_param.data) + + def to(self, device=None, dtype=None) -> None: + r"""Move internal buffers of the ExponentialMovingAverage to `device`. + Args: + device: like `device` argument to `torch.Tensor.to` + """ + # .to() on the tensors handles None correctly + self.shadow_params = [ + p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device) + for p in self.shadow_params + ] + + +class TextualInversionDataset(Dataset): + def __init__( + self, + data_root, + tokenizer, + learnable_property="object", # [object, style] + size=512, + repeats=100, + interpolation="bicubic", + flip_p=0.5, + set="train", + placeholder_token="*", + center_crop=False, + ): + self.data_root = data_root + self.tokenizer = tokenizer + self.learnable_property = learnable_property + self.size = size + self.placeholder_token = placeholder_token + self.center_crop = center_crop + self.flip_p = flip_p + + self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)] + + self.num_images = len(self.image_paths) + self._length = self.num_images + + if set == "train": + self._length = self.num_images * repeats + + self.interpolation = { + "linear": PIL_INTERPOLATION["linear"], + "bilinear": PIL_INTERPOLATION["bilinear"], + "bicubic": PIL_INTERPOLATION["bicubic"], + "lanczos": PIL_INTERPOLATION["lanczos"], + }[interpolation] + + self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small + self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p) + + def __len__(self): + return self._length + + def __getitem__(self, i): + example = {} + image = Image.open(self.image_paths[i % self.num_images]) + + if not image.mode == "RGB": + image = image.convert("RGB") + + placeholder_string = self.placeholder_token + text = random.choice(self.templates).format(placeholder_string) + + example["input_ids"] = self.tokenizer( + text, + padding="max_length", + truncation=True, + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids[0] + + # default to score-sde preprocessing + img = np.array(image).astype(np.uint8) + + if self.center_crop: + crop = min(img.shape[0], img.shape[1]) + ( + h, + w, + ) = ( + img.shape[0], + img.shape[1], + ) + img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2] + + image = Image.fromarray(img) + image = image.resize((self.size, self.size), resample=self.interpolation) + + image = self.flip_transform(image) + image = np.array(image).astype(np.uint8) + image = (image / 127.5 - 1.0).astype(np.float32) + + example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1) + return example + + +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): + if token is None: + token = HfFolder.get_token() + if organization is None: + username = whoami(token)["name"] + return f"{username}/{model_id}" + else: + return f"{organization}/{model_id}" + + +def freeze_params(params): + for param in params: + param.requires_grad = False + + +def image_grid(imgs, rows, cols): + if not len(imgs) == rows * cols: + raise ValueError("The specified number of rows and columns are not correct.") + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + grid_w, grid_h = grid.size + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + 
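+# Note: `image_grid` above requires `rows * cols == len(imgs)`. `generate_images` below derives
+# `rows` from `int(math.sqrt(num_images_per_prompt))`, so it only works for image counts that
+# `int(sqrt(n))` divides evenly (e.g. 1, 2, 4, 8, 9, 16); other counts raise the ValueError above.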
+def generate_images(pipeline, prompt="", guidance_scale=7.5, num_inference_steps=50, num_images_per_prompt=1, seed=42): + generator = torch.Generator(pipeline.device).manual_seed(seed) + images = pipeline( + prompt, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator=generator, + num_images_per_prompt=num_images_per_prompt, + ).images + _rows = int(math.sqrt(num_images_per_prompt)) + grid = image_grid(images, rows=_rows, cols=num_images_per_prompt // _rows) + return grid + + +def main(): + args = parse_args() + logging_dir = os.path.join(args.output_dir, args.logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with="tensorboard", + logging_dir=logging_dir, + ) + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer and add the placeholder token as a additional special token + if args.tokenizer_name: + tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name) + elif args.pretrained_model_name_or_path: + tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer") + + # Load models and create wrapper for stable diffusion + noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = CLIPTextModel.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="text_encoder", + revision=args.revision, + ) + vae = AutoencoderKL.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="vae", + revision=args.revision, + ) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="unet", + revision=args.revision, + ) + + train_unet = False + # Freeze vae and unet + freeze_params(vae.parameters()) + if not args.do_quantization and not args.do_distillation: + # Add the placeholder token in tokenizer + num_added_tokens = tokenizer.add_tokens(args.placeholder_token) + if num_added_tokens == 0: + raise ValueError( + f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different" + " `placeholder_token` that is not already in the tokenizer." 
+ ) + + # Convert the initializer_token, placeholder_token to ids + token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False) + # Check if initializer_token is a single token or a sequence of tokens + if len(token_ids) > 1: + raise ValueError("The initializer token must be a single token.") + + initializer_token_id = token_ids[0] + placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) + # Resize the token embeddings as we are adding new special tokens to the tokenizer + text_encoder.resize_token_embeddings(len(tokenizer)) + + # Initialise the newly added placeholder token with the embeddings of the initializer token + token_embeds = text_encoder.get_input_embeddings().weight.data + token_embeds[placeholder_token_id] = token_embeds[initializer_token_id] + + freeze_params(unet.parameters()) + # Freeze all parameters except for the token embeddings in text encoder + params_to_freeze = itertools.chain( + text_encoder.text_model.encoder.parameters(), + text_encoder.text_model.final_layer_norm.parameters(), + text_encoder.text_model.embeddings.position_embedding.parameters(), + ) + freeze_params(params_to_freeze) + else: + train_unet = True + freeze_params(text_encoder.parameters()) + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Initialize the optimizer + optimizer = torch.optim.AdamW( + # only optimize the unet or embeddings of text_encoder + unet.parameters() if train_unet else text_encoder.get_input_embeddings().parameters(), + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + train_dataset = TextualInversionDataset( + data_root=args.train_data_dir, + tokenizer=tokenizer, + size=args.resolution, + placeholder_token=args.placeholder_token, + repeats=args.repeats, + learnable_property=args.learnable_property, + center_crop=args.center_crop, + set="train", + ) + train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + if not train_unet: + text_encoder = accelerator.prepare(text_encoder) + unet.to(accelerator.device) + unet.eval() + else: + unet = accelerator.prepare(unet) + text_encoder.to(accelerator.device) + text_encoder.eval() + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + + # Move vae to device + vae.to(accelerator.device) + + # Keep vae in eval model as we don't train these + vae.eval() + + compression_manager = None + + def train_func(model): + if train_unet: + unet_ = model + text_encoder_ = text_encoder + else: + unet_ = unet + text_encoder_ = model + # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
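+        # (`accelerator.prepare` shards the dataloader across processes, so its length can differ
+        # from the value computed before preparation.)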
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("textual_inversion", config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + global_step = 0 + + if train_unet and args.use_ema: + ema_unet = EMAModel(unet_.parameters()) + + for epoch in range(args.num_train_epochs): + model.train() + train_loss = 0.0 + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(model): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn(latents.shape).to(latents.device) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ).long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder_(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet_(noisy_latents, timesteps, encoder_hidden_states).sample + + loss = F.mse_loss(model_pred, noise, reduction="none").mean([1, 2, 3]).mean() + if train_unet and compression_manager: + unet_inputs = { + "sample": noisy_latents, + "timestep": timesteps, + "encoder_hidden_states": encoder_hidden_states, + } + loss = compression_manager.callbacks.on_after_compute_loss(unet_inputs, model_pred, loss) + + # Gather the losses across all processes for logging (if we use distributed training). 
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + train_loss += avg_loss.item() / args.gradient_accumulation_steps + + # Backpropagate + accelerator.backward(loss) + + if train_unet: + if accelerator.sync_gradients: + accelerator.clip_grad_norm_(unet_.parameters(), args.max_grad_norm) + else: + # Zero out the gradients for all token embeddings except the newly added + # embeddings for the concept, as we only want to optimize the concept embeddings + if accelerator.num_processes > 1: + grads = text_encoder_.module.get_input_embeddings().weight.grad + else: + grads = text_encoder_.get_input_embeddings().weight.grad + # Get the index for tokens that we want to zero the grads for + index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id + grads.data[index_grads_to_zero, :] = grads.data[index_grads_to_zero, :].fill_(0) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + if train_unet and args.use_ema: + ema_unet.step(unet_.parameters()) + progress_bar.update(1) + global_step += 1 + accelerator.log({"train_loss": train_loss}, step=global_step) + train_loss = 0.0 + if not train_unet and global_step % args.save_steps == 0: + save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin") + save_progress(text_encoder_, placeholder_token_id, accelerator, args, save_path) + + logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if global_step >= args.max_train_steps: + break + accelerator.wait_for_everyone() + + if train_unet and args.use_ema: + ema_unet.copy_to(unet_.parameters()) + + if not train_unet: + return text_encoder_ + + if not train_unet: + text_encoder = train_func(text_encoder) + else: + import copy + + model = copy.deepcopy(unet) + confs = [] + if args.do_quantization: + from neural_compressor import QuantizationAwareTrainingConfig + + q_conf = QuantizationAwareTrainingConfig() + confs.append(q_conf) + + if args.do_distillation: + teacher_model = copy.deepcopy(model) + + def attention_fetcher(x): + return x.sample + + layer_mappings = [ + [ + [ + "conv_in", + ] + ], + [ + [ + "time_embedding", + ] + ], + [["down_blocks.0.attentions.0", attention_fetcher]], + [["down_blocks.0.attentions.1", attention_fetcher]], + [ + [ + "down_blocks.0.resnets.0", + ] + ], + [ + [ + "down_blocks.0.resnets.1", + ] + ], + [ + [ + "down_blocks.0.downsamplers.0", + ] + ], + [["down_blocks.1.attentions.0", attention_fetcher]], + [["down_blocks.1.attentions.1", attention_fetcher]], + [ + [ + "down_blocks.1.resnets.0", + ] + ], + [ + [ + "down_blocks.1.resnets.1", + ] + ], + [ + [ + "down_blocks.1.downsamplers.0", + ] + ], + [["down_blocks.2.attentions.0", attention_fetcher]], + [["down_blocks.2.attentions.1", attention_fetcher]], + [ + [ + "down_blocks.2.resnets.0", + ] + ], + [ + [ + "down_blocks.2.resnets.1", + ] + ], + [ + [ + "down_blocks.2.downsamplers.0", + ] + ], + [ + [ + "down_blocks.3.resnets.0", + ] + ], + [ + [ + "down_blocks.3.resnets.1", + ] + ], + [ + [ + "up_blocks.0.resnets.0", + ] + ], + [ + [ + "up_blocks.0.resnets.1", + ] + ], + [ + [ + "up_blocks.0.resnets.2", + ] + ], + [ + [ + "up_blocks.0.upsamplers.0", + ] + ], + [["up_blocks.1.attentions.0", attention_fetcher]], + [["up_blocks.1.attentions.1", attention_fetcher]], + [["up_blocks.1.attentions.2", attention_fetcher]], + [ + [ + 
"up_blocks.1.resnets.0", + ] + ], + [ + [ + "up_blocks.1.resnets.1", + ] + ], + [ + [ + "up_blocks.1.resnets.2", + ] + ], + [ + [ + "up_blocks.1.upsamplers.0", + ] + ], + [["up_blocks.2.attentions.0", attention_fetcher]], + [["up_blocks.2.attentions.1", attention_fetcher]], + [["up_blocks.2.attentions.2", attention_fetcher]], + [ + [ + "up_blocks.2.resnets.0", + ] + ], + [ + [ + "up_blocks.2.resnets.1", + ] + ], + [ + [ + "up_blocks.2.resnets.2", + ] + ], + [ + [ + "up_blocks.2.upsamplers.0", + ] + ], + [["up_blocks.3.attentions.0", attention_fetcher]], + [["up_blocks.3.attentions.1", attention_fetcher]], + [["up_blocks.3.attentions.2", attention_fetcher]], + [ + [ + "up_blocks.3.resnets.0", + ] + ], + [ + [ + "up_blocks.3.resnets.1", + ] + ], + [ + [ + "up_blocks.3.resnets.2", + ] + ], + [["mid_block.attentions.0", attention_fetcher]], + [ + [ + "mid_block.resnets.0", + ] + ], + [ + [ + "mid_block.resnets.1", + ] + ], + [ + [ + "conv_out", + ] + ], + ] + layer_names = [layer_mapping[0][0] for layer_mapping in layer_mappings] + if not set(layer_names).issubset([n[0] for n in model.named_modules()]): + raise ValueError( + "Provided model is not compatible with the default layer_mappings, " + 'please use the model fine-tuned from "CompVis/stable-diffusion-v1-4", ' + "or modify the layer_mappings variable to fit your model." + f"\nDefault layer_mappings are as such:\n{layer_mappings}" + ) + from neural_compressor.config import DistillationConfig, IntermediateLayersKnowledgeDistillationLossConfig + + distillation_criterion = IntermediateLayersKnowledgeDistillationLossConfig( + layer_mappings=layer_mappings, + loss_types=["MSE"] * len(layer_mappings), + loss_weights=[1.0 / len(layer_mappings)] * len(layer_mappings), + add_origin_loss=True, + ) + d_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion) + confs.append(d_conf) + + from neural_compressor.training import prepare_compression + + compression_manager = prepare_compression(model, confs) + compression_manager.callbacks.on_train_begin() + model = compression_manager.model + train_func(model) + compression_manager.callbacks.on_train_end() + + # Save the resulting model and its corresponding configuration in the given directory + model.save(args.output_dir) + + logger.info(f"Optimized model saved to: {args.output_dir}.") + + # change to framework model for further use + model = model.model + + # Create the pipeline using using the trained modules and save it. 
+ templates = imagenet_style_templates_small if args.learnable_property == "style" else imagenet_templates_small + prompt = templates[0].format(args.placeholder_token) + if accelerator.is_main_process: + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + text_encoder=accelerator.unwrap_model(text_encoder), + vae=vae, + unet=accelerator.unwrap_model(unet), + tokenizer=tokenizer, + ) + pipeline.save_pretrained(args.output_dir) + pipeline = pipeline.to(unet.device) + baseline_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed) + baseline_model_images.save( + os.path.join(args.output_dir, "{}_baseline_model.png".format("_".join(prompt.split()))) + ) + + if not train_unet: + # Also save the newly trained embeddings + save_path = os.path.join(args.output_dir, "learned_embeds.bin") + save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path) + else: + setattr(pipeline, "unet", accelerator.unwrap_model(model)) + if args.do_quantization: + pipeline = pipeline.to(torch.device("cpu")) + + optimized_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed) + optimized_model_images.save( + os.path.join(args.output_dir, "{}_optimized_model.png".format("_".join(prompt.split()))) + ) + + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) + + accelerator.end_training() + + if args.do_quantization and args.verify_loading: + # Load the model obtained after Intel Neural Compressor quantization + from neural_compressor.utils.pytorch import load + + loaded_model = load(args.output_dir, model=unet) + loaded_model.eval() + + setattr(pipeline, "unet", loaded_model) + if args.do_quantization: + pipeline = pipeline.to(torch.device("cpu")) + + loaded_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed) + if loaded_model_images != optimized_model_images: + logger.info("The quantized model was not successfully loaded.") + else: + logger.info("The quantized model was successfully loaded.") + + +if __name__ == "__main__": + main() From 17470057d202e2f6841398ccc627f3697b01c51b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 20 Apr 2023 13:09:20 +0200 Subject: [PATCH 37/71] make style --- src/diffusers/loaders.py | 2 +- tests/models/test_lora_layers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 82c1ac61ca9e..b4c443fd303b 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -848,7 +848,7 @@ def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): """ # Loop over the original attention modules. for name, _ in self.text_encoder.named_modules(): - if any([x in name for x in TEXT_ENCODER_TARGET_MODULES]): + if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): # Retrieve the module and its corresponding LoRA processor. module = self.text_encoder.get_submodule(name) # Construct a new function that performs the LoRA merging. 
We will monkey patch diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 6f75902d388f..6f1e85e15558 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -46,7 +46,7 @@ def create_unet_lora_layers(unet: nn.Module): def create_text_encoder_lora_layers(text_encoder: nn.Module): text_lora_attn_procs = {} for name, module in text_encoder.named_modules(): - if any([x in name for x in TEXT_ENCODER_TARGET_MODULES]): + if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): text_lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=module.out_features, cross_attention_dim=None) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) return text_encoder_lora_layers From 7b0ba4820a7546520da4b099fc6c523d5b6d3383 Mon Sep 17 00:00:00 2001 From: clarencechen Date: Thu, 20 Apr 2023 04:13:47 -0700 Subject: [PATCH 38/71] Update Noise Autocorrelation Loss Function for Pix2PixZero Pipeline (#2942) * Update Pix2PixZero Auto-correlation Loss * Add fast inversion tests * Clarify purpose and mark as deprecated Fix inversion prompt broadcasting * Register modules set to `None` in config for `test_save_load_optional_components` * Update new tests to coordinate with #2953 --- .../pipeline_stable_diffusion_pix2pix_zero.py | 64 +++++++------ .../test_stable_diffusion_pix2pix_zero.py | 91 ++++++++++++++++++- 2 files changed, 123 insertions(+), 32 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 0239c8128171..6444ec7c8506 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -36,6 +36,7 @@ from ...utils import ( PIL_INTERPOLATION, BaseOutput, + deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -721,23 +722,31 @@ def prepare_image_latents(self, image, batch_size, dtype, device, generator=None ) if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) + latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)] + latents = torch.cat(latents, dim=0) else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) + latents = self.vae.encode(image).latent_dist.sample(generator) + + latents = self.vae.config.scaling_factor * latents + + if batch_size != latents.shape[0]: + if batch_size % latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_latents_per_image = batch_size // latents.shape[0] + latents = torch.cat([latents] * additional_latents_per_image, dim=0) + else: + raise ValueError( + f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts." + ) else: - init_latents = torch.cat([init_latents], dim=0) - - latents = init_latents + latents = torch.cat([latents], dim=0) return latents @@ -759,23 +768,18 @@ def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep ) def auto_corr_loss(self, hidden_states, generator=None): - batch_size, channel, height, width = hidden_states.shape - if batch_size > 1: - raise ValueError("Only batch_size 1 is supported for now") - - hidden_states = hidden_states.squeeze(0) - # hidden_states must be shape [C,H,W] now reg_loss = 0.0 for i in range(hidden_states.shape[0]): - noise = hidden_states[i][None, None, :, :] - while True: - roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item() - reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2 - reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2 - - if noise.shape[2] <= 8: - break - noise = F.avg_pool2d(noise, kernel_size=2) + for j in range(hidden_states.shape[1]): + noise = hidden_states[i : i + 1, j : j + 1, :, :] + while True: + roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item() + reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2 + reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2 + + if noise.shape[2] <= 8: + break + noise = F.avg_pool2d(noise, kernel_size=2) return reg_loss def kl_divergence(self, hidden_states): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 0809a91041ce..661926daaa3e 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -14,6 +14,8 @@ # limitations under the License. 
import gc +import random +import tempfile import unittest import numpy as np @@ -30,7 +32,7 @@ StableDiffusionPix2PixZeroPipeline, UNet2DConditionModel, ) -from diffusers.utils import load_numpy, slow, torch_device +from diffusers.utils import floats_tensor, load_numpy, slow, torch_device from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS @@ -69,6 +71,7 @@ def get_dummy_components(self): cross_attention_dim=32, ) scheduler = DDIMScheduler() + inverse_scheduler = DDIMInverseScheduler() torch.manual_seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -101,7 +104,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, - "inverse_scheduler": None, + "inverse_scheduler": inverse_scheduler, "caption_generator": None, "caption_processor": None, } @@ -122,6 +125,90 @@ def get_dummy_inputs(self, device, seed=0): } return inputs + def get_dummy_inversion_inputs(self, device, seed=0): + dummy_image = floats_tensor((2, 3, 32, 32), rng=random.Random(seed)).to(torch_device) + generator = torch.manual_seed(seed) + + inputs = { + "prompt": [ + "A painting of a squirrel eating a burger", + "A painting of a burger eating a squirrel", + ], + "image": dummy_image.cpu(), + "num_inference_steps": 2, + "guidance_scale": 6.0, + "generator": generator, + "output_type": "numpy", + } + return inputs + + def test_save_load_optional_components(self): + if not hasattr(self.pipeline_class, "_optional_components"): + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + # set all optional components to None and update pipeline config accordingly + for optional_component in pipe._optional_components: + setattr(pipe, optional_component, None) + pipe.register_modules(**{optional_component: None for optional_component in pipe._optional_components}) + + inputs = self.get_dummy_inputs(torch_device) + output = pipe(**inputs)[0] + + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) + pipe_loaded.to(torch_device) + pipe_loaded.set_progress_bar_config(disable=None) + + for optional_component in pipe._optional_components: + self.assertTrue( + getattr(pipe_loaded, optional_component) is None, + f"`{optional_component}` did not stay set to None after loading.", + ) + + inputs = self.get_dummy_inputs(torch_device) + output_loaded = pipe_loaded(**inputs)[0] + + max_diff = np.abs(output - output_loaded).max() + self.assertLess(max_diff, 1e-4) + + def test_stable_diffusion_pix2pix_zero_inversion(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inversion_inputs(device) + inputs["image"] = inputs["image"][:1] + inputs["prompt"] = inputs["prompt"][:1] + image = sd_pipe.invert(**inputs).images + image_slice = image[0, -3:, -3:, -1] + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array([0.4833, 0.4696, 0.5574, 0.5194, 0.5248, 0.5638, 0.5040, 0.5423, 0.5072]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def 
test_stable_diffusion_pix2pix_zero_inversion_batch(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inversion_inputs(device) + image = sd_pipe.invert(**inputs).images + image_slice = image[1, -3:, -3:, -1] + assert image.shape == (2, 32, 32, 3) + expected_slice = np.array([0.6672, 0.5203, 0.4908, 0.4376, 0.4517, 0.5544, 0.4605, 0.4826, 0.5007]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + def test_stable_diffusion_pix2pix_zero_default_case(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() From 3045fb276352681f6b9075956e599dd8ef571872 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 20 Apr 2023 17:25:17 +0530 Subject: [PATCH 39/71] [DreamBooth] add text encoder LoRA support in the DreamBooth training script (#3130) * add: LoRA text encoder support for DreamBooth example. * fix initialization. * fix: modification call. * add: entry in the readme. * use dog dataset from hub. * fix: params to clip. * add entry to the LoRA doc. * add: tests for lora. * remove unnecessary list comprehension./ --- docs/source/en/training/dreambooth.mdx | 13 ++- docs/source/en/training/lora.mdx | 9 +- examples/dreambooth/README.md | 43 +++++--- examples/dreambooth/train_dreambooth_lora.py | 101 +++++++++++++++---- examples/test_examples.py | 63 ++++++++++++ 5 files changed, 197 insertions(+), 32 deletions(-) diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx index 908355e496dc..88ded0e009dc 100644 --- a/docs/source/en/training/dreambooth.mdx +++ b/docs/source/en/training/dreambooth.mdx @@ -60,7 +60,18 @@ DreamBooth finetuning is very sensitive to hyperparameters and easy to overfit. -Let's try DreamBooth with a [few images of a dog](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ); download and save them to a directory and then set the `INSTANCE_DIR` environment variable to that path: +Let's try DreamBooth with a +[few images of a dog](https://huggingface.co/datasets/diffusers/dog-example); +download and save them to a directory and then set the `INSTANCE_DIR` environment variable to that path: + +```python +local_dir = "./path_to_training_images" +snapshot_download( + "diffusers/dog-example", + local_dir=local_dir, repo_type="dataset", + ignore_patterns=".gitattributes", +) +``` ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx index 1c72fbbc8d58..ac2311df9f1e 100644 --- a/docs/source/en/training/lora.mdx +++ b/docs/source/en/training/lora.mdx @@ -16,7 +16,9 @@ specific language governing permissions and limitations under the License. -Currently, LoRA is only supported for the attention layers of the [`UNet2DConditionalModel`]. +Currently, LoRA is only supported for the attention layers of the [`UNet2DConditionalModel`]. We also +support LoRA fine-tuning of the text encoder for DreamBooth in a limited capacity. For more details on how we support +LoRA fine-tuning of the text encoder, refer to the discussion on [this PR](https://github.com/huggingface/diffusers/pull/2918). 
@@ -175,6 +177,11 @@ accelerate launch train_dreambooth_lora.py \ --push_to_hub ``` +It's also possible to additionally fine-tune the text encoder with LoRA. This, in most cases, leads +to better results with a slight increase in the compute. To allow fine-tuning the text encoder with LoRA, +specify the `--train_text_encoder` while launching the `train_dreambooth_lora.py` script. + + ### Inference[[dreambooth-inference]] Now you can use the model for inference by loading the base model in the [`StableDiffusionPipeline`]: diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md index d53f17114404..8447c7560720 100644 --- a/examples/dreambooth/README.md +++ b/examples/dreambooth/README.md @@ -45,15 +45,28 @@ write_basic_config() ### Dog toy example -Now let's get our dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. This will be our training data. +Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example. -And launch the training using +Let's first download it locally: + +```python +from huggingface_hub import snapshot_download + +local_dir = "./dog" +snapshot_download( + "diffusers/dog-example", + local_dir=local_dir, repo_type="dataset", + ignore_patterns=".gitattributes", +) +``` + +And launch the training using: **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export OUTPUT_DIR="path-to-save-model" accelerate launch train_dreambooth.py \ @@ -77,7 +90,7 @@ According to the paper, it's recommended to generate `num_epochs * num_samples` ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -108,7 +121,7 @@ To install `bitandbytes` please refer to this [readme](https://github.com/TimDet ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -141,7 +154,7 @@ It is possible to run dreambooth on a 12GB GPU by using the following optimizati ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -185,7 +198,7 @@ does not seem to be compatible with DeepSpeed at the moment. ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -217,7 +230,7 @@ ___Note: Training text encoder requires more memory, with this option the traini ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -300,7 +313,7 @@ Now, you can launch the training. 
Here we will use [Stable Diffusion 1-5](https: ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export OUTPUT_DIR="path-to-save-model" ``` @@ -342,6 +355,12 @@ The final LoRA embedding weights have been uploaded to [patrickvonplaten/lora_dr The training results are summarized [here](https://api.wandb.ai/report/patrickvonplaten/xm6cd5q5). You can use the `Step` slider to see how the model learned the features of our subject while the model trained. +Optionally, we can also train additional LoRA layers for the text encoder. Specify the `train_text_encoder` argument above for that. If you're interested to know more about how we +enable this support, check out this [PR](https://github.com/huggingface/diffusers/pull/2918). + +With the default hyperparameters from the above, the training seems to go in a positive direction. Check out [this panel](https://wandb.ai/sayakpaul/dreambooth-lora/reports/test-23-04-17-17-00-13---Vmlldzo0MDkwNjMy). The trained LoRA layers are available [here](https://huggingface.co/sayakpaul/dreambooth). + + ### Inference After training, LoRA weights can be loaded very easily into the original pipeline. First, you need to @@ -386,7 +405,7 @@ pip install -U -r requirements_flax.txt ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export OUTPUT_DIR="path-to-save-model" python train_dreambooth_flax.py \ @@ -405,7 +424,7 @@ python train_dreambooth_flax.py \ ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -429,7 +448,7 @@ python train_dreambooth_flax.py \ ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index d360939c8c0c..1b75402c3550 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -15,6 +15,7 @@ import argparse import hashlib +import itertools import logging import math import os @@ -43,12 +44,13 @@ DDPMScheduler, DiffusionPipeline, DPMSolverMultistepScheduler, + StableDiffusionPipeline, UNet2DConditionModel, ) -from diffusers.loaders import AttnProcsLayers +from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin from diffusers.models.attention_processor import LoRAAttnProcessor from diffusers.optimization import get_scheduler -from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available @@ -58,7 +60,7 @@ logger = get_logger(__name__) -def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_folder=None): +def save_model_card(repo_id: str, images=None, base_model=str, train_text_encoder=False, prompt=str, repo_folder=None): img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) @@ -83,6 +85,8 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_ These are LoRA adaption weights for {base_model}. 
The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). You can find some example images in the following. \n {img_str} + +LoRA for the text encoder was enabled: {train_text_encoder}. """ with open(os.path.join(repo_folder, "README.md"), "w") as f: f.write(yaml + model_card) @@ -219,6 +223,11 @@ def parse_args(input_args=None): " cropped. The images will be resized to the resolution first before cropping." ), ) + parser.add_argument( + "--train_text_encoder", + action="store_true", + help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", + ) parser.add_argument( "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." ) @@ -547,7 +556,13 @@ def main(args): # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. - # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + # TODO (sayakpaul): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -691,7 +706,7 @@ def main(args): # => 32 layers # Set correct lora layers - lora_attn_procs = {} + unet_lora_attn_procs = {} for name in unet.attn_processors.keys(): cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): @@ -703,12 +718,33 @@ def main(args): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - - unet.set_attn_processor(lora_attn_procs) - lora_layers = AttnProcsLayers(unet.attn_processors) + unet_lora_attn_procs[name] = LoRAAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim + ) - accelerator.register_for_checkpointing(lora_layers) + unet.set_attn_processor(unet_lora_attn_procs) + unet_lora_layers = AttnProcsLayers(unet.attn_processors) + accelerator.register_for_checkpointing(unet_lora_layers) + + # The text encoder comes from 🤗 transformers, so we cannot directly modify it. + # So, instead, we monkey-patch the forward calls of its attention-blocks. For this, + # we first load a dummy pipeline with the text encoder and then do the monkey-patching. 
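+    # `text_encoder_lora_layers` stays `None` unless `--train_text_encoder` is passed; the
+    # `LoraLoaderMixin.save_lora_weights` calls below accept `None` for the text-encoder layers.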
+ text_encoder_lora_layers = None + if args.train_text_encoder: + text_lora_attn_procs = {} + for name, module in text_encoder.named_modules(): + if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): + text_lora_attn_procs[name] = LoRAAttnProcessor( + hidden_size=module.out_features, cross_attention_dim=None + ) + text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) + temp_pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, text_encoder=text_encoder + ) + temp_pipeline._modify_text_encoder(text_lora_attn_procs) + text_encoder = temp_pipeline.text_encoder + accelerator.register_for_checkpointing(unet_lora_layers) + del temp_pipeline if args.scale_lr: args.learning_rate = ( @@ -739,8 +775,13 @@ def main(args): optimizer_class = torch.optim.AdamW # Optimizer creation + params_to_optimize = ( + itertools.chain(unet_lora_layers.parameters(), text_encoder_lora_layers.parameters()) + if args.train_text_encoder + else unet_lora_layers.parameters() + ) optimizer = optimizer_class( - lora_layers.parameters(), + params_to_optimize, lr=args.learning_rate, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, @@ -784,9 +825,14 @@ def main(args): ) # Prepare everything with our `accelerator`. - lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - lora_layers, optimizer, train_dataloader, lr_scheduler - ) + if args.train_text_encoder: + unet_lora_layers, text_encoder_lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet_lora_layers, text_encoder_lora_layers, optimizer, train_dataloader, lr_scheduler + ) + else: + unet_lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet_lora_layers, optimizer, train_dataloader, lr_scheduler + ) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) @@ -845,6 +891,8 @@ def main(args): for epoch in range(first_epoch, args.num_train_epochs): unet.train() + if args.train_text_encoder: + text_encoder.train() for step, batch in enumerate(train_dataloader): # Skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: @@ -900,7 +948,11 @@ def main(args): accelerator.backward(loss) if accelerator.sync_gradients: - params_to_clip = lora_layers.parameters() + params_to_clip = ( + itertools.chain(unet_lora_layers.parameters(), text_encoder_lora_layers.parameters()) + if args.train_text_encoder + else unet_lora_layers.parameters() + ) accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() lr_scheduler.step() @@ -914,7 +966,14 @@ def main(args): if global_step % args.checkpointing_steps == 0: if accelerator.is_main_process: save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - accelerator.save_state(save_path) + # We combine the text encoder and UNet LoRA parameters with a simple + # custom logic. `accelerator.save_state()` won't know that. So, + # use `LoraLoaderMixin.save_lora_weights()`. 
+ LoraLoaderMixin.save_lora_weights( + save_directory=save_path, + unet_lora_layers=unet_lora_layers, + text_encoder_lora_layers=text_encoder_lora_layers, + ) logger.info(f"Saved state to {save_path}") logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} @@ -970,7 +1029,12 @@ def main(args): accelerator.wait_for_everyone() if accelerator.is_main_process: unet = unet.to(torch.float32) - unet.save_attn_procs(args.output_dir) + text_encoder = text_encoder.to(torch.float32) + LoraLoaderMixin.save_lora_weights( + save_directory=args.output_dir, + unet_lora_layers=unet_lora_layers, + text_encoder_lora_layers=text_encoder_lora_layers, + ) # Final inference # Load previous pipeline @@ -981,7 +1045,7 @@ def main(args): pipeline = pipeline.to(accelerator.device) # load attention processors - pipeline.unet.load_attn_procs(args.output_dir) + pipeline.load_attn_procs(args.output_dir) # run inference if args.validation_prompt and args.num_validation_images > 0: @@ -1010,6 +1074,7 @@ def main(args): repo_id, images=images, base_model=args.pretrained_model_name_or_path, + train_text_encoder=args.train_text_encoder, prompt=args.instance_prompt, repo_folder=args.output_dir, ) diff --git a/examples/test_examples.py b/examples/test_examples.py index a77fa4c7da23..238dc49d729f 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -23,6 +23,7 @@ import unittest from typing import List +import torch from accelerate.utils import write_basic_config from diffusers import DiffusionPipeline, UNet2DConditionModel @@ -221,6 +222,68 @@ def test_dreambooth_checkpointing(self): self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4"))) self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6"))) + def test_dreambooth_lora(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth_lora.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir docs/source/en/imgs + --instance_prompt photo + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.bin"))) + + # make sure the state_dict has the correct naming in the parameters. + lora_state_dict = torch.load(os.path.join(tmpdir, "pytorch_lora_weights.bin")) + is_lora = all("lora" in k for k in lora_state_dict.keys()) + self.assertTrue(is_lora) + + # when not training the text encoder, all the parameters in the state dict should start + # with `"unet"` in their names. 
+ starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys()) + self.assertTrue(starts_with_unet) + + def test_dreambooth_lora_with_text_encoder(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth_lora.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir docs/source/en/imgs + --instance_prompt photo + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --train_text_encoder + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.bin"))) + + # the names of the keys of the state dict should either start with `unet` + # or `text_encoder`. + lora_state_dict = torch.load(os.path.join(tmpdir, "pytorch_lora_weights.bin")) + keys = lora_state_dict.keys() + is_correct_naming = all(k.startswith("unet") or k.startswith("text_encoder") for k in keys) + self.assertTrue(is_correct_naming) + def test_custom_diffusion(self): with tempfile.TemporaryDirectory() as tmpdir: test_args = f""" From 9bce375f77d8d4de88535c651c64aff057c33545 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 21 Apr 2023 18:24:43 +0200 Subject: [PATCH 40/71] Update Habana Gaudi documentation (#3169) * Update Habana Gaudi doc * Fix tables --- docs/source/en/optimization/habana.mdx | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/source/en/optimization/habana.mdx b/docs/source/en/optimization/habana.mdx index a5f476b0cef2..7092c89352db 100644 --- a/docs/source/en/optimization/habana.mdx +++ b/docs/source/en/optimization/habana.mdx @@ -16,8 +16,8 @@ specific language governing permissions and limitations under the License. ## Requirements -- Optimum Habana 1.4 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it. -- SynapseAI 1.8. +- Optimum Habana 1.5 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it. +- SynapseAI 1.9. 
## Inference Pipeline @@ -64,7 +64,16 @@ For more information, check out Optimum Habana's [documentation](https://hugging Here are the latencies for Habana first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) Gaudi configuration (mixed precision bf16/fp32): +- [Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5) (512x512 resolution): + | | Latency (batch size = 1) | Throughput (batch size = 8) | | ---------------------- |:------------------------:|:---------------------------:| -| first-generation Gaudi | 4.29s | 0.283 images/s | -| Gaudi2 | 1.54s | 0.904 images/s | +| first-generation Gaudi | 4.22s | 0.29 images/s | +| Gaudi2 | 1.70s | 0.925 images/s | + +- [Stable Diffusion v2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (768x768 resolution): + +| | Latency (batch size = 1) | Throughput | +| ---------------------- |:------------------------:|:-------------------------------:| +| first-generation Gaudi | 23.3s | 0.045 images/s (batch size = 2) | +| Gaudi2 | 7.75s | 0.14 images/s (batch size = 5) | From 9c856118c72dca9cae194648492b4284a254386c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 21 Apr 2023 18:47:33 +0200 Subject: [PATCH 41/71] Add model offload to x4 upscaler (#3187) * Add model offload to x4 upscaler * fix --- .../pipeline_stable_diffusion_upscale.py | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index c0086b32d6fd..693208b18cdd 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -23,7 +23,7 @@ from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -129,10 +129,36 @@ def enable_sequential_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") - for cpu_offloaded_model in [self.unet, self.text_encoder]: + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
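+
+        A minimal usage sketch (the checkpoint name below is only illustrative; `accelerate>=0.17.0` must be
+        installed for this method to be available):
+
+        ```py
+        import torch
+        from diffusers import StableDiffusionUpscalePipeline
+
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(
+            "stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()  # sub-models are moved to the GPU one at a time when their forward is called
+        ```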
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + if cpu_offloaded_model is not None: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + @property # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device def _execution_device(self): @@ -647,6 +673,10 @@ def __call__( self.vae.to(dtype=torch.float32) image = self.decode_latents(latents.float()) + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + # 11. Convert to PIL if output_type == "pil": image = self.numpy_to_pil(image) From 2f6351b0015a4cd610a054f973b4f75d65c83531 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 21 Apr 2023 10:38:34 -0700 Subject: [PATCH 42/71] [docs] Deterministic algorithms (#3172) deterministic algos --- .../en/using-diffusers/reproducibility.mdx | 47 +++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/docs/source/en/using-diffusers/reproducibility.mdx b/docs/source/en/using-diffusers/reproducibility.mdx index 35191c139289..5bef10bfe190 100644 --- a/docs/source/en/using-diffusers/reproducibility.mdx +++ b/docs/source/en/using-diffusers/reproducibility.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. Reproducibility is important for testing, replicating results, and can even be used to [improve image quality](reusing_seeds). However, the randomness in diffusion models is a desired property because it allows the pipeline to generate different images every time it is run. While you can't expect to get the exact same results across platforms, you can expect results to be reproducible across releases and platforms within a certain tolerance range. Even then, tolerance varies depending on the diffusion pipeline and checkpoint. -This is why it's important to understand how to control sources of randomness in diffusion models. +This is why it's important to understand how to control sources of randomness in diffusion models or use deterministic algorithms. @@ -24,7 +24,7 @@ This is why it's important to understand how to control sources of randomness in -## Inference +## Control randomness During inference, pipelines rely heavily on random sampling operations which include creating the Gaussian noise tensors to denoise and adding noise to the scheduling step. @@ -147,5 +147,46 @@ susceptible to precision error propagation. Don't expect similar results across different GPU hardware or PyTorch versions. In this case, you'll need to run exactly the same hardware and PyTorch version for full reproducibility. -## randn_tensor +### randn_tensor [[autodoc]] diffusers.utils.randn_tensor + +## Deterministic algorithms + +You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. 
However, you should be aware that deterministic algorithms may be slower than nondeterministic ones and you may observe a decrease in performance. But if reproducibility is important to you, then this is the way to go!
+
+Nondeterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [`CUBLAS_WORKSPACE_CONFIG`](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime.
+
+PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Lastly, pass `True` to [`torch.use_deterministic_algorithms`](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html) to enable deterministic algorithms.
+
+```py
+import os
+import torch
+
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+
+torch.backends.cudnn.benchmark = False
+torch.use_deterministic_algorithms(True)
+```
+
+Now when you run the same pipeline twice, you'll get identical results.
+
+```py
+import torch
+from diffusers import DDIMScheduler, StableDiffusionPipeline
+import numpy as np
+
+model_id = "runwayml/stable-diffusion-v1-5"
+pipe = StableDiffusionPipeline.from_pretrained(model_id).to("cuda")
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+g = torch.Generator(device="cuda")
+
+prompt = "A bear is playing a guitar on Times Square"
+
+g.manual_seed(0)
+result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
+
+g.manual_seed(0)
+result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
+
+print("L_inf dist = ", abs(result1 - result2).max())
+"L_inf dist = tensor(0., device='cuda:0')"
+```
\ No newline at end of file

From e573ae06e2bf5aa632d9d78ce3c4c1374741287d Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Fri, 21 Apr 2023 23:14:08 +0530
Subject: [PATCH 43/71] Update custom_diffusion.mdx to credit the author (#3163)

* Update custom_diffusion.mdx

* fix: unnecessary list comprehension.
---
 docs/source/en/training/custom_diffusion.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx
index 245d434adeda..08604f101ea2 100644
--- a/docs/source/en/training/custom_diffusion.mdx
+++ b/docs/source/en/training/custom_diffusion.mdx
@@ -15,6 +15,8 @@ specific language governing permissions and limitations under the License.
 
 [Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject.
 The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
 
+This training example was contributed by [Nupur Kumari](https://nupurkmr9.github.io/) (one of the authors of Custom Diffusion).
+ ## Running locally with PyTorch ### Installing the dependencies From 05d9baeacd531dc66680d974ec234940e0088d58 Mon Sep 17 00:00:00 2001 From: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Date: Fri, 21 Apr 2023 10:53:10 -0700 Subject: [PATCH 44/71] Fix TensorRT community pipeline device set function (#3157) pass silence_dtype_warnings as kwarg Signed-off-by: Asfiya Baig Co-authored-by: Patrick von Platen --- examples/community/stable_diffusion_tensorrt_txt2img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/community/stable_diffusion_tensorrt_txt2img.py b/examples/community/stable_diffusion_tensorrt_txt2img.py index 7aef2bec743f..aa7b5c12313b 100644 --- a/examples/community/stable_diffusion_tensorrt_txt2img.py +++ b/examples/community/stable_diffusion_tensorrt_txt2img.py @@ -703,7 +703,7 @@ def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os ) def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False): - super().to(torch_device, silence_dtype_warnings) + super().to(torch_device, silence_dtype_warnings=silence_dtype_warnings) self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir) self.engine_dir = os.path.join(self.cached_folder, self.engine_dir) From bc0392a0cbac301474ef82eed5818d2030a4fc4c Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 21 Apr 2023 08:01:36 -1000 Subject: [PATCH 45/71] make `from_flax` work for controlnet (#3161) fix from_flax Co-authored-by: Patrick von Platen --- src/diffusers/models/modeling_pytorch_flax_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/diffusers/models/modeling_pytorch_flax_utils.py b/src/diffusers/models/modeling_pytorch_flax_utils.py index b368a74ca299..17b521b00145 100644 --- a/src/diffusers/models/modeling_pytorch_flax_utils.py +++ b/src/diffusers/models/modeling_pytorch_flax_utils.py @@ -110,6 +110,12 @@ def load_flax_weights_in_pytorch_model(pt_model, flax_state): .replace("_1", ".1") .replace("_2", ".2") .replace("_3", ".3") + .replace("_4", ".4") + .replace("_5", ".5") + .replace("_6", ".6") + .replace("_7", ".7") + .replace("_8", ".8") + .replace("_9", ".9") ) flax_key = ".".join(flax_key_tuple_array) From 391cfcd7d7e3df50ba30b3771c4347848ff0b2e1 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 21 Apr 2023 11:03:44 -0700 Subject: [PATCH 46/71] [docs] Clarify training args (#3146) * clarify training arg * apply feedback --- docs/source/en/training/controlnet.mdx | 1 + docs/source/en/training/dreambooth.mdx | 47 ++++++++++++--------- docs/source/en/training/instructpix2pix.mdx | 3 +- docs/source/en/training/lora.mdx | 8 +++- docs/source/en/training/text2image.mdx | 6 ++- docs/source/en/training/text_inversion.mdx | 23 +++++++--- 6 files changed, 58 insertions(+), 30 deletions(-) diff --git a/docs/source/en/training/controlnet.mdx b/docs/source/en/training/controlnet.mdx index 7a5454107b83..94e3d969b80a 100644 --- a/docs/source/en/training/controlnet.mdx +++ b/docs/source/en/training/controlnet.mdx @@ -74,6 +74,7 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png ``` +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the 
[`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. ```bash export MODEL_DIR="runwayml/stable-diffusion-v1-5" diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx index 88ded0e009dc..c5a5a047d114 100644 --- a/docs/source/en/training/dreambooth.mdx +++ b/docs/source/en/training/dreambooth.mdx @@ -50,6 +50,20 @@ from accelerate.utils import write_basic_config write_basic_config() ``` +Finally, download a [few images of a dog](https://huggingface.co/datasets/diffusers/dog-example) to DreamBooth with: + +```py +from huggingface_hub import snapshot_download + +local_dir = "./dog" +snapshot_download( + "diffusers/dog-example", + local_dir=local_dir, + repo_type="dataset", + ignore_patterns=".gitattributes", +) +``` + ## Finetuning @@ -60,22 +74,13 @@ DreamBooth finetuning is very sensitive to hyperparameters and easy to overfit. -Let's try DreamBooth with a -[few images of a dog](https://huggingface.co/datasets/diffusers/dog-example); -download and save them to a directory and then set the `INSTANCE_DIR` environment variable to that path: +Set the `INSTANCE_DIR` environment variable to the path of the directory containing the dog images. -```python -local_dir = "./path_to_training_images" -snapshot_download( - "diffusers/dog-example", - local_dir=local_dir, repo_type="dataset", - ignore_patterns=".gitattributes", -) -``` +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export OUTPUT_DIR="path_to_saved_model" ``` @@ -105,11 +110,13 @@ Before running the script, make sure you have the requirements installed: pip install -U -r requirements.txt ``` +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. 
+ Now you can launch the training script with the following command: ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="./dog" export OUTPUT_DIR="path-to-save-model" python train_dreambooth_flax.py \ @@ -135,7 +142,7 @@ The authors recommend generating `num_epochs * num_samples` images for prior pre ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path_to_class_images" export OUTPUT_DIR="path_to_saved_model" @@ -160,7 +167,7 @@ accelerate launch train_dreambooth.py \ ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -197,7 +204,7 @@ Pass the `--train_text_encoder` argument to the training script to enable finetu ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path_to_class_images" export OUTPUT_DIR="path_to_saved_model" @@ -224,7 +231,7 @@ accelerate launch train_dreambooth.py \ ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -360,7 +367,7 @@ Then pass the `--use_8bit_adam` option to the training script: ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path_to_class_images" export OUTPUT_DIR="path_to_saved_model" @@ -389,7 +396,7 @@ To run DreamBooth on a 12GB GPU, you'll need to enable gradient checkpointing, t ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -436,7 +443,7 @@ Launch training with the following command: ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path_to_class_images" export OUTPUT_DIR="path_to_saved_model" diff --git a/docs/source/en/training/instructpix2pix.mdx b/docs/source/en/training/instructpix2pix.mdx index c485db6d6b20..ff34ec335656 100644 --- a/docs/source/en/training/instructpix2pix.mdx +++ b/docs/source/en/training/instructpix2pix.mdx @@ -74,8 +74,7 @@ write_basic_config() As mentioned before, we'll use a [small toy dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples) for training. The dataset is a smaller version of the [original dataset](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) used in the InstructPix2Pix paper. -Configure environment variables such as the dataset identifier and the Stable Diffusion -checkpoint: +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. 
You'll also need to specify the dataset name in `DATASET_ID`: ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx index ac2311df9f1e..7e3c3c0b2b68 100644 --- a/docs/source/en/training/lora.mdx +++ b/docs/source/en/training/lora.mdx @@ -52,7 +52,9 @@ Finetuning a model like Stable Diffusion, which has billions of parameters, can Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. -To start, make sure you have the `MODEL_NAME` and `DATASET_NAME` environment variables set. The `OUTPUT_DIR` and `HUB_MODEL_ID` variables are optional and specify where to save the model to on the Hub: +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. You'll also need to set the `DATASET_NAME` environment variable to the name of the dataset you want to train on. + +The `OUTPUT_DIR` and `HUB_MODEL_ID` variables are optional and specify where to save the model to on the Hub: ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" @@ -140,7 +142,9 @@ Load the LoRA weights from your finetuned model *on top of the base model weight Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) with DreamBooth and LoRA with some 🐶 [dog images](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ). Download and save these images to a directory. -To start, make sure you have the `MODEL_NAME` and `INSTANCE_DIR` (path to directory containing images) environment variables set. The `OUTPUT_DIR` variables is optional and specifies where to save the model to on the Hub: +To start, specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. You'll also need to set `INSTANCE_DIR` to the path of the directory containing the images. + +The `OUTPUT_DIR` variables is optional and specifies where to save the model to on the Hub: ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" diff --git a/docs/source/en/training/text2image.mdx b/docs/source/en/training/text2image.mdx index 70f8c003a787..dabb68397f78 100644 --- a/docs/source/en/training/text2image.mdx +++ b/docs/source/en/training/text2image.mdx @@ -72,7 +72,9 @@ To load a checkpoint to resume training, pass the argument `--resume_from_checkp -Launch the [PyTorch training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) for a fine-tuning run on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset like this: +Launch the [PyTorch training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) for a fine-tuning run on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset like this. 
+ +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. {"path": "../../../../examples/text_to_image/README.md", @@ -141,6 +143,8 @@ Before running the script, make sure you have the requirements installed: pip install -U -r requirements_flax.txt ``` +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. + Now you can launch the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_flax.py) like this: ```bash diff --git a/docs/source/en/training/text_inversion.mdx b/docs/source/en/training/text_inversion.mdx index 6e6971d7f119..e47a0519c704 100644 --- a/docs/source/en/training/text_inversion.mdx +++ b/docs/source/en/training/text_inversion.mdx @@ -1,4 +1,4 @@ - + +# IF + +## Overview + +DeepFloyd IF is a novel state-of-the-art open-source text-to-image model with a high degree of photorealism and language understanding. +The model is a modular composed of a frozen text encoder and three cascaded pixel diffusion modules: +- Stage 1: a base model that generates 64x64 px image based on text prompt, +- Stage 2: a 64x64 px => 256x256 px super-resolution model, and a +- Stage 3: a 256x256 px => 1024x1024 px super-resolution model +Stage 1 and Stage 2 utilize a frozen text encoder based on the T5 transformer to extract text embeddings, +which are then fed into a UNet architecture enhanced with cross-attention and attention pooling. +Stage 3 is [Stability's x4 Upscaling model](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler). +The result is a highly efficient model that outperforms current state-of-the-art models, achieving a zero-shot FID score of 6.66 on the COCO dataset. +Our work underscores the potential of larger UNet architectures in the first stage of cascaded diffusion models and depicts a promising future for text-to-image synthesis. + +## Usage + +Before you can use IF, you need to accept its usage conditions. To do so: +1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be loggin in +2. Accept the license on the model card of [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0) and [DeepFloyd/IF-II-L-v1.0](https://huggingface.co/DeepFloyd/IF-II-L-v1.0) +3. Make sure to login locally. Install `huggingface_hub` +```sh +pip install huggingface_hub --upgrade +``` + +run the login function in a Python shell + +```py +from huggingface_hub import login + +login() +``` + +and enter your [Hugging Face Hub access token](https://huggingface.co/docs/hub/security-tokens#what-are-user-access-tokens). + +Next we install `diffusers` and dependencies: + +```sh +pip install diffusers accelerate transformers safetensors +``` + +The following sections give more in-detail examples of how to use IF. 
Specifically:
+
+- [Text-to-Image Generation](#text-to-image-generation)
+- [Image-to-Image Generation](#text-guided-image-to-image-generation)
+- [Inpainting](#text-guided-inpainting-generation)
+- [Reusing model weights](#converting-between-different-pipelines)
+- [Speed optimization](#optimizing-for-speed)
+- [Memory optimization](#optimizing-for-memory)
+
+**Available checkpoints**
+- *Stage-1*
+  - [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0)
+  - [DeepFloyd/IF-I-L-v1.0](https://huggingface.co/DeepFloyd/IF-I-L-v1.0)
+  - [DeepFloyd/IF-I-M-v1.0](https://huggingface.co/DeepFloyd/IF-I-M-v1.0)
+
+- *Stage-2*
+  - [DeepFloyd/IF-II-L-v1.0](https://huggingface.co/DeepFloyd/IF-II-L-v1.0)
+  - [DeepFloyd/IF-II-M-v1.0](https://huggingface.co/DeepFloyd/IF-II-M-v1.0)
+
+- *Stage-3*
+  - [stabilityai/stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler)
+
+**Demo**
+[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/DeepFloyd/IF)
+
+**Google Colab**
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb)
+
+### Text-to-Image Generation
+
+By default diffusers makes use of [model cpu offloading](https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings)
+to run the whole IF pipeline with as little as 14 GB of VRAM.
+
+```python
+from diffusers import DiffusionPipeline
+from diffusers.utils import pt_to_pil
+import torch
+
+# stage 1
+stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+stage_1.enable_model_cpu_offload()
+
+# stage 2
+stage_2 = DiffusionPipeline.from_pretrained(
+    "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
+)
+stage_2.enable_model_cpu_offload()
+
+# stage 3
+safety_modules = {
+    "feature_extractor": stage_1.feature_extractor,
+    "safety_checker": stage_1.safety_checker,
+    "watermarker": stage_1.watermarker,
+}
+stage_3 = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16
+)
+stage_3.enable_model_cpu_offload()
+
+prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
+generator = torch.manual_seed(1)
+
+# text embeds
+prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt)
+
+# stage 1
+image = stage_1(
+    prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, generator=generator, output_type="pt"
+).images
+pt_to_pil(image)[0].save("./if_stage_I.png")
+
+# stage 2
+image = stage_2(
+    image=image,
+    prompt_embeds=prompt_embeds,
+    negative_prompt_embeds=negative_embeds,
+    generator=generator,
+    output_type="pt",
+).images
+pt_to_pil(image)[0].save("./if_stage_II.png")
+
+# stage 3
+image = stage_3(prompt=prompt, image=image, noise_level=100, generator=generator).images
+image[0].save("./if_stage_III.png")
+```
+
+### Text Guided Image-to-Image Generation
+
+The same IF model weights can be used for text-guided image-to-image translation or image variation.
+In this case just make sure to load the weights using the [`IFImg2ImgPipeline`] and [`IFImg2ImgSuperResolutionPipeline`] pipelines.
+ +**Note**: You can also directly move the weights of the text-to-image pipelines to the image-to-image pipelines +without loading them twice by making use of the [`~DiffusionPipeline.components()`] function as explained [here](#converting-between-different-pipelines). + +```python +from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline +from diffusers.utils import pt_to_pil + +import torch + +from PIL import Image +import requests +from io import BytesIO + +# download image +url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" +response = requests.get(url) +original_image = Image.open(BytesIO(response.content)).convert("RGB") +original_image = original_image.resize((768, 512)) + +# stage 1 +stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1.enable_model_cpu_offload() + +# stage 2 +stage_2 = IFImg2ImgSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 +) +stage_2.enable_model_cpu_offload() + +# stage 3 +safety_modules = { + "feature_extractor": stage_1.feature_extractor, + "safety_checker": stage_1.safety_checker, + "watermarker": stage_1.watermarker, +} +stage_3 = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16 +) +stage_3.enable_model_cpu_offload() + +prompt = "A fantasy landscape in style minecraft" +generator = torch.manual_seed(1) + +# text embeds +prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt) + +# stage 1 +image = stage_1( + image=original_image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + generator=generator, + output_type="pt", +).images +pt_to_pil(image)[0].save("./if_stage_I.png") + +# stage 2 +image = stage_2( + image=image, + original_image=original_image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + generator=generator, + output_type="pt", +).images +pt_to_pil(image)[0].save("./if_stage_II.png") + +# stage 3 +image = stage_3(prompt=prompt, image=image, generator=generator, noise_level=100).images +image[0].save("./if_stage_III.png") +``` + +### Text Guided Inpainting Generation + +The same IF model weights can be used for text-guided image-to-image translation or image variation. +In this case just make sure to load the weights using the [`IFInpaintingPipeline`] and [`IFInpaintingSuperResolutionPipeline`] pipelines. + +**Note**: You can also directly move the weights of the text-to-image pipelines to the image-to-image pipelines +without loading them twice by making use of the [`~DiffusionPipeline.components()`] function as explained [here](#converting-between-different-pipelines). 
+ +```python +from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline +from diffusers.utils import pt_to_pil +import torch + +from PIL import Image +import requests +from io import BytesIO + +# download image +url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" +response = requests.get(url) +original_image = Image.open(BytesIO(response.content)).convert("RGB") +original_image = original_image + +# download mask +url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" +response = requests.get(url) +mask_image = Image.open(BytesIO(response.content)) +mask_image = mask_image + +# stage 1 +stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1.enable_model_cpu_offload() + +# stage 2 +stage_2 = IFInpaintingSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 +) +stage_2.enable_model_cpu_offload() + +# stage 3 +safety_modules = { + "feature_extractor": stage_1.feature_extractor, + "safety_checker": stage_1.safety_checker, + "watermarker": stage_1.watermarker, +} +stage_3 = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16 +) +stage_3.enable_model_cpu_offload() + +prompt = "blue sunglasses" +generator = torch.manual_seed(1) + +# text embeds +prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt) + +# stage 1 +image = stage_1( + image=original_image, + mask_image=mask_image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + generator=generator, + output_type="pt", +).images +pt_to_pil(image)[0].save("./if_stage_I.png") + +# stage 2 +image = stage_2( + image=image, + original_image=original_image, + mask_image=mask_image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + generator=generator, + output_type="pt", +).images +pt_to_pil(image)[0].save("./if_stage_II.png") + +# stage 3 +image = stage_3(prompt=prompt, image=image, generator=generator, noise_level=100).images +image[0].save("./if_stage_III.png") +``` + +### Converting between different pipelines + +In addition to being loaded with `from_pretrained`, Pipelines can also be loaded directly from each other. + +```python +from diffusers import IFPipeline, IFSuperResolutionPipeline + +pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0") +pipe_2 = IFSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0") + + +from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline + +pipe_1 = IFImg2ImgPipeline(**pipe_1.components) +pipe_2 = IFImg2ImgSuperResolutionPipeline(**pipe_2.components) + + +from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline + +pipe_1 = IFInpaintingPipeline(**pipe_1.components) +pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components) +``` + +### Optimizing for speed + +The simplest optimization to run IF faster is to move all model components to the GPU. + +```py +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.to("cuda") +``` + +You can also run the diffusion process for a shorter number of timesteps. 
+ +This can either be done with the `num_inference_steps` argument + +```py +pipe("", num_inference_steps=30) +``` + +Or with the `timesteps` argument + +```py +from diffusers.pipelines.deepfloyd_if import fast27_timesteps + +pipe("", timesteps=fast27_timesteps) +``` + +When doing image variation or inpainting, you can also decrease the number of timesteps +with the strength argument. The strength argument is the amount of noise to add to +the input image which also determines how many steps to run in the denoising process. +A smaller number will vary the image less but run faster. + +```py +pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.to("cuda") + +image = pipe(image=image, prompt="", strength=0.3).images +``` + +You can also use [`torch.compile`](../../optimization/torch2.0). Note that we have not exhaustively tested `torch.compile` +with IF and it might not give expected results. + +```py +import torch + +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.to("cuda") + +pipe.text_encoder = torch.compile(pipe.text_encoder) +pipe.unet = torch.compile(pipe.unet) +``` + +### Optimizing for memory + +When optimizing for GPU memory, we can use the standard diffusers cpu offloading APIs. + +Either the model based CPU offloading, + +```py +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.enable_model_cpu_offload() +``` + +or the more aggressive layer based CPU offloading. + +```py +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.enable_sequential_cpu_offload() +``` + +Additionally, T5 can be loaded in 8bit precision + +```py +from transformers import T5EncoderModel + +text_encoder = T5EncoderModel.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" +) + +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", + text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder + unet=None, + device_map="auto", +) + +prompt_embeds, negative_embeds = pipe.encode_prompt("") +``` + +For CPU RAM constrained machines like google colab free tier where we can't load all +model components to the CPU at once, we can manually only load the pipeline with +the text encoder or unet when the respective model components are needed. 
+ +```py +from diffusers import IFPipeline, IFSuperResolutionPipeline +import torch +import gc +from transformers import T5EncoderModel +from diffusers.utils import pt_to_pil + +text_encoder = T5EncoderModel.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" +) + +# text to image + +pipe = DiffusionPipeline.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", + text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder + unet=None, + device_map="auto", +) + +prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' +prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + +# Remove the pipeline so we can re-load the pipeline with the unet +del text_encoder +del pipe +gc.collect() +torch.cuda.empty_cache() + +pipe = IFPipeline.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto" +) + +generator = torch.Generator().manual_seed(0) +image = pipe( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + output_type="pt", + generator=generator, +).images + +pt_to_pil(image)[0].save("./if_stage_I.png") + +# Remove the pipeline so we can load the super-resolution pipeline +del pipe +gc.collect() +torch.cuda.empty_cache() + +# First super resolution + +pipe = IFSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto" +) + +generator = torch.Generator().manual_seed(0) +image = pipe( + image=image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + output_type="pt", + generator=generator, +).images + +pt_to_pil(image)[0].save("./if_stage_II.png") +``` + + +## Available Pipelines: + +| Pipeline | Tasks | Colab +|---|---|:---:| +| [pipeline_if.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py) | *Text-to-Image Generation* | - | +| [pipeline_if_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py) | *Text-to-Image Generation* | - | +| [pipeline_if_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py) | *Image-to-Image Generation* | - | +| [pipeline_if_img2img_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py) | *Image-to-Image Generation* | - | +| [pipeline_if_inpainting.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py) | *Image-to-Image Generation* | - | +| [pipeline_if_inpainting_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py) | *Image-to-Image Generation* | - | + +## IFPipeline +[[autodoc]] IFPipeline + - all + - __call__ + +## IFSuperResolutionPipeline +[[autodoc]] IFSuperResolutionPipeline + - all + - __call__ + +## IFImg2ImgPipeline +[[autodoc]] IFImg2ImgPipeline + - all + - __call__ + +## IFImg2ImgSuperResolutionPipeline +[[autodoc]] IFImg2ImgSuperResolutionPipeline + - all + - __call__ + +## IFInpaintingPipeline +[[autodoc]] IFInpaintingPipeline + - all + - __call__ + +## IFInpaintingSuperResolutionPipeline 
+[[autodoc]] IFInpaintingSuperResolutionPipeline + - all + - __call__ diff --git a/docs/source/en/api/pipelines/overview.mdx b/docs/source/en/api/pipelines/overview.mdx index 3c5331955513..91716784f8fe 100644 --- a/docs/source/en/api/pipelines/overview.mdx +++ b/docs/source/en/api/pipelines/overview.mdx @@ -51,6 +51,9 @@ available a colab notebook to directly try them out. | [dance_diffusion](./dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | | [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | +| [if](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) +| [if_img2img](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) +| [if_inpainting](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) | [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | | [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | | [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 10a237f29278..46a985ac2f8d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -58,6 +58,9 @@ The library has three main components: | [dance_diffusion](./api/pipelines/dance_diffusion) | [Dance Diffusion](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./api/pipelines/ddpm) | [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | | [ddim](./api/pipelines/ddim) | [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | +| [if](./if) | [**IF**](./api/pipelines/if) | Image Generation | +| [if_img2img](./if) | [**IF**](./api/pipelines/if) | Image-to-Image Generation | +| [if_inpainting](./if) | [**IF**](./api/pipelines/if) | Image-to-Image Generation | | [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | | [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | | 
[latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | diff --git a/scripts/convert_if.py b/scripts/convert_if.py new file mode 100644 index 000000000000..66d7f694c8e1 --- /dev/null +++ b/scripts/convert_if.py @@ -0,0 +1,1257 @@ +import argparse +import inspect +import os + +import numpy as np +import torch +from torch.nn import functional as F +from transformers import CLIPConfig, CLIPImageProcessor, CLIPVisionModelWithProjection, T5EncoderModel, T5Tokenizer + +from diffusers import DDPMScheduler, IFPipeline, IFSuperResolutionPipeline, UNet2DConditionModel +from diffusers.pipelines.deepfloyd_if.safety_checker import IFSafetyChecker + + +try: + from omegaconf import OmegaConf +except ImportError: + raise ImportError( + "OmegaConf is required to convert the IF checkpoints. Please install it with `pip install" " OmegaConf`." + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--dump_path", required=False, default=None, type=str) + + parser.add_argument("--dump_path_stage_2", required=False, default=None, type=str) + + parser.add_argument("--dump_path_stage_3", required=False, default=None, type=str) + + parser.add_argument("--unet_config", required=False, default=None, type=str, help="Path to unet config file") + + parser.add_argument( + "--unet_checkpoint_path", required=False, default=None, type=str, help="Path to unet checkpoint file" + ) + + parser.add_argument( + "--unet_checkpoint_path_stage_2", + required=False, + default=None, + type=str, + help="Path to stage 2 unet checkpoint file", + ) + + parser.add_argument( + "--unet_checkpoint_path_stage_3", + required=False, + default=None, + type=str, + help="Path to stage 3 unet checkpoint file", + ) + + parser.add_argument("--p_head_path", type=str, required=True) + + parser.add_argument("--w_head_path", type=str, required=True) + + args = parser.parse_args() + + return args + + +def main(args): + tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl") + text_encoder = T5EncoderModel.from_pretrained("google/t5-v1_1-xxl") + + feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") + safety_checker = convert_safety_checker(p_head_path=args.p_head_path, w_head_path=args.w_head_path) + + if args.unet_config is not None and args.unet_checkpoint_path is not None and args.dump_path is not None: + convert_stage_1_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args) + + if args.unet_checkpoint_path_stage_2 is not None and args.dump_path_stage_2 is not None: + convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage=2) + + if args.unet_checkpoint_path_stage_3 is not None and args.dump_path_stage_3 is not None: + convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage=3) + + +def convert_stage_1_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args): + unet = get_stage_1_unet(args.unet_config, args.unet_checkpoint_path) + + scheduler = DDPMScheduler( + variance_type="learned_range", + beta_schedule="squaredcos_cap_v2", + prediction_type="epsilon", + thresholding=True, + dynamic_thresholding_ratio=0.95, + sample_max_value=1.5, + ) + + pipe = IFPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, 
+ requires_safety_checker=True, + ) + + pipe.save_pretrained(args.dump_path) + + +def convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage): + if stage == 2: + unet_checkpoint_path = args.unet_checkpoint_path_stage_2 + sample_size = None + dump_path = args.dump_path_stage_2 + elif stage == 3: + unet_checkpoint_path = args.unet_checkpoint_path_stage_3 + sample_size = 1024 + dump_path = args.dump_path_stage_3 + else: + assert False + + unet = get_super_res_unet(unet_checkpoint_path, verify_param_count=False, sample_size=sample_size) + + image_noising_scheduler = DDPMScheduler( + beta_schedule="squaredcos_cap_v2", + ) + + scheduler = DDPMScheduler( + variance_type="learned_range", + beta_schedule="squaredcos_cap_v2", + prediction_type="epsilon", + thresholding=True, + dynamic_thresholding_ratio=0.95, + sample_max_value=1.0, + ) + + pipe = IFSuperResolutionPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + requires_safety_checker=True, + ) + + pipe.save_pretrained(dump_path) + + +def get_stage_1_unet(unet_config, unet_checkpoint_path): + original_unet_config = OmegaConf.load(unet_config) + original_unet_config = original_unet_config.params + + unet_diffusers_config = create_unet_diffusers_config(original_unet_config) + + unet = UNet2DConditionModel(**unet_diffusers_config) + + device = "cuda" if torch.cuda.is_available() else "cpu" + unet_checkpoint = torch.load(unet_checkpoint_path, map_location=device) + + converted_unet_checkpoint = convert_ldm_unet_checkpoint( + unet_checkpoint, unet_diffusers_config, path=unet_checkpoint_path + ) + + unet.load_state_dict(converted_unet_checkpoint) + + return unet + + +def convert_safety_checker(p_head_path, w_head_path): + state_dict = {} + + # p head + + p_head = np.load(p_head_path) + + p_head_weights = p_head["weights"] + p_head_weights = torch.from_numpy(p_head_weights) + p_head_weights = p_head_weights.unsqueeze(0) + + p_head_biases = p_head["biases"] + p_head_biases = torch.from_numpy(p_head_biases) + p_head_biases = p_head_biases.unsqueeze(0) + + state_dict["p_head.weight"] = p_head_weights + state_dict["p_head.bias"] = p_head_biases + + # w head + + w_head = np.load(w_head_path) + + w_head_weights = w_head["weights"] + w_head_weights = torch.from_numpy(w_head_weights) + w_head_weights = w_head_weights.unsqueeze(0) + + w_head_biases = w_head["biases"] + w_head_biases = torch.from_numpy(w_head_biases) + w_head_biases = w_head_biases.unsqueeze(0) + + state_dict["w_head.weight"] = w_head_weights + state_dict["w_head.bias"] = w_head_biases + + # vision model + + vision_model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") + vision_model_state_dict = vision_model.state_dict() + + for key, value in vision_model_state_dict.items(): + key = f"vision_model.{key}" + state_dict[key] = value + + # full model + + config = CLIPConfig.from_pretrained("openai/clip-vit-large-patch14") + safety_checker = IFSafetyChecker(config) + + safety_checker.load_state_dict(state_dict) + + return safety_checker + + +def create_unet_diffusers_config(original_unet_config, class_embed_type=None): + attention_resolutions = parse_list(original_unet_config.attention_resolutions) + attention_resolutions = [original_unet_config.image_size // int(res) for res in attention_resolutions] + + channel_mult = 
parse_list(original_unet_config.channel_mult) + block_out_channels = [original_unet_config.model_channels * mult for mult in channel_mult] + + down_block_types = [] + resolution = 1 + + for i in range(len(block_out_channels)): + if resolution in attention_resolutions: + block_type = "SimpleCrossAttnDownBlock2D" + elif original_unet_config.resblock_updown: + block_type = "ResnetDownsampleBlock2D" + else: + block_type = "DownBlock2D" + + down_block_types.append(block_type) + + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + if resolution in attention_resolutions: + block_type = "SimpleCrossAttnUpBlock2D" + elif original_unet_config.resblock_updown: + block_type = "ResnetUpsampleBlock2D" + else: + block_type = "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + head_dim = original_unet_config.num_head_channels + + use_linear_projection = ( + original_unet_config.use_linear_in_transformer + if "use_linear_in_transformer" in original_unet_config + else False + ) + if use_linear_projection: + # stable diffusion 2-base-512 and 2-768 + if head_dim is None: + head_dim = [5, 10, 20, 20] + + projection_class_embeddings_input_dim = None + + if class_embed_type is None: + if "num_classes" in original_unet_config: + if original_unet_config.num_classes == "sequential": + class_embed_type = "projection" + assert "adm_in_channels" in original_unet_config + projection_class_embeddings_input_dim = original_unet_config.adm_in_channels + else: + raise NotImplementedError( + f"Unknown conditional unet num_classes config: {original_unet_config.num_classes}" + ) + + config = { + "sample_size": original_unet_config.image_size, + "in_channels": original_unet_config.in_channels, + "down_block_types": tuple(down_block_types), + "block_out_channels": tuple(block_out_channels), + "layers_per_block": original_unet_config.num_res_blocks, + "cross_attention_dim": original_unet_config.encoder_channels, + "attention_head_dim": head_dim, + "use_linear_projection": use_linear_projection, + "class_embed_type": class_embed_type, + "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, + "out_channels": original_unet_config.out_channels, + "up_block_types": tuple(up_block_types), + "upcast_attention": False, # TODO: guessing + "cross_attention_norm": "group_norm", + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "addition_embed_type": "text", + "act_fn": "gelu", + } + + if original_unet_config.use_scale_shift_norm: + config["resnet_time_scale_shift"] = "scale_shift" + + if "encoder_dim" in original_unet_config: + config["encoder_hid_dim"] = original_unet_config.encoder_dim + + return config + + +def convert_ldm_unet_checkpoint(unet_state_dict, config, path=None): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + if config["class_embed_type"] in [None, "identity"]: + # No parameters to port + ... 
+ elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + else: + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." 
in key] + for layer_id in range(num_output_blocks) + } + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + paths = renew_resnet_paths(resnets) + + # TODO need better check than i in [4, 8, 12, 16] + block_type = config["down_block_types"][block_id] + if (block_type == "ResnetDownsampleBlock2D" or block_type == "SimpleCrossAttnDownBlock2D") and i in [ + 4, + 8, + 12, + 16, + ]: + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.downsamplers.0"} + else: + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(attentions): + old_path = f"input_blocks.{i}.1" + new_path = f"down_blocks.{block_id}.attentions.{layer_in_block_id}" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + paths = renew_attention_paths(attentions) + meta_path = {"old": old_path, "new": new_path} + assign_to_checkpoint( + paths, + new_checkpoint, + unet_state_dict, + additional_replacements=[meta_path], + config=config, + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + old_path = "middle_block.1" + new_path = "mid_block.attentions.0" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + for i in range(num_output_blocks): + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + # len(output_block_list) == 1 -> resnet + # len(output_block_list) == 2 -> resnet, attention + # len(output_block_list) == 3 -> resnet, attention, upscale resnet + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in 
output_blocks[i] if f"output_blocks.{i}.1" in key] + + paths = renew_resnet_paths(resnets) + + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + if ["conv.bias", "conv.weight"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] + + # Clear attentions as they have been attributed above. + if len(attentions) == 2: + attentions = [] + + if len(attentions): + old_path = f"output_blocks.{i}.1" + new_path = f"up_blocks.{block_id}.attentions.{layer_in_block_id}" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + paths = renew_attention_paths(attentions) + meta_path = { + "old": old_path, + "new": new_path, + } + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(output_block_list) == 3: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.2" in key] + paths = renew_resnet_paths(resnets) + meta_path = {"old": f"output_blocks.{i}.2", "new": f"up_blocks.{block_id}.upsamplers.0"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + if "encoder_proj.weight" in unet_state_dict: + new_checkpoint["encoder_hid_proj.weight"] = unet_state_dict.pop("encoder_proj.weight") + new_checkpoint["encoder_hid_proj.bias"] = unet_state_dict.pop("encoder_proj.bias") + + if "encoder_pooling.0.weight" in unet_state_dict: + new_checkpoint["add_embedding.norm1.weight"] = unet_state_dict.pop("encoder_pooling.0.weight") + new_checkpoint["add_embedding.norm1.bias"] = unet_state_dict.pop("encoder_pooling.0.bias") + + new_checkpoint["add_embedding.pool.positional_embedding"] = unet_state_dict.pop( + "encoder_pooling.1.positional_embedding" + ) + new_checkpoint["add_embedding.pool.k_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.k_proj.weight") + new_checkpoint["add_embedding.pool.k_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.k_proj.bias") + new_checkpoint["add_embedding.pool.q_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.q_proj.weight") + new_checkpoint["add_embedding.pool.q_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.q_proj.bias") + new_checkpoint["add_embedding.pool.v_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.v_proj.weight") + new_checkpoint["add_embedding.pool.v_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.v_proj.bias") + + new_checkpoint["add_embedding.proj.weight"] = unet_state_dict.pop("encoder_pooling.2.weight") + new_checkpoint["add_embedding.proj.bias"] = 
unet_state_dict.pop("encoder_pooling.2.bias") + + new_checkpoint["add_embedding.norm2.weight"] = unet_state_dict.pop("encoder_pooling.3.weight") + new_checkpoint["add_embedding.norm2.bias"] = unet_state_dict.pop("encoder_pooling.3.bias") + + return new_checkpoint + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. + """ + if n_shave_prefix_segments >= 0: + return ".".join(path.split(".")[n_shave_prefix_segments:]) + else: + return ".".join(path.split(".")[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace("in_layers.0", "norm1") + new_item = new_item.replace("in_layers.2", "conv1") + + new_item = new_item.replace("out_layers.0", "norm2") + new_item = new_item.replace("out_layers.3", "conv2") + + new_item = new_item.replace("emb_layers.1", "time_emb_proj") + new_item = new_item.replace("skip_connection", "conv_shortcut") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + if "qkv" in new_item: + continue + + if "encoder_kv" in new_item: + continue + + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") + + new_item = new_item.replace("proj_out.weight", "to_out.0.weight") + new_item = new_item.replace("proj_out.bias", "to_out.0.bias") + + new_item = new_item.replace("norm_encoder.weight", "norm_cross.weight") + new_item = new_item.replace("norm_encoder.bias", "norm_cross.bias") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def assign_attention_to_checkpoint(new_checkpoint, unet_state_dict, old_path, new_path, config): + qkv_weight = unet_state_dict.pop(f"{old_path}.qkv.weight") + qkv_weight = qkv_weight[:, :, 0] + + qkv_bias = unet_state_dict.pop(f"{old_path}.qkv.bias") + + is_cross_attn_only = "only_cross_attention" in config and config["only_cross_attention"] + + split = 1 if is_cross_attn_only else 3 + + weights, bias = split_attentions( + weight=qkv_weight, + bias=qkv_bias, + split=split, + chunk_size=config["attention_head_dim"], + ) + + if is_cross_attn_only: + query_weight, q_bias = weights, bias + new_checkpoint[f"{new_path}.to_q.weight"] = query_weight[0] + new_checkpoint[f"{new_path}.to_q.bias"] = q_bias[0] + else: + [query_weight, key_weight, value_weight], [q_bias, k_bias, v_bias] = weights, bias + new_checkpoint[f"{new_path}.to_q.weight"] = query_weight + new_checkpoint[f"{new_path}.to_q.bias"] = q_bias + new_checkpoint[f"{new_path}.to_k.weight"] = key_weight + new_checkpoint[f"{new_path}.to_k.bias"] = k_bias + new_checkpoint[f"{new_path}.to_v.weight"] = value_weight + new_checkpoint[f"{new_path}.to_v.bias"] = v_bias + + encoder_kv_weight = unet_state_dict.pop(f"{old_path}.encoder_kv.weight") + encoder_kv_weight = encoder_kv_weight[:, :, 0] + + encoder_kv_bias = unet_state_dict.pop(f"{old_path}.encoder_kv.bias") + + [encoder_k_weight, encoder_v_weight], 
[encoder_k_bias, encoder_v_bias] = split_attentions( + weight=encoder_kv_weight, + bias=encoder_kv_bias, + split=2, + chunk_size=config["attention_head_dim"], + ) + + new_checkpoint[f"{new_path}.add_k_proj.weight"] = encoder_k_weight + new_checkpoint[f"{new_path}.add_k_proj.bias"] = encoder_k_bias + new_checkpoint[f"{new_path}.add_v_proj.weight"] = encoder_v_weight + new_checkpoint[f"{new_path}.add_v_proj.bias"] = encoder_v_bias + + +def assign_to_checkpoint(paths, checkpoint, old_checkpoint, additional_replacements=None, config=None): + """ + This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits + attention layers, and takes into account additional replacements that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + for path in paths: + new_path = path["new"] + + # Global renaming happens here + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement["old"], replacement["new"]) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path or "to_out.0.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path["old"]] + + +# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) +def split_attentions(*, weight, bias, split, chunk_size): + weights = [None] * split + biases = [None] * split + + weights_biases_idx = 0 + + for starting_row_index in range(0, weight.shape[0], chunk_size): + row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size) + + weight_rows = weight[row_indices, :] + bias_rows = bias[row_indices] + + if weights[weights_biases_idx] is None: + weights[weights_biases_idx] = weight_rows + biases[weights_biases_idx] = bias_rows + else: + assert weights[weights_biases_idx] is not None + weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows]) + biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows]) + + weights_biases_idx = (weights_biases_idx + 1) % split + + return weights, biases + + +def parse_list(value): + if isinstance(value, str): + value = value.split(",") + value = [int(v) for v in value] + elif isinstance(value, list): + pass + else: + raise ValueError(f"Can't parse list for type: {type(value)}") + + return value + + +# below is copy and pasted from original convert_if_stage_2.py script + + +def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_size=None): + orig_path = unet_checkpoint_path + + original_unet_config = OmegaConf.load(os.path.join(orig_path, "config.yml")) + original_unet_config = original_unet_config.params + + unet_diffusers_config = superres_create_unet_diffusers_config(original_unet_config) + unet_diffusers_config["time_embedding_dim"] = original_unet_config.model_channels * int( + original_unet_config.channel_mult.split(",")[-1] + ) + if original_unet_config.encoder_dim != original_unet_config.encoder_channels: + unet_diffusers_config["encoder_hid_dim"] = original_unet_config.encoder_dim + 
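A minimal standalone sketch of the row-dealing that `split_attentions` performs on a fused `qkv` projection: rows are consumed in `chunk_size`-sized blocks (one attention head at a time) and handed out round-robin to the q, k and v targets. The sizes below, and the assumption that the fused rows alternate q/k/v per head, are illustrative only.

```py
import torch

head_dim, num_heads = 4, 2
channels = head_dim * num_heads

# Hypothetical fused projection, rows laid out head-wise: q_h0, k_h0, v_h0, q_h1, ...
qkv_weight = torch.arange(3 * channels * channels, dtype=torch.float32).reshape(3 * channels, channels)
qkv_bias = torch.arange(3 * channels, dtype=torch.float32)

def split_round_robin(weight, bias, split, chunk_size):
    # Same row-dealing pattern as `split_attentions` above, written with lists for brevity.
    weights, biases = [[] for _ in range(split)], [[] for _ in range(split)]
    for idx, start in enumerate(range(0, weight.shape[0], chunk_size)):
        weights[idx % split].append(weight[start : start + chunk_size])
        biases[idx % split].append(bias[start : start + chunk_size])
    return [torch.cat(w) for w in weights], [torch.cat(b) for b in biases]

(q_w, k_w, v_w), (q_b, k_b, v_b) = split_round_robin(qkv_weight, qkv_bias, split=3, chunk_size=head_dim)
print(q_w.shape, k_w.shape, v_w.shape)  # each torch.Size([8, 8]): one full projection per target
```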
unet_diffusers_config["class_embed_type"] = "timestep" + unet_diffusers_config["addition_embed_type"] = "text" + + unet_diffusers_config["time_embedding_act_fn"] = "gelu" + unet_diffusers_config["resnet_skip_time_act"] = True + unet_diffusers_config["resnet_out_scale_factor"] = 1 / 0.7071 + unet_diffusers_config["mid_block_scale_factor"] = 1 / 0.7071 + unet_diffusers_config["only_cross_attention"] = ( + bool(original_unet_config.disable_self_attentions) + if ( + "disable_self_attentions" in original_unet_config + and isinstance(original_unet_config.disable_self_attentions, int) + ) + else True + ) + + if sample_size is None: + unet_diffusers_config["sample_size"] = original_unet_config.image_size + else: + # The second upscaler unet's sample size is incorrectly specified + # in the config and is instead hardcoded in source + unet_diffusers_config["sample_size"] = sample_size + + unet_checkpoint = torch.load(os.path.join(unet_checkpoint_path, "pytorch_model.bin"), map_location="cpu") + + if verify_param_count: + # check that architecture matches - is a bit slow + verify_param_count(orig_path, unet_diffusers_config) + + converted_unet_checkpoint = superres_convert_ldm_unet_checkpoint( + unet_checkpoint, unet_diffusers_config, path=unet_checkpoint_path + ) + converted_keys = converted_unet_checkpoint.keys() + + model = UNet2DConditionModel(**unet_diffusers_config) + expected_weights = model.state_dict().keys() + + diff_c_e = set(converted_keys) - set(expected_weights) + diff_e_c = set(expected_weights) - set(converted_keys) + + assert len(diff_e_c) == 0, f"Expected, but not converted: {diff_e_c}" + assert len(diff_c_e) == 0, f"Converted, but not expected: {diff_c_e}" + + model.load_state_dict(converted_unet_checkpoint) + + return model + + +def superres_create_unet_diffusers_config(original_unet_config): + attention_resolutions = parse_list(original_unet_config.attention_resolutions) + attention_resolutions = [original_unet_config.image_size // int(res) for res in attention_resolutions] + + channel_mult = parse_list(original_unet_config.channel_mult) + block_out_channels = [original_unet_config.model_channels * mult for mult in channel_mult] + + down_block_types = [] + resolution = 1 + + for i in range(len(block_out_channels)): + if resolution in attention_resolutions: + block_type = "SimpleCrossAttnDownBlock2D" + elif original_unet_config.resblock_updown: + block_type = "ResnetDownsampleBlock2D" + else: + block_type = "DownBlock2D" + + down_block_types.append(block_type) + + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + if resolution in attention_resolutions: + block_type = "SimpleCrossAttnUpBlock2D" + elif original_unet_config.resblock_updown: + block_type = "ResnetUpsampleBlock2D" + else: + block_type = "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + head_dim = original_unet_config.num_head_channels + use_linear_projection = ( + original_unet_config.use_linear_in_transformer + if "use_linear_in_transformer" in original_unet_config + else False + ) + if use_linear_projection: + # stable diffusion 2-base-512 and 2-768 + if head_dim is None: + head_dim = [5, 10, 20, 20] + + class_embed_type = None + projection_class_embeddings_input_dim = None + + if "num_classes" in original_unet_config: + if original_unet_config.num_classes == "sequential": + class_embed_type = "projection" + assert "adm_in_channels" in original_unet_config + projection_class_embeddings_input_dim = 
original_unet_config.adm_in_channels + else: + raise NotImplementedError( + f"Unknown conditional unet num_classes config: {original_unet_config.num_classes}" + ) + + config = { + "in_channels": original_unet_config.in_channels, + "down_block_types": tuple(down_block_types), + "block_out_channels": tuple(block_out_channels), + "layers_per_block": tuple(original_unet_config.num_res_blocks), + "cross_attention_dim": original_unet_config.encoder_channels, + "attention_head_dim": head_dim, + "use_linear_projection": use_linear_projection, + "class_embed_type": class_embed_type, + "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, + "out_channels": original_unet_config.out_channels, + "up_block_types": tuple(up_block_types), + "upcast_attention": False, # TODO: guessing + "cross_attention_norm": "group_norm", + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "act_fn": "gelu", + } + + if original_unet_config.use_scale_shift_norm: + config["resnet_time_scale_shift"] = "scale_shift" + + return config + + +def superres_convert_ldm_unet_checkpoint(unet_state_dict, config, path=None, extract_ema=False): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + if config["class_embed_type"] is None: + # No parameters to port + ... + elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["aug_proj.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["aug_proj.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["aug_proj.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["aug_proj.2.bias"] + else: + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") + + if "encoder_proj.weight" in unet_state_dict: + new_checkpoint["encoder_hid_proj.weight"] = unet_state_dict["encoder_proj.weight"] + new_checkpoint["encoder_hid_proj.bias"] = unet_state_dict["encoder_proj.bias"] + + if "encoder_pooling.0.weight" in unet_state_dict: + mapping = { + "encoder_pooling.0": "add_embedding.norm1", + "encoder_pooling.1": "add_embedding.pool", + "encoder_pooling.2": "add_embedding.proj", + "encoder_pooling.3": "add_embedding.norm2", + } + for key in unet_state_dict.keys(): + if key.startswith("encoder_pooling"): + prefix = key[: len("encoder_pooling.0")] + new_key = key.replace(prefix, mapping[prefix]) + new_checkpoint[new_key] = unet_state_dict[key] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = 
{ + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key] + for layer_id in range(num_output_blocks) + } + if not isinstance(config["layers_per_block"], int): + layers_per_block_list = [e + 1 for e in config["layers_per_block"]] + layers_per_block_cumsum = list(np.cumsum(layers_per_block_list)) + downsampler_ids = layers_per_block_cumsum + else: + # TODO need better check than i in [4, 8, 12, 16] + downsampler_ids = [4, 8, 12, 16] + + for i in range(1, num_input_blocks): + if isinstance(config["layers_per_block"], int): + layers_per_block = config["layers_per_block"] + block_id = (i - 1) // (layers_per_block + 1) + layer_in_block_id = (i - 1) % (layers_per_block + 1) + else: + block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if (i - 1) < n) + passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0 + layer_in_block_id = (i - 1) - passed_blocks + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + paths = renew_resnet_paths(resnets) + + block_type = config["down_block_types"][block_id] + if ( + block_type == "ResnetDownsampleBlock2D" or block_type == "SimpleCrossAttnDownBlock2D" + ) and i in downsampler_ids: + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.downsamplers.0"} + else: + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(attentions): + old_path = f"input_blocks.{i}.1" + new_path = f"down_blocks.{block_id}.attentions.{layer_in_block_id}" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + paths = renew_attention_paths(attentions) + meta_path = {"old": old_path, "new": new_path} + assign_to_checkpoint( + paths, + new_checkpoint, + unet_state_dict, + additional_replacements=[meta_path], + config=config, + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + old_path = "middle_block.1" + 
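A standalone sketch of the cumulative-sum bookkeeping used in the input-block loop above when `layers_per_block` is a per-block list; the value `[2, 2, 3]` is hypothetical, and the point is only how a flat `input_blocks.{i}` index maps to `(block_id, layer_in_block_id)`.

```py
import numpy as np

layers_per_block = [2, 2, 3]                                      # hypothetical per-block resnet counts
layers_per_block_list = [e + 1 for e in layers_per_block]         # one extra slot per block (downsampler position)
layers_per_block_cumsum = list(np.cumsum(layers_per_block_list))  # [3, 6, 10]

# input_blocks.0 is conv_in, so the mapping starts at i = 1
for i in range(1, layers_per_block_cumsum[-1] + 1):
    block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if (i - 1) < n)
    passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0
    layer_in_block_id = (i - 1) - passed_blocks
    tag = "downsampler slot" if i in layers_per_block_cumsum else f"resnet {layer_in_block_id}"
    print(f"input_blocks.{i} -> down_blocks.{block_id} ({tag})")
```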
new_path = "mid_block.attentions.0" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + if not isinstance(config["layers_per_block"], int): + layers_per_block_list = list(reversed([e + 1 for e in config["layers_per_block"]])) + layers_per_block_cumsum = list(np.cumsum(layers_per_block_list)) + + for i in range(num_output_blocks): + if isinstance(config["layers_per_block"], int): + layers_per_block = config["layers_per_block"] + block_id = i // (layers_per_block + 1) + layer_in_block_id = i % (layers_per_block + 1) + else: + block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if i < n) + passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0 + layer_in_block_id = i - passed_blocks + + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + # len(output_block_list) == 1 -> resnet + # len(output_block_list) == 2 -> resnet, attention or resnet, upscale resnet + # len(output_block_list) == 3 -> resnet, attention, upscale resnet + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + + has_attention = True + if len(output_block_list) == 2 and any("in_layers" in k for k in output_block_list["1"]): + has_attention = False + + maybe_attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] + + paths = renew_resnet_paths(resnets) + + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + if ["conv.bias", "conv.weight"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] + + # this layer was no attention + has_attention = False + maybe_attentions = [] + + if has_attention: + old_path = f"output_blocks.{i}.1" + new_path = f"up_blocks.{block_id}.attentions.{layer_in_block_id}" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + paths = renew_attention_paths(maybe_attentions) + meta_path = { + "old": old_path, + "new": new_path, + } + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(output_block_list) == 3 or (not has_attention and len(maybe_attentions) > 0): + layer_id = len(output_block_list) - 1 + resnets = [key for key in output_blocks[i] if 
f"output_blocks.{i}.{layer_id}" in key] + paths = renew_resnet_paths(resnets) + meta_path = {"old": f"output_blocks.{i}.{layer_id}", "new": f"up_blocks.{block_id}.upsamplers.0"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + +def verify_param_count(orig_path, unet_diffusers_config): + if "-II-" in orig_path: + from deepfloyd_if.modules import IFStageII + + if_II = IFStageII(device="cpu", dir_or_name=orig_path) + elif "-III-" in orig_path: + from deepfloyd_if.modules import IFStageIII + + if_II = IFStageIII(device="cpu", dir_or_name=orig_path) + else: + assert f"Weird name. Should have -II- or -III- in path: {orig_path}" + + unet = UNet2DConditionModel(**unet_diffusers_config) + + # in params + assert_param_count(unet.time_embedding, if_II.model.time_embed) + assert_param_count(unet.conv_in, if_II.model.input_blocks[:1]) + + # downblocks + assert_param_count(unet.down_blocks[0], if_II.model.input_blocks[1:4]) + assert_param_count(unet.down_blocks[1], if_II.model.input_blocks[4:7]) + assert_param_count(unet.down_blocks[2], if_II.model.input_blocks[7:11]) + + if "-II-" in orig_path: + assert_param_count(unet.down_blocks[3], if_II.model.input_blocks[11:17]) + assert_param_count(unet.down_blocks[4], if_II.model.input_blocks[17:]) + if "-III-" in orig_path: + assert_param_count(unet.down_blocks[3], if_II.model.input_blocks[11:15]) + assert_param_count(unet.down_blocks[4], if_II.model.input_blocks[15:20]) + assert_param_count(unet.down_blocks[5], if_II.model.input_blocks[20:]) + + # mid block + assert_param_count(unet.mid_block, if_II.model.middle_block) + + # up block + if "-II-" in orig_path: + assert_param_count(unet.up_blocks[0], if_II.model.output_blocks[:6]) + assert_param_count(unet.up_blocks[1], if_II.model.output_blocks[6:12]) + assert_param_count(unet.up_blocks[2], if_II.model.output_blocks[12:16]) + assert_param_count(unet.up_blocks[3], if_II.model.output_blocks[16:19]) + assert_param_count(unet.up_blocks[4], if_II.model.output_blocks[19:]) + if "-III-" in orig_path: + assert_param_count(unet.up_blocks[0], if_II.model.output_blocks[:5]) + assert_param_count(unet.up_blocks[1], if_II.model.output_blocks[5:10]) + assert_param_count(unet.up_blocks[2], if_II.model.output_blocks[10:14]) + assert_param_count(unet.up_blocks[3], if_II.model.output_blocks[14:18]) + assert_param_count(unet.up_blocks[4], if_II.model.output_blocks[18:21]) + assert_param_count(unet.up_blocks[5], if_II.model.output_blocks[21:24]) + + # out params + assert_param_count(unet.conv_norm_out, if_II.model.out[0]) + assert_param_count(unet.conv_out, if_II.model.out[2]) + + # make sure all model architecture has same param count + assert_param_count(unet, if_II.model) + + +def assert_param_count(model_1, model_2): + count_1 = sum(p.numel() for p in model_1.parameters()) + count_2 = sum(p.numel() for p in model_2.parameters()) + assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}" + + +def superres_check_against_original(dump_path, unet_checkpoint_path): + model_path = dump_path + model = UNet2DConditionModel.from_pretrained(model_path) + 
model.to("cuda") + orig_path = unet_checkpoint_path + + if "-II-" in orig_path: + from deepfloyd_if.modules import IFStageII + + if_II_model = IFStageII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model + elif "-III-" in orig_path: + from deepfloyd_if.modules import IFStageIII + + if_II_model = IFStageIII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model + + batch_size = 1 + channels = model.in_channels // 2 + height = model.sample_size + width = model.sample_size + height = 1024 + width = 1024 + + torch.manual_seed(0) + + latents = torch.randn((batch_size, channels, height, width), device=model.device) + image_small = torch.randn((batch_size, channels, height // 4, width // 4), device=model.device) + + interpolate_antialias = {} + if "antialias" in inspect.signature(F.interpolate).parameters: + interpolate_antialias["antialias"] = True + image_upscaled = F.interpolate( + image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias + ) + + latent_model_input = torch.cat([latents, image_upscaled], dim=1).to(model.dtype) + t = torch.tensor([5], device=model.device).to(model.dtype) + + seq_len = 64 + encoder_hidden_states = torch.randn((batch_size, seq_len, model.config.encoder_hid_dim), device=model.device).to( + model.dtype + ) + + fake_class_labels = torch.tensor([t], device=model.device).to(model.dtype) + + with torch.no_grad(): + out = if_II_model(latent_model_input, t, aug_steps=fake_class_labels, text_emb=encoder_hidden_states) + + if_II_model.to("cpu") + del if_II_model + import gc + + torch.cuda.empty_cache() + gc.collect() + print(50 * "=") + + with torch.no_grad(): + noise_pred = model( + sample=latent_model_input, + encoder_hidden_states=encoder_hidden_states, + class_labels=fake_class_labels, + timestep=t, + ).sample + + print("Out shape", noise_pred.shape) + print("Diff", (out - noise_pred).abs().sum()) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 40029fcecfd1..e9d12bdb7cca 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -114,6 +114,12 @@ AltDiffusionPipeline, AudioLDMPipeline, CycleDiffusionPipeline, + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 772e119fbe97..af639de306ee 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -109,6 +109,7 @@ def register_to_config(self, **kwargs): # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument, # or solve in a more general way. 
kwargs.pop("kwargs", None) + if not hasattr(self, "_internal_dict"): internal_dict = kwargs else: @@ -550,6 +551,9 @@ def to_json_saveable(value): return value config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()} + # Don't save "_ignore_files" + config_dict.pop("_ignore_files", None) + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" def to_json_file(self, json_file_path: Union[str, os.PathLike]): diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index d12e75344ba1..fa88bce305e6 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -377,3 +377,69 @@ def forward(self, timestep, class_labels, hidden_dtype=None): conditioning = timesteps_emb + class_labels # (N, D) return conditioning + + +class TextTimeEmbedding(nn.Module): + def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64): + super().__init__() + self.norm1 = nn.LayerNorm(encoder_dim) + self.pool = AttentionPooling(num_heads, encoder_dim) + self.proj = nn.Linear(encoder_dim, time_embed_dim) + self.norm2 = nn.LayerNorm(time_embed_dim) + + def forward(self, hidden_states): + hidden_states = self.norm1(hidden_states) + hidden_states = self.pool(hidden_states) + hidden_states = self.proj(hidden_states) + hidden_states = self.norm2(hidden_states) + return hidden_states + + +class AttentionPooling(nn.Module): + # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 + + def __init__(self, num_heads, embed_dim, dtype=None): + super().__init__() + self.dtype = dtype + self.positional_embedding = nn.Parameter(torch.randn(1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.q_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.v_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.num_heads = num_heads + self.dim_per_head = embed_dim // self.num_heads + + def forward(self, x): + bs, length, width = x.size() + + def shape(x): + # (bs, length, width) --> (bs, length, n_heads, dim_per_head) + x = x.view(bs, -1, self.num_heads, self.dim_per_head) + # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) + x = x.transpose(1, 2) + # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) + x = x.reshape(bs * self.num_heads, -1, self.dim_per_head) + # (bs*n_heads, length, dim_per_head) --> (bs*n_heads, dim_per_head, length) + x = x.transpose(1, 2) + return x + + class_token = x.mean(dim=1, keepdim=True) + self.positional_embedding.to(x.dtype) + x = torch.cat([class_token, x], dim=1) # (bs, length+1, width) + + # (bs*n_heads, class_token_length, dim_per_head) + q = shape(self.q_proj(class_token)) + # (bs*n_heads, length+class_token_length, dim_per_head) + k = shape(self.k_proj(x)) + v = shape(self.v_proj(x)) + + # (bs*n_heads, class_token_length, length+class_token_length): + scale = 1 / math.sqrt(math.sqrt(self.dim_per_head)) + weight = torch.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) + + # (bs*n_heads, dim_per_head, class_token_length) + a = torch.einsum("bts,bcs->bct", weight, v) + + # (bs, length+1, width) + a = a.reshape(bs, -1, 1).transpose(1, 2) + + return a[:, 0, :] # cls_token diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 5363e6330623..521e99fdd69c 100644 --- 
a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -15,6 +15,7 @@ # limitations under the License. import inspect +import itertools import os from functools import partial from typing import Any, Callable, List, Optional, Tuple, Union @@ -60,7 +61,8 @@ def get_parameter_device(parameter: torch.nn.Module): try: - return next(parameter.parameters()).device + parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers()) + return next(parameters_and_buffers).device except StopIteration: # For torch.nn.DataParallel compatibility in PyTorch 1.5 @@ -75,7 +77,8 @@ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]: def get_parameter_dtype(parameter: torch.nn.Module): try: - return next(parameter.parameters()).dtype + parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers()) + return next(parameters_and_buffers).dtype except StopIteration: # For torch.nn.DataParallel compatibility in PyTorch 1.5 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index b4997a257643..38e0fa3b5b2e 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -23,7 +23,7 @@ from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from .embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import ( CrossAttnDownBlock2D, @@ -97,11 +97,16 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to None): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. num_class_embeds (`int`, *optional*, defaults to None): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. time_embedding_type (`str`, *optional*, default to `positional`): The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. + time_embedding_dim (`int`, *optional*, default to `None`): + An optional override for the dimension of the projected time embedding. time_embedding_act_fn (`str`, *optional*, default to `None`): Optional activation function to use on the time embeddings only one time before they as passed to the rest of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`. 
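A minimal standalone sketch of why `get_parameter_device` and `get_parameter_dtype` now chain in `buffers()`: a module that only registers buffers has no parameters, so `next(module.parameters())` raises `StopIteration`, while the chained iterator still yields a tensor whose device and dtype can be reported. The `BufferOnly` module below is a made-up minimal case.

```py
import itertools
import torch
import torch.nn as nn

class BufferOnly(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("table", torch.zeros(4, 4))

module = BufferOnly()
print(list(module.parameters()))  # [] -> next(module.parameters()) would raise StopIteration

parameters_and_buffers = itertools.chain(module.parameters(), module.buffers())
first = next(parameters_and_buffers)  # the registered buffer
print(first.device, first.dtype)      # cpu torch.float32
```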
@@ -155,12 +160,14 @@ def __init__( dual_cross_attention: bool = False, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", resnet_skip_time_act: bool = False, resnet_out_scale_factor: int = 1.0, time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, time_embedding_act_fn: Optional[str] = None, timestep_post_act: Optional[str] = None, time_cond_proj_dim: Optional[int] = None, @@ -170,6 +177,7 @@ def __init__( class_embeddings_concat: bool = False, mid_block_only_cross_attention: Optional[bool] = None, cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, ): super().__init__() @@ -214,7 +222,7 @@ def __init__( # time if time_embedding_type == "fourier": - time_embed_dim = block_out_channels[0] * 2 + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 if time_embed_dim % 2 != 0: raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") self.time_proj = GaussianFourierProjection( @@ -222,7 +230,7 @@ def __init__( ) timestep_input_dim = time_embed_dim elif time_embedding_type == "positional": - time_embed_dim = block_out_channels[0] * 4 + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] @@ -273,6 +281,18 @@ def __init__( else: self.class_embedding = None + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") + if time_embedding_act_fn is None: self.time_embed_act = None elif time_embedding_act_fn == "swish": @@ -684,6 +704,10 @@ def forward( else: emb = emb + class_emb + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + emb = emb + aug_emb + if self.time_embed_act is not None: emb = self.time_embed_act(emb) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 602cf028e2e9..10da653a1377 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -44,6 +44,14 @@ else: from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline from .audioldm import AudioLDMPipeline + from .deepfloyd_if import ( + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ) from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/deepfloyd_if/__init__.py b/src/diffusers/pipelines/deepfloyd_if/__init__.py new file mode 100644 index 000000000000..93414f20e733 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/__init__.py @@ -0,0 +1,54 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL + +from ...utils import BaseOutput, OptionalDependencyNotAvailable, 
is_torch_available, is_transformers_available +from .timesteps import ( + fast27_timesteps, + smart27_timesteps, + smart50_timesteps, + smart100_timesteps, + smart185_timesteps, + super27_timesteps, + super40_timesteps, + super100_timesteps, +) + + +@dataclass +class IFPipelineOutput(BaseOutput): + """ + Args: + Output class for Stable Diffusion pipelines. + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content or a watermark. `None` if safety checking could not be performed. + watermark_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety + checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_detected: Optional[List[bool]] + watermark_detected: Optional[List[bool]] + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 +else: + from .pipeline_if import IFPipeline + from .pipeline_if_img2img import IFImg2ImgPipeline + from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline + from .pipeline_if_inpainting import IFInpaintingPipeline + from .pipeline_if_inpainting_superresolution import IFInpaintingSuperResolutionPipeline + from .pipeline_if_superresolution import IFSuperResolutionPipeline + from .safety_checker import IFSafetyChecker + from .watermark import IFWatermarker diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py new file mode 100644 index 000000000000..a76e51a3ffe9 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -0,0 +1,854 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . 
import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt" + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> safety_modules = { + ... "feature_extractor": pipe.feature_extractor, + ... "safety_checker": pipe.safety_checker, + ... "watermarker": pipe.watermarker, + ... } + >>> super_res_2_pipe = DiffusionPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16 + ... ) + >>> super_res_2_pipe.enable_model_cpu_offload() + + >>> image = super_res_2_pipe( + ... prompt=prompt, + ... image=image, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. 
Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. 
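A usage sketch for the model offloading described above; the checkpoint name and dtype follow the example docstring earlier in this file, and the prompt is arbitrary.

```py
import torch
from diffusers import IFPipeline

pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)

# Keeps only one sub-model (T5 text encoder, UNet, or safety checker) on the GPU at a time;
# the pipeline stores the text encoder's offload hook so it can be flushed before the UNet runs.
pipe.enable_model_cpu_offload()

prompt_embeds, negative_embeds = pipe.encode_prompt("a photo of a red panda reading a newspaper")
image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images
```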
+ self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. 
Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, device, generator): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + intermediate_images = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + intermediate_images = intermediate_images * self.scheduler.init_noise_sigma + return intermediate_images + + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = 
re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
+ + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + height: Optional[int] = None, + width: Optional[int] = None, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*, defaults to self.unet.config.sample_size): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size): + The width in pixels of the generated image. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 2. Define call parameters + height = height or self.unet.config.sample_size + width = width or self.unet.config.sample_size + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare intermediate images + intermediate_images = self.prepare_intermediate_images( + batch_size * num_images_per_prompt, + self.unet.config.in_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + + # 11. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. 
Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py new file mode 100644 index 000000000000..a31748450d4b --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -0,0 +1,979 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image.resize((768, 512)) + + >>> pipe = IFImg2ImgPipeline.from_pretrained( + ... "DeepFloyd/IF-I-IF-v1.0", + ... variant="fp16", + ... torch_dtype=torch.float16, + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A fantasy landscape in style minecraft" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", + ... text_encoder=None, + ... variant="fp16", + ... torch_dtype=torch.float16, + ... 
) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFImg2ImgPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 
3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
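Editorial note on the `encode_prompt` method reproduced above: the `repeat(1, num_images_per_prompt, 1)` followed by `view(...)` pattern can look opaque. The sketch below uses made-up toy shapes and is an illustration only; it checks that this "mps friendly" duplication is equivalent to a plain `repeat_interleave` along the batch dimension, which is exactly what duplicating embeddings per generated image requires.

```py
# Editorial sketch with toy shapes -- not part of the pipeline code.
import torch

prompt_embeds = torch.randn(2, 77, 64)  # (batch, seq_len, dim), illustrative sizes only
num_images_per_prompt = 3

bs_embed, seq_len, _ = prompt_embeds.shape
duplicated = prompt_embeds.repeat(1, num_images_per_prompt, 1)               # (2, 231, 64)
duplicated = duplicated.view(bs_embed * num_images_per_prompt, seq_len, -1)  # (6, 77, 64)

# Same result as duplicating each prompt's embedding back-to-back along the batch axis.
assert torch.equal(duplicated, prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0))
```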
+ + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None + ): + _, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + image = self.scheduler.add_noise(image, noise, timestep) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.7, + num_inference_steps: int = 80, + timesteps: List[int] = None, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. 
Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, image, batch_size, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. Prepare intermediate images + image = self.preprocess_image(image) + image = image.to(device=device, dtype=dtype) + + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, generator + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + + # 11. 
Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py new file mode 100644 index 000000000000..21e280654cf5 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -0,0 +1,1097 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image.resize((768, 512)) + + >>> pipe = IFImg2ImgPipeline.from_pretrained( + ... "DeepFloyd/IF-I-IF-v1.0", + ... variant="fp16", + ... torch_dtype=torch.float16, + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A fantasy landscape in style minecraft" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... 
image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", + ... text_encoder=None, + ... variant="fp16", + ... torch_dtype=torch.float16, + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warn( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
+ ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
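+ # Each hook in the chain moves its model back to CPU once the next model in the chain runs;
+ # the hook kept here is released explicitly via `self.final_offload_hook.offload()` at the
+ # end of `__call__`, so the last model is also returned to CPU.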
+ self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = 
re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: 
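+ # no negative prompt was given, fall back to unconditional (empty string) embeddings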
+ uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
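+ # (the `DDPMScheduler` used by this pipeline takes no `eta`, so the signature check below drops it)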
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + original_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # original_image + + if isinstance(original_image, list): + check_image_type = original_image[0] + else: + check_image_type = original_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(original_image, list): + image_batch_size = len(original_image) + elif isinstance(original_image, torch.Tensor): + image_batch_size = original_image.shape[0] + elif isinstance(original_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(original_image, np.ndarray): + image_batch_size = original_image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError( + f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image + def preprocess_original_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device) -> torch.Tensor: + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 255.0 for i in image] + + image = np.stack(image, axis=0) # to np + torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.prepare_intermediate_images + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None + ): + 
_, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + image = self.scheduler.add_noise(image, noise, timestep) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + original_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + original_image (`torch.FloatTensor` or `PIL.Image.Image`): + The original image that `image` was varied from. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + noise_level (`int`, *optional*, defaults to 250): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + original_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + device = self._execution_device + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. prepare original image + original_image = self.preprocess_original_image(original_image) + original_image = original_image.to(device=device, dtype=dtype) + + # 6. Prepare intermediate images + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + original_image, + noise_timestep, + batch_size, + num_images_per_prompt, + dtype, + device, + generator, + ) + + # 7. Prepare upscaled image and noise level + _, _, height, width = original_image.shape + + image = self.preprocess_image(image, num_images_per_prompt, device) + + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 12. Convert to PIL + image = self.numpy_to_pil(image) + + # 13. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py new file mode 100644 index 000000000000..95eba1cc7d24 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -0,0 +1,1098 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" + >>> response = requests.get(url) + >>> mask_image = Image.open(BytesIO(response.content)) + >>> mask_image = mask_image + + >>> pipe = IFInpaintingPipeline.from_pretrained( + ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "blue sunglasses" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... image=original_image, + ... mask_image=mask_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( + ... 
"DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... mask_image=mask_image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFInpaintingPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # mask_image + + if isinstance(mask_image, list): + check_image_type = mask_image[0] + else: + check_image_type = mask_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(mask_image, list): + image_batch_size = len(mask_image) + elif isinstance(mask_image, torch.Tensor): + image_batch_size = mask_image.shape[0] + elif isinstance(mask_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(mask_image, np.ndarray): + image_batch_size = mask_image.shape[0] + else: + assert False + + if image_batch_size != 1 and batch_size != image_batch_size: + raise ValueError( + f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = 
re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + def preprocess_mask_image(self, mask_image) -> torch.Tensor: + if not isinstance(mask_image, list): + mask_image = [mask_image] + + if isinstance(mask_image[0], torch.Tensor): + mask_image = torch.cat(mask_image, axis=0) if mask_image[0].ndim == 4 else torch.stack(mask_image, axis=0) + + if mask_image.ndim == 2: + # Batch and add channel dim for single mask + mask_image = mask_image.unsqueeze(0).unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] == 1: + # Single mask, the 0'th dimension is considered to be + 
# the existing batch size of 1 + mask_image = mask_image.unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] != 1: + # Batch of mask, the 0'th dimension is considered to be + # the batching dimension + mask_image = mask_image.unsqueeze(1) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + + elif isinstance(mask_image[0], PIL.Image.Image): + new_mask_image = [] + + for mask_image_ in mask_image: + mask_image_ = mask_image_.convert("L") + mask_image_ = resize(mask_image_, self.unet.sample_size) + mask_image_ = np.array(mask_image_) + mask_image_ = mask_image_[None, None, :] + new_mask_image.append(mask_image_) + + mask_image = new_mask_image + + mask_image = np.concatenate(mask_image, axis=0) + mask_image = mask_image.astype(np.float32) / 255.0 + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + elif isinstance(mask_image[0], np.ndarray): + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + return mask_image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None + ): + image_batch_size, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + noised_image = self.scheduler.add_noise(image, noise, timestep) + + image = (1 - mask_image) * image + mask_image * noised_image + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + mask_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 1.0, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + mask_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. Prepare intermediate images + image = self.preprocess_image(image) + image = image.to(device=device, dtype=dtype) + + mask_image = self.preprocess_mask_image(mask_image) + mask_image = mask_image.to(device=device, dtype=dtype) + + if mask_image.shape[0] == 1: + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0) + else: + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + prev_intermediate_images = intermediate_images + + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + + # 11. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py new file mode 100644 index 000000000000..4eb0bf300fa5 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -0,0 +1,1208 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" + >>> response = requests.get(url) + >>> mask_image = Image.open(BytesIO(response.content)) + >>> mask_image = mask_image + + >>> pipe = IFInpaintingPipeline.from_pretrained( + ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "blue sunglasses" + + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + >>> image = pipe( + ... image=original_image, + ... mask_image=mask_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... 
).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... mask_image=mask_image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` + """ + + +class IFInpaintingSuperResolutionPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warn( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
+ ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
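+        # At this point `hook` refers to whichever module was registered last with
+        # `cpu_offload_with_hook` (the safety checker when present, otherwise the unet),
+        # so flushing `final_offload_hook` after generation moves that last module back to the CPU.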
+ self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = 
re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: 
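+                # No negative prompt was given, so the unconditional branch is encoded from empty
+                # strings, one per prompt in the batch.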
+ uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
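+        # Note: these pipelines use a `DDPMScheduler`, whose `step()` does not accept `eta`,
+        # so the signature checks below will typically drop it.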
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + original_image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # original_image + + if isinstance(original_image, list): + check_image_type = original_image[0] + else: + check_image_type = original_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(original_image, list): + image_batch_size = len(original_image) + elif isinstance(original_image, torch.Tensor): + image_batch_size = original_image.shape[0] + elif isinstance(original_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(original_image, np.ndarray): + image_batch_size = original_image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError( + f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" + ) + + # mask_image + + if isinstance(mask_image, list): + check_image_type = mask_image[0] + else: + check_image_type = mask_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(mask_image, list): + image_batch_size = len(mask_image) + elif isinstance(mask_image, torch.Tensor): + image_batch_size = mask_image.shape[0] + elif isinstance(mask_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(mask_image, np.ndarray): + image_batch_size = mask_image.shape[0] + else: + assert False + + if image_batch_size != 1 and batch_size != image_batch_size: + raise ValueError( + f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image + def preprocess_original_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device) -> torch.Tensor: + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 255.0 for i in image] + + image = np.stack(image, axis=0) # to np + torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = 
torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.preprocess_mask_image + def preprocess_mask_image(self, mask_image) -> torch.Tensor: + if not isinstance(mask_image, list): + mask_image = [mask_image] + + if isinstance(mask_image[0], torch.Tensor): + mask_image = torch.cat(mask_image, axis=0) if mask_image[0].ndim == 4 else torch.stack(mask_image, axis=0) + + if mask_image.ndim == 2: + # Batch and add channel dim for single mask + mask_image = mask_image.unsqueeze(0).unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] == 1: + # Single mask, the 0'th dimension is considered to be + # the existing batch size of 1 + mask_image = mask_image.unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] != 1: + # Batch of mask, the 0'th dimension is considered to be + # the batching dimension + mask_image = mask_image.unsqueeze(1) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + + elif isinstance(mask_image[0], PIL.Image.Image): + new_mask_image = [] + + for mask_image_ in mask_image: + mask_image_ = mask_image_.convert("L") + mask_image_ = resize(mask_image_, self.unet.sample_size) + mask_image_ = np.array(mask_image_) + mask_image_ = mask_image_[None, None, :] + new_mask_image.append(mask_image_) + + mask_image = new_mask_image + + mask_image = np.concatenate(mask_image, axis=0) + mask_image = mask_image.astype(np.float32) / 255.0 + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + elif isinstance(mask_image[0], np.ndarray): + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + return mask_image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.prepare_intermediate_images + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None + ): + image_batch_size, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + noised_image = self.scheduler.add_noise(image, noise, timestep) + + image = (1 - mask_image) * image + mask_image * noised_image + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + original_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + mask_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + original_image (`torch.FloatTensor` or `PIL.Image.Image`): + The original image that `image` was varied from. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. 
of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + noise_level (`int`, *optional*, defaults to 0): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + original_image, + mask_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + device = self._execution_device + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. prepare original image + original_image = self.preprocess_original_image(original_image) + original_image = original_image.to(device=device, dtype=dtype) + + # 6. prepare mask image + mask_image = self.preprocess_mask_image(mask_image) + mask_image = mask_image.to(device=device, dtype=dtype) + + if mask_image.shape[0] == 1: + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0) + else: + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + + # 6. Prepare intermediate images + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + original_image, + noise_timestep, + batch_size, + num_images_per_prompt, + dtype, + device, + mask_image, + generator, + ) + + # 7. Prepare upscaled image and noise level + _, _, height, width = original_image.shape + + image = self.preprocess_image(image, num_images_per_prompt, device) + + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + prev_intermediate_images = intermediate_images + + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 12. Convert to PIL + image = self.numpy_to_pil(image) + + # 13. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py new file mode 100644 index 000000000000..bb1d4ee4ba66 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -0,0 +1,947 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds + ... 
).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFSuperResolutionPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warn( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
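+        # (`final_offload_hook` ends up pointing at the hook created last in the chain:
+        # the safety checker's hook when one is loaded, otherwise the unet's. `__call__`
+        # calls its `offload()` at the end of generation so the last model is moved back
+        # to the CPU as well.)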
+ self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = 
re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: 
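+                # no negative prompt was given, so the unconditional branch of classifier-free
+                # guidance is conditioned on the empty string for every prompt in the batch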
+ uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + batch_size, + noise_level, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: + raise ValueError( + f"`noise_level`: {noise_level} must be a valid timestep in `self.noising_scheduler`, [0, {self.image_noising_scheduler.config.num_train_timesteps})" + ) + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_intermediate_images + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, device, generator): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + intermediate_images = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + intermediate_images = intermediate_images * self.scheduler.init_noise_sigma + return intermediate_images + + def preprocess_image(self, image, num_images_per_prompt, device): + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 255.0 for i in image] + + image = np.stack(image, axis=0) # to np + torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`PIL.Image.Image`, `np.ndarray`, `torch.FloatTensor`): + The image to be upscaled. 
+ num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + noise_level (`int`, *optional*, defaults to 250): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. 
Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. Raise error if not correct + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + batch_size, + noise_level, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + height = self.unet.config.sample_size + width = self.unet.config.sample_size + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare intermediate images + num_channels = self.unet.config.in_channels // 2 + intermediate_images = self.prepare_intermediate_images( + batch_size * num_images_per_prompt, + num_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare upscaled image and noise level + image = self.preprocess_image(image, num_images_per_prompt, device) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 9. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 10. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 11. Convert to PIL + image = self.numpy_to_pil(image) + + # 12. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 9. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 10. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/safety_checker.py b/src/diffusers/pipelines/deepfloyd_if/safety_checker.py new file mode 100644 index 000000000000..8ffeed580bbe --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/safety_checker.py @@ -0,0 +1,59 @@ +import numpy as np +import torch +import torch.nn as nn +from transformers import CLIPConfig, CLIPVisionModelWithProjection, PreTrainedModel + +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class IFSafetyChecker(PreTrainedModel): + config_class = CLIPConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPConfig): + super().__init__(config) + + self.vision_model = CLIPVisionModelWithProjection(config.vision_config) + + self.p_head = nn.Linear(config.vision_config.projection_dim, 1) + self.w_head = nn.Linear(config.vision_config.projection_dim, 1) + + @torch.no_grad() + def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5): + image_embeds = self.vision_model(clip_input)[0] + + nsfw_detected = self.p_head(image_embeds) + nsfw_detected = nsfw_detected.flatten() + nsfw_detected = nsfw_detected > p_threshold + nsfw_detected = nsfw_detected.tolist() + + if any(nsfw_detected): + logger.warning( + "Potential NSFW content was detected in one or more images. A black image will be returned instead." + " Try again with a different prompt and/or seed." + ) + + for idx, nsfw_detected_ in enumerate(nsfw_detected): + if nsfw_detected_: + images[idx] = np.zeros(images[idx].shape) + + watermark_detected = self.w_head(image_embeds) + watermark_detected = watermark_detected.flatten() + watermark_detected = watermark_detected > w_threshold + watermark_detected = watermark_detected.tolist() + + if any(watermark_detected): + logger.warning( + "Potential watermarked content was detected in one or more images. A black image will be returned instead." + " Try again with a different prompt and/or seed." 
+ ) + + for idx, watermark_detected_ in enumerate(watermark_detected): + if watermark_detected_: + images[idx] = np.zeros(images[idx].shape) + + return images, nsfw_detected, watermark_detected diff --git a/src/diffusers/pipelines/deepfloyd_if/timesteps.py b/src/diffusers/pipelines/deepfloyd_if/timesteps.py new file mode 100644 index 000000000000..d44285c017bb --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/timesteps.py @@ -0,0 +1,579 @@ +fast27_timesteps = [ + 999, + 800, + 799, + 600, + 599, + 500, + 400, + 399, + 377, + 355, + 333, + 311, + 288, + 266, + 244, + 222, + 200, + 199, + 177, + 155, + 133, + 111, + 88, + 66, + 44, + 22, + 0, +] + +smart27_timesteps = [ + 999, + 976, + 952, + 928, + 905, + 882, + 858, + 857, + 810, + 762, + 715, + 714, + 572, + 429, + 428, + 286, + 285, + 238, + 190, + 143, + 142, + 118, + 95, + 71, + 47, + 24, + 0, +] + +smart50_timesteps = [ + 999, + 988, + 977, + 966, + 955, + 944, + 933, + 922, + 911, + 900, + 899, + 879, + 859, + 840, + 820, + 800, + 799, + 766, + 733, + 700, + 699, + 650, + 600, + 599, + 500, + 499, + 400, + 399, + 350, + 300, + 299, + 266, + 233, + 200, + 199, + 179, + 159, + 140, + 120, + 100, + 99, + 88, + 77, + 66, + 55, + 44, + 33, + 22, + 11, + 0, +] + +smart100_timesteps = [ + 999, + 995, + 992, + 989, + 985, + 981, + 978, + 975, + 971, + 967, + 964, + 961, + 957, + 956, + 951, + 947, + 942, + 937, + 933, + 928, + 923, + 919, + 914, + 913, + 908, + 903, + 897, + 892, + 887, + 881, + 876, + 871, + 870, + 864, + 858, + 852, + 846, + 840, + 834, + 828, + 827, + 820, + 813, + 806, + 799, + 792, + 785, + 784, + 777, + 770, + 763, + 756, + 749, + 742, + 741, + 733, + 724, + 716, + 707, + 699, + 698, + 688, + 677, + 666, + 656, + 655, + 645, + 634, + 623, + 613, + 612, + 598, + 584, + 570, + 569, + 555, + 541, + 527, + 526, + 505, + 484, + 483, + 462, + 440, + 439, + 396, + 395, + 352, + 351, + 308, + 307, + 264, + 263, + 220, + 219, + 176, + 132, + 88, + 44, + 0, +] + +smart185_timesteps = [ + 999, + 997, + 995, + 992, + 990, + 988, + 986, + 984, + 981, + 979, + 977, + 975, + 972, + 970, + 968, + 966, + 964, + 961, + 959, + 957, + 956, + 954, + 951, + 949, + 946, + 944, + 941, + 939, + 936, + 934, + 931, + 929, + 926, + 924, + 921, + 919, + 916, + 914, + 913, + 910, + 907, + 905, + 902, + 899, + 896, + 893, + 891, + 888, + 885, + 882, + 879, + 877, + 874, + 871, + 870, + 867, + 864, + 861, + 858, + 855, + 852, + 849, + 846, + 843, + 840, + 837, + 834, + 831, + 828, + 827, + 824, + 821, + 817, + 814, + 811, + 808, + 804, + 801, + 798, + 795, + 791, + 788, + 785, + 784, + 780, + 777, + 774, + 770, + 766, + 763, + 760, + 756, + 752, + 749, + 746, + 742, + 741, + 737, + 733, + 730, + 726, + 722, + 718, + 714, + 710, + 707, + 703, + 699, + 698, + 694, + 690, + 685, + 681, + 677, + 673, + 669, + 664, + 660, + 656, + 655, + 650, + 646, + 641, + 636, + 632, + 627, + 622, + 618, + 613, + 612, + 607, + 602, + 596, + 591, + 586, + 580, + 575, + 570, + 569, + 563, + 557, + 551, + 545, + 539, + 533, + 527, + 526, + 519, + 512, + 505, + 498, + 491, + 484, + 483, + 474, + 466, + 457, + 449, + 440, + 439, + 428, + 418, + 407, + 396, + 395, + 381, + 366, + 352, + 351, + 330, + 308, + 307, + 286, + 264, + 263, + 242, + 220, + 219, + 176, + 175, + 132, + 131, + 88, + 44, + 0, +] + +super27_timesteps = [ + 999, + 991, + 982, + 974, + 966, + 958, + 950, + 941, + 933, + 925, + 916, + 908, + 900, + 899, + 874, + 850, + 825, + 800, + 799, + 700, + 600, + 500, + 400, + 300, + 200, + 100, + 0, +] + +super40_timesteps = [ + 999, + 992, + 985, + 978, + 
971, + 964, + 957, + 949, + 942, + 935, + 928, + 921, + 914, + 907, + 900, + 899, + 879, + 859, + 840, + 820, + 800, + 799, + 766, + 733, + 700, + 699, + 650, + 600, + 599, + 500, + 499, + 400, + 399, + 300, + 299, + 200, + 199, + 100, + 99, + 0, +] + +super100_timesteps = [ + 999, + 996, + 992, + 989, + 985, + 982, + 979, + 975, + 972, + 968, + 965, + 961, + 958, + 955, + 951, + 948, + 944, + 941, + 938, + 934, + 931, + 927, + 924, + 920, + 917, + 914, + 910, + 907, + 903, + 900, + 899, + 891, + 884, + 876, + 869, + 861, + 853, + 846, + 838, + 830, + 823, + 815, + 808, + 800, + 799, + 788, + 777, + 766, + 755, + 744, + 733, + 722, + 711, + 700, + 699, + 688, + 677, + 666, + 655, + 644, + 633, + 622, + 611, + 600, + 599, + 585, + 571, + 557, + 542, + 528, + 514, + 500, + 499, + 485, + 471, + 457, + 442, + 428, + 414, + 400, + 399, + 379, + 359, + 340, + 320, + 300, + 299, + 279, + 259, + 240, + 220, + 200, + 199, + 166, + 133, + 100, + 99, + 66, + 33, + 0, +] diff --git a/src/diffusers/pipelines/deepfloyd_if/watermark.py b/src/diffusers/pipelines/deepfloyd_if/watermark.py new file mode 100644 index 000000000000..db33dec0ef9a --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/watermark.py @@ -0,0 +1,46 @@ +from typing import List + +import PIL +import torch +from PIL import Image + +from ...configuration_utils import ConfigMixin +from ...models.modeling_utils import ModelMixin +from ...utils import PIL_INTERPOLATION + + +class IFWatermarker(ModelMixin, ConfigMixin): + def __init__(self): + super().__init__() + + self.register_buffer("watermark_image", torch.zeros((62, 62, 4))) + self.watermark_image_as_pil = None + + def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): + # copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287 + + h = images[0].height + w = images[0].width + + sample_size = sample_size or h + + coef = min(h / sample_size, w / sample_size) + img_h, img_w = (int(h / coef), int(w / coef)) if coef < 1 else (h, w) + + S1, S2 = 1024**2, img_w * img_h + K = (S2 / S1) ** 0.5 + wm_size, wm_x, wm_y = int(K * 62), img_w - int(14 * K), img_h - int(14 * K) + + if self.watermark_image_as_pil is None: + watermark_image = self.watermark_image.to(torch.uint8).cpu().numpy() + watermark_image = Image.fromarray(watermark_image, mode="RGBA") + self.watermark_image_as_pil = watermark_image + + wm_img = self.watermark_image_as_pil.resize( + (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None + ) + + for pil_img in images: + pil_img.paste(wm_img, box=(wm_x - wm_size, wm_y - wm_size, wm_x, wm_y), mask=wm_img.split()[-1]) + + return images diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 2d61f1a3700f..8c028b64a8c8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -30,7 +30,6 @@ import torch from huggingface_hub import hf_hub_download, model_info, snapshot_download from packaging import version -from PIL import Image from tqdm.auto import tqdm import diffusers @@ -56,6 +55,7 @@ is_torch_version, is_transformers_available, logging, + numpy_to_pil, ) @@ -623,7 +623,9 @@ def module_is_sequentially_offloaded(module): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): return False - return hasattr(module, "_hf_hook") and not isinstance(module._hf_hook, accelerate.hooks.CpuOffload) + return hasattr(module, "_hf_hook") and not isinstance( + module._hf_hook, 
(accelerate.hooks.CpuOffload, accelerate.hooks.AlignDevicesHook) + ) def module_is_offloaded(module): if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"): @@ -653,7 +655,20 @@ def module_is_offloaded(module): is_offloaded = pipeline_is_offloaded or pipeline_is_sequentially_offloaded for module in modules: - module.to(torch_device, torch_dtype) + is_loaded_in_8bit = hasattr(module, "is_loaded_in_8bit") and module.is_loaded_in_8bit + + if is_loaded_in_8bit and torch_dtype is not None: + logger.warning( + f"The module '{module.__class__.__name__}' has been loaded in 8bit and conversion to {torch_dtype} is not yet supported. Module is still in 8bit precision." + ) + + if is_loaded_in_8bit and torch_device is not None: + logger.warning( + f"The module '{module.__class__.__name__}' has been loaded in 8bit and moving it to {torch_dtype} via `.to()` is not yet supported. Module is still on {module.device}." + ) + else: + module.to(torch_device, torch_dtype) + if ( module.dtype == torch.float16 and str(torch_device) in ["cpu"] @@ -887,6 +902,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P config_dict = cls.load_config(cached_folder) + # pop out "_ignore_files" as it is only needed for download + config_dict.pop("_ignore_files", None) + # 2. Define which model components should load variants # We retrieve the information by matching whether variant # model checkpoints exist in the subfolders @@ -1204,12 +1222,19 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: ) config_dict = cls._dict_from_json_file(config_file) + + ignore_filenames = config_dict.pop("_ignore_files", []) + # retrieve all folder_names that contain relevant files folder_names = [k for k, v in config_dict.items() if isinstance(v, list)] filenames = {sibling.rfilename for sibling in info.siblings} model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant) + # remove ignored filenames + model_filenames = set(model_filenames) - set(ignore_filenames) + variant_filenames = set(variant_filenames) - set(ignore_filenames) + # if the whole pipeline is cached we don't have to ping the Hub if revision in DEPRECATED_REVISION_ARGS and version.parse( version.parse(__version__).base_version @@ -1370,16 +1395,7 @@ def numpy_to_pil(images): """ Convert a numpy image or a batch of images to a PIL image. """ - if images.ndim == 3: - images = images[None, ...] 
- images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images + return numpy_to_pil(images) def progress_bar(self, iterable=None, total=None): if not hasattr(self, "_progress_bar_config"): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py index 8db19c2b9109..56681391aeeb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py @@ -56,7 +56,18 @@ def __init__( scheduler: Any, max_noise_level: int = 350, ): - super().__init__(vae, text_encoder, tokenizer, unet, low_res_scheduler, scheduler, max_noise_level) + super().__init__( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + low_res_scheduler=low_res_scheduler, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + watermarker=None, + max_noise_level=max_noise_level, + ) def __call__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 693208b18cdd..45b26de284af 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -13,18 +13,19 @@ # limitations under the License. import inspect -from typing import Callable, List, Optional, Union +from typing import Any, Callable, List, Optional, Union import numpy as np import PIL import torch -from transformers import CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -76,6 +77,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
""" + _optional_components = ["watermarker", "safety_checker", "feature_extractor"] def __init__( self, @@ -85,12 +87,16 @@ def __init__( unet: UNet2DConditionModel, low_res_scheduler: DDPMScheduler, scheduler: KarrasDiffusionSchedulers, + safety_checker: Optional[Any] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, + watermarker: Optional[Any] = None, max_noise_level: int = 350, ): super().__init__() - if hasattr(vae, "config"): - # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate + if hasattr( + vae, "config" + ): # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate is_vae_scaling_factor_set_to_0_08333 = ( hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333 ) @@ -113,6 +119,9 @@ def __init__( unet=unet, low_res_scheduler=low_res_scheduler, scheduler=scheduler, + safety_checker=safety_checker, + watermarker=watermarker, + feature_extractor=feature_extractor, ) self.register_to_config(max_noise_level=max_noise_level) @@ -178,6 +187,23 @@ def _execution_device(self): return torch.device(module._hf_hook.execution_device) return self.device + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -678,10 +704,19 @@ def __call__( self.final_offload_hook.offload() # 11. Convert to PIL + # has_nsfw_concept = False if output_type == "pil": + image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) + image = self.numpy_to_pil(image) + # 11. 
Apply watermark + if self.watermarker is not None: + image = self.watermarker.apply_watermark(image) + else: + has_nsfw_concept = None + if not return_dict: - return (image,) + return (image, has_nsfw_concept) - return ImagePipelineOutput(images=image) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 2a7b80d01da7..57e1abc7315b 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -15,7 +15,7 @@ AttnProcessor, ) from ...models.dual_transformer_2d import DualTransformer2DModel -from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from ...models.embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps from ...models.transformer_2d import Transformer2DModel from ...models.unet_2d_condition import UNet2DConditionOutput from ...utils import logging @@ -183,11 +183,16 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to None): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. num_class_embeds (`int`, *optional*, defaults to None): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. time_embedding_type (`str`, *optional*, defaults to `positional`): The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. + time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. time_embedding_act_fn (`str`, *optional*, defaults to `None`): Optional activation function to use on the time embeddings only one time before they are passed to the rest of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`.
@@ -246,12 +251,14 @@ def __init__( dual_cross_attention: bool = False, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", resnet_skip_time_act: bool = False, resnet_out_scale_factor: int = 1.0, time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, time_embedding_act_fn: Optional[str] = None, timestep_post_act: Optional[str] = None, time_cond_proj_dim: Optional[int] = None, @@ -261,6 +268,7 @@ def __init__( class_embeddings_concat: bool = False, mid_block_only_cross_attention: Optional[bool] = None, cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, ): super().__init__() @@ -311,7 +319,7 @@ def __init__( # time if time_embedding_type == "fourier": - time_embed_dim = block_out_channels[0] * 2 + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 if time_embed_dim % 2 != 0: raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") self.time_proj = GaussianFourierProjection( @@ -319,7 +327,7 @@ def __init__( ) timestep_input_dim = time_embed_dim elif time_embedding_type == "positional": - time_embed_dim = block_out_channels[0] * 4 + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] @@ -370,6 +378,18 @@ def __init__( else: self.class_embedding = None + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") + if time_embedding_act_fn is None: self.time_embed_act = None elif time_embedding_act_fn == "swish": @@ -781,6 +801,10 @@ def forward( else: emb = emb + class_emb + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + emb = emb + aug_emb + if self.time_embed_act is not None: emb = self.time_embed_act(emb) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index c717d722f84c..1b8eca050c9e 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -44,6 +44,7 @@ http_user_agent, ) from .import_utils import ( + BACKENDS_MAPPING, ENV_VARS_TRUE_AND_AUTO_VALUES, ENV_VARS_TRUE_VALUES, USE_JAX, @@ -53,7 +54,9 @@ OptionalDependencyNotAvailable, is_accelerate_available, is_accelerate_version, + is_bs4_available, is_flax_available, + is_ftfy_available, is_inflect_available, is_k_diffusion_available, is_k_diffusion_version, @@ -76,7 +79,7 @@ ) from .logging import get_logger from .outputs import BaseOutput -from .pil_utils import PIL_INTERPOLATION +from .pil_utils import PIL_INTERPOLATION, numpy_to_pil, pt_to_pil from .torch_utils import is_compiled_module, randn_tensor diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index bda56d2ae8ae..bf4fe8d87ff9 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -62,6 +62,96 @@ def 
from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class IFImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFImg2ImgSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFInpaintingPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFInpaintingSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index fd7538b1b5e9..2d90cb9747a7 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -271,6 +271,23 @@ _compel_available = False + +_ftfy_available = importlib.util.find_spec("ftfy") is not None +try: + _ftfy_version = importlib_metadata.version("ftfy") + logger.debug(f"Successfully imported ftfy version {_ftfy_version}") +except importlib_metadata.PackageNotFoundError: + _ftfy_available = False + + +_bs4_available = importlib.util.find_spec("bs4") is not None +try: + # importlib metadata under different name + _bs4_version = importlib_metadata.version("beautifulsoup4") + logger.debug(f"Successfully imported beautifulsoup4 version {_bs4_version}") +except importlib_metadata.PackageNotFoundError: + _bs4_available = False + + + def
is_torch_available(): return _torch_available @@ -347,6 +364,14 @@ def is_compel_available(): return _compel_available +def is_ftfy_available(): + return _ftfy_available + + +def is_bs4_available(): + return _bs4_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -437,8 +462,23 @@ def is_compel_available(): {0} requires the compel library but it was not found in your environment. You can install it with pip: `pip install compel` """ +# docstyle-ignore +BS4_IMPORT_ERROR = """ +{0} requires the Beautiful Soup library but it was not found in your environment. You can install it with pip: +`pip install beautifulsoup4`. Please note that you may need to restart your runtime after installation. +""" + +# docstyle-ignore +FTFY_IMPORT_ERROR = """ +{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the +installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones +that match your environment. Please note that you may need to restart your runtime after installation. +""" + + BACKENDS_MAPPING = OrderedDict( [ + ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), ("flax", (is_flax_available, FLAX_IMPORT_ERROR)), ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), ("onnx", (is_onnx_available, ONNX_IMPORT_ERROR)), @@ -454,6 +494,7 @@ def is_compel_available(): ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), ("tensorboard", (_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), ("compel", (_compel_available, COMPEL_IMPORT_ERROR)), + ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), ] ) diff --git a/src/diffusers/utils/pil_utils.py b/src/diffusers/utils/pil_utils.py index 39d0a15a4e2f..ad76a32230fb 100644 --- a/src/diffusers/utils/pil_utils.py +++ b/src/diffusers/utils/pil_utils.py @@ -1,6 +1,7 @@ import PIL.Image import PIL.ImageOps from packaging import version +from PIL import Image if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): @@ -19,3 +20,26 @@ "lanczos": PIL.Image.LANCZOS, "nearest": PIL.Image.NEAREST, } + + +def pt_to_pil(images): + images = (images / 2 + 0.5).clamp(0, 1) + images = images.cpu().permute(0, 2, 3, 1).float().numpy() + images = numpy_to_pil(images) + return images + + +def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
+ images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images diff --git a/tests/pipelines/deepfloyd_if/__init__.py b/tests/pipelines/deepfloyd_if/__init__.py new file mode 100644 index 000000000000..094254a61875 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/__init__.py @@ -0,0 +1,272 @@ +import tempfile + +import numpy as np +import torch +from transformers import AutoTokenizer, T5EncoderModel + +from diffusers import DDPMScheduler, UNet2DConditionModel +from diffusers.models.attention_processor import AttnAddedKVProcessor +from diffusers.pipelines.deepfloyd_if import IFWatermarker +from diffusers.utils.testing_utils import torch_device + +from ..test_pipelines_common import to_np + + +# WARN: the hf-internal-testing/tiny-random-t5 text encoder has some non-determinism in the `save_load` tests. + + +class IFPipelineTesterMixin: + def _get_dummy_components(self): + torch.manual_seed(0) + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + unet = UNet2DConditionModel( + sample_size=32, + layers_per_block=1, + block_out_channels=[32, 64], + down_block_types=[ + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + ], + mid_block_type="UNetMidBlock2DSimpleCrossAttn", + up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], + in_channels=3, + out_channels=6, + cross_attention_dim=32, + encoder_hid_dim=32, + attention_head_dim=8, + addition_embed_type="text", + addition_embed_type_num_heads=2, + cross_attention_norm="group_norm", + resnet_time_scale_shift="scale_shift", + act_fn="gelu", + ) + unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests + + torch.manual_seed(0) + scheduler = DDPMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + beta_start=0.0001, + beta_end=0.02, + thresholding=True, + dynamic_thresholding_ratio=0.95, + sample_max_value=1.0, + prediction_type="epsilon", + variance_type="learned_range", + ) + + torch.manual_seed(0) + watermarker = IFWatermarker() + + return { + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "watermarker": watermarker, + "safety_checker": None, + "feature_extractor": None, + } + + def _get_superresolution_dummy_components(self): + torch.manual_seed(0) + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + unet = UNet2DConditionModel( + sample_size=32, + layers_per_block=[1, 2], + block_out_channels=[32, 64], + down_block_types=[ + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + ], + mid_block_type="UNetMidBlock2DSimpleCrossAttn", + up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], + in_channels=6, + out_channels=6, + cross_attention_dim=32, + encoder_hid_dim=32, + attention_head_dim=8, + addition_embed_type="text", + addition_embed_type_num_heads=2, + cross_attention_norm="group_norm", + resnet_time_scale_shift="scale_shift", + act_fn="gelu", + class_embed_type="timestep", + mid_block_scale_factor=1.414, + time_embedding_act_fn="gelu", 
+ time_embedding_dim=32, + ) + unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests + + torch.manual_seed(0) + scheduler = DDPMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + beta_start=0.0001, + beta_end=0.02, + thresholding=True, + dynamic_thresholding_ratio=0.95, + sample_max_value=1.0, + prediction_type="epsilon", + variance_type="learned_range", + ) + + torch.manual_seed(0) + image_noising_scheduler = DDPMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + beta_start=0.0001, + beta_end=0.02, + ) + + torch.manual_seed(0) + watermarker = IFWatermarker() + + return { + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "image_noising_scheduler": image_noising_scheduler, + "watermarker": watermarker, + "safety_checker": None, + "feature_extractor": None, + } + + # this test is modified from the base class because if pipelines set the text encoder + # as optional with the intention that the user is allowed to encode the prompt once + # and then pass the embeddings directly to the pipeline. The base class test uses + # the unmodified arguments from `self.get_dummy_inputs` which will pass the unencoded + # prompt to the pipeline when the text encoder is set to None, throwing an error. + # So we make the test reflect the intended usage of setting the text encoder to None. + def _test_save_load_optional_components(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + + prompt = inputs["prompt"] + generator = inputs["generator"] + num_inference_steps = inputs["num_inference_steps"] + output_type = inputs["output_type"] + + if "image" in inputs: + image = inputs["image"] + else: + image = None + + if "mask_image" in inputs: + mask_image = inputs["mask_image"] + else: + mask_image = None + + if "original_image" in inputs: + original_image = inputs["original_image"] + else: + original_image = None + + prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(prompt) + + # inputs with prompt converted to embeddings + inputs = { + "prompt_embeds": prompt_embeds, + "negative_prompt_embeds": negative_prompt_embeds, + "generator": generator, + "num_inference_steps": num_inference_steps, + "output_type": output_type, + } + + if image is not None: + inputs["image"] = image + + if mask_image is not None: + inputs["mask_image"] = mask_image + + if original_image is not None: + inputs["original_image"] = original_image + + # set all optional components to None + for optional_component in pipe._optional_components: + setattr(pipe, optional_component, None) + + output = pipe(**inputs)[0] + + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) + pipe_loaded.to(torch_device) + pipe_loaded.set_progress_bar_config(disable=None) + + pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests + + for optional_component in pipe._optional_components: + self.assertTrue( + getattr(pipe_loaded, optional_component) is None, + f"`{optional_component}` did not stay set to None after loading.", + ) + + inputs = self.get_dummy_inputs(torch_device) + + generator = inputs["generator"] + num_inference_steps = inputs["num_inference_steps"] + output_type = inputs["output_type"] + + # inputs with prompt converted to embeddings + inputs = 
{ + "prompt_embeds": prompt_embeds, + "negative_prompt_embeds": negative_prompt_embeds, + "generator": generator, + "num_inference_steps": num_inference_steps, + "output_type": output_type, + } + + if image is not None: + inputs["image"] = image + + if mask_image is not None: + inputs["mask_image"] = mask_image + + if original_image is not None: + inputs["original_image"] = original_image + + output_loaded = pipe_loaded(**inputs)[0] + + max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() + self.assertLess(max_diff, 1e-4) + + # Modified from `PipelineTesterMixin` to set the attn processor as it's not serialized. + # This should be handled in the base test and then this method can be removed. + def _test_save_load_local(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + output = pipe(**inputs)[0] + + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) + pipe_loaded.to(torch_device) + pipe_loaded.set_progress_bar_config(disable=None) + + pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests + + inputs = self.get_dummy_inputs(torch_device) + output_loaded = pipe_loaded(**inputs)[0] + + max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() + self.assertLess(max_diff, 1e-4) diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py new file mode 100644 index 000000000000..e2204cb601a6 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -0,0 +1,340 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import torch + +from diffusers import ( + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, +) +from diffusers.models.attention_processor import AttnAddedKVProcessor +from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFPipeline + params = TEXT_TO_IMAGE_PARAMS - {"width", "height", "latents"} + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) + + +@slow +@require_torch_gpu +class IFPipelineSlowTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_all(self): + # if + + pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + + pipe_2 = IFSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16, text_encoder=None, tokenizer=None + ) + + # pre compute text embeddings and remove T5 to save memory + + pipe_1.text_encoder.to("cuda") + + prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle", device="cuda") + + del pipe_1.tokenizer + del pipe_1.text_encoder + gc.collect() + + pipe_1.tokenizer = None + pipe_1.text_encoder = None + + pipe_1.enable_model_cpu_offload() + pipe_2.enable_model_cpu_offload() + + pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) + + self._test_if(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) + + pipe_1.remove_all_hooks() + pipe_2.remove_all_hooks() + + # img2img + + pipe_1 = IFImg2ImgPipeline(**pipe_1.components) + pipe_2 = IFImg2ImgSuperResolutionPipeline(**pipe_2.components) + + pipe_1.enable_model_cpu_offload() + pipe_2.enable_model_cpu_offload() + + pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) + + self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) + + pipe_1.remove_all_hooks() + pipe_2.remove_all_hooks() + + # inpainting + + pipe_1 = IFInpaintingPipeline(**pipe_1.components) + pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components) + + pipe_1.enable_model_cpu_offload() + pipe_2.enable_model_cpu_offload() + + pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) + + self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) + + def _test_if(self, 
pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): + # pipeline 1 + + _start_torch_memory_measurement() + + generator = torch.Generator(device="cpu").manual_seed(0) + output = pipe_1( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + num_inference_steps=2, + generator=generator, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (64, 64, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 13 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + # pipeline 2 + + _start_torch_memory_measurement() + + generator = torch.Generator(device="cpu").manual_seed(0) + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + + output = pipe_2( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 4 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): + # pipeline 1 + + _start_torch_memory_measurement() + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + + generator = torch.Generator(device="cpu").manual_seed(0) + + output = pipe_1( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=image, + num_inference_steps=2, + generator=generator, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (64, 64, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 10 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + # pipeline 2 + + _start_torch_memory_measurement() + + generator = torch.Generator(device="cpu").manual_seed(0) + + original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device) + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + + output = pipe_2( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=image, + original_image=original_image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 4 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): + # pipeline 1 + + _start_torch_memory_measurement() + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device) + + generator = torch.Generator(device="cpu").manual_seed(0) + output = pipe_1( + prompt_embeds=prompt_embeds, 
+ negative_prompt_embeds=negative_prompt_embeds, + image=image, + mask_image=mask_image, + num_inference_steps=2, + generator=generator, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (64, 64, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 10 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + # pipeline 2 + + _start_torch_memory_measurement() + + generator = torch.Generator(device="cpu").manual_seed(0) + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device) + mask_image = floats_tensor((1, 3, 256, 256), rng=random.Random(1)).to(torch_device) + + output = pipe_2( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=image, + mask_image=mask_image, + original_image=original_image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 4 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + +def _start_torch_memory_measurement(): + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py new file mode 100644 index 000000000000..b4c99a8ab93a --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFImg2ImgPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFImg2ImgPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_float16_inference(self): + self._test_float16_inference(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py new file mode 100644 index 000000000000..626ab321f895 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFImg2ImgSuperResolutionPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFImg2ImgSuperResolutionPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"original_image"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_superresolution_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + original_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = floats_tensor((1, 3, 16, 16), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "original_image": original_image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py new file mode 100644 index 000000000000..37d818c7a910 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -0,0 +1,82 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFInpaintingPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFInpaintingPipeline + params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + mask_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "mask_image": mask_image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py new file mode 100644 index 000000000000..30062cb2f8d0 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFInpaintingSuperResolutionPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFInpaintingSuperResolutionPipeline + params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union({"original_image"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_superresolution_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 16, 16), rng=random.Random(seed)).to(device) + original_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + mask_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "original_image": original_image, + "mask_image": mask_image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py new file mode 100644 index 000000000000..14acfa5415c2 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFSuperResolutionPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFSuperResolutionPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_superresolution_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 8fb79f0c4057..168ff8106c52 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -575,6 +575,19 @@ def test_text_inversion_download(self): out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) + def test_download_ignore_files(self): + # Check https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files/blob/72f58636e5508a218c6b3f60550dc96445547817/model_index.json#L4 + with tempfile.TemporaryDirectory() as tmpdirname: + # pipeline has Flax weights + tmpdirname = DiffusionPipeline.download("hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files") + all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] + files = [item for sublist in all_root_files for item in sublist] + + # None of the downloaded files should be a pytorch file even if we have some here: + # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack + assert not any(f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"] for f in files) + assert len(files) == 14 + class CustomPipelineTests(unittest.TestCase): def test_load_custom_pipeline(self): diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index d0712bdec8f6..0278092282ba 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -339,6 +339,9 @@ def test_components_function(self): @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") def test_float16_inference(self): + self._test_float16_inference() + + def _test_float16_inference(self, expected_max_diff=1e-2): components = self.get_dummy_components() pipe = self.pipeline_class(**components) 
pipe.to(torch_device) @@ -352,10 +355,13 @@ def test_float16_inference(self): output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0] max_diff = np.abs(to_np(output) - to_np(output_fp16)).max() - self.assertLess(max_diff, 1e-2, "The outputs of the fp16 and fp32 pipelines are too different.") + self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.") @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") def test_save_load_float16(self): + self._test_save_load_float16() + + def _test_save_load_float16(self, expected_max_diff=1e-2): components = self.get_dummy_components() for name, module in components.items(): if hasattr(module, "half"): @@ -384,7 +390,9 @@ def test_save_load_float16(self): output_loaded = pipe_loaded(**inputs)[0] max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-2, "The output of the fp16 pipeline changed after saving and loading.") + self.assertLess( + max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading." + ) def test_save_load_optional_components(self): if not hasattr(self.pipeline_class, "_optional_components"): From da2ce1a6b92f48cabe9e9d3944c4ee8b007b2871 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 26 Apr 2023 10:34:34 +0200 Subject: [PATCH 62/71] Allow return pt x4 (#3236) * Add all files * update --- .../pipeline_stable_diffusion_upscale.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 45b26de284af..14e5c4ab7cd1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -697,15 +697,11 @@ def __call__( # 10. Post-processing # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) - image = self.decode_latents(latents.float()) - - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() # 11. Convert to PIL # has_nsfw_concept = False if output_type == "pil": + image = self.decode_latents(latents.float()) image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) image = self.numpy_to_pil(image) @@ -713,9 +709,18 @@ def __call__( # 11. 
Apply watermark if self.watermarker is not None: image = self.watermarker.apply_watermark(image) + elif output_type == "pt": + latents = 1 / self.vae.config.scaling_factor * latents.float() + image = self.vae.decode(latents).sample + has_nsfw_concept = None else: + image = self.decode_latents(latents.float()) has_nsfw_concept = None + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + if not return_dict: return (image, has_nsfw_concept) From abbf3c1adf6bbf424ae82e640647ad4078e1a4b9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 26 Apr 2023 12:16:06 +0200 Subject: [PATCH 63/71] Allow fp16 attn for x4 upscaler (#3239) * Add all files * update * Make sure vae is memory efficient for PT 1 * make style --- src/diffusers/models/vae.py | 3 +++ .../pipeline_stable_diffusion_upscale.py | 21 +++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index b4484823ac3d..400c3030af90 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -212,6 +212,7 @@ def forward(self, z): sample = z sample = self.conv_in(sample) + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype if self.training and self.gradient_checkpointing: def create_custom_forward(module): @@ -222,6 +223,7 @@ def custom_forward(*inputs): # middle sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) + sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: @@ -229,6 +231,7 @@ def custom_forward(*inputs): else: # middle sample = self.mid_block(sample) + sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 14e5c4ab7cd1..87014f52dfc2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -18,6 +18,7 @@ import numpy as np import PIL import torch +import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin @@ -698,10 +699,22 @@ def __call__( # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) + # TODO(Patrick, William) - clean up when attention is refactored + use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") + use_xformers = self.vae.decoder.mid_block.attentions[0]._use_memory_efficient_attention_xformers + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if not use_torch_2_0_attn and not use_xformers: + self.vae.post_quant_conv.to(latents.dtype) + self.vae.decoder.conv_in.to(latents.dtype) + self.vae.decoder.mid_block.to(latents.dtype) + else: + latents = latents.float() + # 11. 
Convert to PIL - # has_nsfw_concept = False if output_type == "pil": - image = self.decode_latents(latents.float()) + image = self.decode_latents(latents) + image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) image = self.numpy_to_pil(image) @@ -710,11 +723,11 @@ def __call__( if self.watermarker is not None: image = self.watermarker.apply_watermark(image) elif output_type == "pt": - latents = 1 / self.vae.config.scaling_factor * latents.float() + latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample has_nsfw_concept = None else: - image = self.decode_latents(latents.float()) + image = self.decode_latents(latents) has_nsfw_concept = None # Offload last model to CPU From 744663f8dc110c03e10157e92175b8187cf64d59 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 26 Apr 2023 12:44:19 +0200 Subject: [PATCH 64/71] fix fast test (#3241) --- tests/pipelines/unclip/test_unclip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index d2c699ea501d..5c9181c08e3f 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -358,7 +358,7 @@ class DummyScheduler: def test_attention_slicing_forward_pass(self): test_max_difference = torch_device == "cpu" - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) + self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01) # Overriding PipelineTesterMixin::test_inference_batch_single_identical # because UnCLIP undeterminism requires a looser check. From 977162c02b753d088433ec1634e448df8741fb7c Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 26 Apr 2023 16:25:48 +0530 Subject: [PATCH 65/71] Adds a document on token merging (#3208) * add document on token merging. * fix headline. * fix: headline. * add some samples for comparison. --- docs/source/en/_toctree.yml | 2 + docs/source/en/optimization/tome.mdx | 116 +++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 docs/source/en/optimization/tome.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index cc880f3e0b81..ccaaff7ca680 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -105,6 +105,8 @@ title: MPS - local: optimization/habana title: Habana Gaudi + - local: optimization/tome + title: Token Merging title: Optimization/Special Hardware - sections: - local: conceptual/philosophy diff --git a/docs/source/en/optimization/tome.mdx b/docs/source/en/optimization/tome.mdx new file mode 100644 index 000000000000..c2158f539a65 --- /dev/null +++ b/docs/source/en/optimization/tome.mdx @@ -0,0 +1,116 @@ + + +# Token Merging + +Token Merging (introduced in [Token Merging: Your ViT But Faster](https://arxiv.org/abs/2210.09461)) works by merging the redundant tokens / patches progressively in the forward pass of a Transformer-based network. It can speed up the inference latency of the underlying network. + +After Token Merging (ToMe) was released, the authors released [Token Merging for Fast Stable Diffusion](https://arxiv.org/abs/2303.17604), which introduced a version of ToMe which is more compatible with Stable Diffusion. We can use ToMe to gracefully speed up the inference latency of a [`DiffusionPipeline`]. 
This doc discusses how to apply ToMe to the [`StableDiffusionPipeline`], the expected speedups, and the qualitative aspects of using ToMe on the [`StableDiffusionPipeline`]. + +## Using ToMe + +The authors of ToMe released a convenient Python library called [`tomesd`](https://github.com/dbolya/tomesd) that lets us apply ToMe to a [`DiffusionPipeline`] like so: + +```diff +from diffusers import StableDiffusionPipeline +import tomesd + +pipeline = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 +).to("cuda") ++ tomesd.apply_patch(pipeline, ratio=0.5) + +image = pipeline("a photo of an astronaut riding a horse on mars").images[0] +``` + +And that’s it! + +`tomesd.apply_patch()` exposes [a number of arguments](https://github.com/dbolya/tomesd#usage) to let us strike a balance between the pipeline inference speed and the quality of the generated tokens. Amongst those arguments, the most important one is `ratio`. `ratio` controls the number of tokens that will be merged during the forward pass. For more details on `tomesd`, please refer to the original repository https://github.com/dbolya/tomesd and [the paper](https://arxiv.org/abs/2303.17604). + +## Benchmarking `tomesd` with `StableDiffusionPipeline` + +We benchmarked the impact of using `tomesd` on [`StableDiffusionPipeline`] along with [xformers](https://huggingface.co/docs/diffusers/optimization/xformers) across different image resolutions. We used A100 and V100 as our test GPU devices with the following development environment (with Python 3.8.5): + +```bash +- `diffusers` version: 0.15.1 +- Python version: 3.8.16 +- PyTorch version (GPU?): 1.13.1+cu116 (True) +- Huggingface_hub version: 0.13.2 +- Transformers version: 4.27.2 +- Accelerate version: 0.18.0 +- xFormers version: 0.0.16 +- tomesd version: 0.1.2 +``` + +We used this script for benchmarking: [https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335](https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335). Following are our findings: + +### A100 + +| Resolution | Batch size | Vanilla | ToMe | ToMe + xFormers | ToMe speedup (%) | ToMe + xFormers speedup (%) | +| --- | --- | --- | --- | --- | --- | --- | +| 512 | 10 | 6.88 | 5.26 | 4.69 | 23.54651163 | 31.83139535 | +| | | | | | | | +| 768 | 10 | OOM | 14.71 | 11 | | | +| | 8 | OOM | 11.56 | 8.84 | | | +| | 4 | OOM | 5.98 | 4.66 | | | +| | 2 | 4.99 | 3.24 | 3.1 | 35.07014028 | 37.8757515 | +| | 1 | 3.29 | 2.24 | 2.03 | 31.91489362 | 38.29787234 | +| | | | | | | | +| 1024 | 10 | OOM | OOM | OOM | | | +| | 8 | OOM | OOM | OOM | | | +| | 4 | OOM | 12.51 | 9.09 | | | +| | 2 | OOM | 6.52 | 4.96 | | | +| | 1 | 6.4 | 3.61 | 2.81 | 43.59375 | 56.09375 | + +***The timings reported here are in seconds. 
Speedups are calculated over the `Vanilla` timings.*** + +### V100 + +| Resolution | Batch size | Vanilla | ToMe | ToMe + xFormers | ToMe speedup (%) | ToMe + xFormers speedup (%) | +| --- | --- | --- | --- | --- | --- | --- | +| 512 | 10 | OOM | 10.03 | 9.29 | | | +| | 8 | OOM | 8.05 | 7.47 | | | +| | 4 | 5.7 | 4.3 | 3.98 | 24.56140351 | 30.1754386 | +| | 2 | 3.14 | 2.43 | 2.27 | 22.61146497 | 27.70700637 | +| | 1 | 1.88 | 1.57 | 1.57 | 16.4893617 | 16.4893617 | +| | | | | | | | +| 768 | 10 | OOM | OOM | 23.67 | | | +| | 8 | OOM | OOM | 18.81 | | | +| | 4 | OOM | 11.81 | 9.7 | | | +| | 2 | OOM | 6.27 | 5.2 | | | +| | 1 | 5.43 | 3.38 | 2.82 | 37.75322284 | 48.06629834 | +| | | | | | | | +| 1024 | 10 | OOM | OOM | OOM | | | +| | 8 | OOM | OOM | OOM | | | +| | 4 | OOM | OOM | 19.35 | | | +| | 2 | OOM | 13 | 10.78 | | | +| | 1 | OOM | 6.66 | 5.54 | | | + +As seen in the tables above, the speedup with `tomesd` becomes more pronounced for larger image resolutions. It is also interesting to note that with `tomesd`, it becomes possible to run the pipeline on a higher resolution, like 1024x1024. + +It might be possible to speed up inference even further with [`torch.compile()`](https://huggingface.co/docs/diffusers/optimization/torch2.0). + +## Quality + +As reported in [the paper](https://arxiv.org/abs/2303.17604), ToMe can preserve the quality of the generated images to a great extent while speeding up inference. By increasing the `ratio`, it is possible to further speed up inference, but that might come at the cost of a deterioration in the image quality. + +To test the quality of the generated samples using our setup, we sampled a few prompts from the “Parti Prompts” (introduced in [Parti](https://parti.research.google/)) and performed inference with the [`StableDiffusionPipeline`] in the following settings: + +- Vanilla [`StableDiffusionPipeline`] +- [`StableDiffusionPipeline`] + ToMe +- [`StableDiffusionPipeline`] + ToMe + xformers + +We didn’t notice any significant decrease in the quality of the generated samples. Here are samples: + +![tome-samples](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/tome/tome_samples.png) + +You can check out the generated samples [here](https://wandb.ai/sayakpaul/tomesd-results/runs/23j4bj3i?workspace=). We used [this script](https://gist.github.com/sayakpaul/8cac98d7f22399085a060992f411ecbd) for conducting this experiment. 
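If you want to run a quick side-by-side check of your own, below is a minimal sketch of the three settings compared above. The checkpoint, prompt, `ratio`, and seed are illustrative placeholders rather than the exact configuration we used (that lives in the linked script):

```python
import torch
import tomesd
from diffusers import StableDiffusionPipeline

prompt = "a photo of an astronaut riding a horse on mars"  # placeholder prompt
images = {}

for setting in ["vanilla", "tome", "tome_xformers"]:
    # Reload the pipeline so every run starts from an unpatched model.
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")
    if setting != "vanilla":
        tomesd.apply_patch(pipe, ratio=0.5)
    if setting == "tome_xformers":
        pipe.enable_xformers_memory_efficient_attention()  # requires xformers to be installed

    # Fix the seed so differences come from the setting, not from the sampled noise.
    generator = torch.Generator(device="cuda").manual_seed(0)
    images[setting] = pipe(prompt, generator=generator).images[0]
    del pipe
    torch.cuda.empty_cache()

for name, image in images.items():
    image.save(f"{name}.png")
```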
\ No newline at end of file From 46ceba5b350bbf3d9272e9614f17e5edbeb0e1ef Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Wed, 26 Apr 2023 11:33:08 +0000 Subject: [PATCH 66/71] [AudioLDM] Update docs to use updated ckpt (#3240) * [AudioLDM] Update docs to use updated ckpt * make style --- docs/source/en/api/pipelines/audioldm.mdx | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx index f3987d2263ac..25a5bb8bce13 100644 --- a/docs/source/en/api/pipelines/audioldm.mdx +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -25,14 +25,14 @@ This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit ## Text-to-Audio -The [`AudioLDMPipeline`] can be used to load pre-trained weights from [cvssp/audioldm](https://huggingface.co/cvssp/audioldm) and generate text-conditional audio outputs: +The [`AudioLDMPipeline`] can be used to load pre-trained weights from [cvssp/audioldm-s-full-v2](https://huggingface.co/cvssp/audioldm-s-full-v2) and generate text-conditional audio outputs: ```python from diffusers import AudioLDMPipeline import torch import scipy -repo_id = "cvssp/audioldm" +repo_id = "cvssp/audioldm-s-full-v2" pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16) pipe = pipe.to("cuda") @@ -56,7 +56,7 @@ Inference: ### How to load and use different schedulers The AudioLDM pipeline uses [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers -that can be used with the AudioLDM pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], +that can be used with the AudioLDM pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc. We recommend using the [`DPMSolverMultistepScheduler`] as it's currently the fastest scheduler there is. @@ -68,12 +68,14 @@ method, or pass the `scheduler` argument to the `from_pretrained` method of the >>> from diffusers import AudioLDMPipeline, DPMSolverMultistepScheduler >>> import torch ->>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) +>>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm-s-full-v2", torch_dtype=torch.float16) >>> pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) >>> # or ->>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained("cvssp/audioldm", subfolder="scheduler") ->>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm", scheduler=dpm_scheduler, torch_dtype=torch.float16) +>>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained("cvssp/audioldm-s-full-v2", subfolder="scheduler") +>>> pipeline = AudioLDMPipeline.from_pretrained( +... "cvssp/audioldm-s-full-v2", scheduler=dpm_scheduler, torch_dtype=torch.float16 +... 
) ``` ## AudioLDMPipeline From 6ba0efb9a188b08f5b46565a87c0b3da7ff46af4 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 26 Apr 2023 13:35:01 +0200 Subject: [PATCH 67/71] Release: v0.16.0 --- examples/controlnet/train_controlnet.py | 2 +- examples/controlnet/train_controlnet_flax.py | 2 +- examples/custom_diffusion/train_custom_diffusion.py | 2 +- examples/dreambooth/train_dreambooth.py | 2 +- examples/dreambooth/train_dreambooth_flax.py | 2 +- examples/dreambooth/train_dreambooth_lora.py | 2 +- examples/instruct_pix2pix/train_instruct_pix2pix.py | 2 +- examples/text_to_image/train_text_to_image.py | 2 +- examples/text_to_image/train_text_to_image_flax.py | 2 +- examples/text_to_image/train_text_to_image_lora.py | 2 +- examples/textual_inversion/textual_inversion.py | 2 +- examples/textual_inversion/textual_inversion_flax.py | 2 +- examples/unconditional_image_generation/train_unconditional.py | 2 +- setup.py | 2 +- src/diffusers/__init__.py | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index d52e610ca52d..9b9ba5ab737f 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -55,7 +55,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index b25f9325403f..aff361cb6e01 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -59,7 +59,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = logging.getLogger(__name__) diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py index 49b05e6b5db3..0954f3d6e789 100644 --- a/examples/custom_diffusion/train_custom_diffusion.py +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -56,7 +56,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 4f731aa1f776..a9449002ca80 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -56,7 +56,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py index 8583f64c6fbd..1a4ca9153c80 100644 --- a/examples/dreambooth/train_dreambooth_flax.py +++ b/examples/dreambooth/train_dreambooth_flax.py @@ -36,7 +36,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") # Cache compiled models across invocations of this script. 
cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache")) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 367a3422de33..805a8d1eea4d 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -55,7 +55,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index 155c370614dc..dc5a1c3081c0 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -51,7 +51,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 67724698c099..1d6db2a6f1da 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -50,7 +50,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index d44731896c1d..c5dc71f0536e 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -33,7 +33,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = logging.getLogger(__name__) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 8dfd96904bd0..39bdb4e59a52 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -47,7 +47,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index d7be58bdb9ba..824759cc4ca9 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -77,7 +77,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py index 1d77753791f9..19553ceb92ec 100644 --- a/examples/textual_inversion/textual_inversion_flax.py +++ b/examples/textual_inversion/textual_inversion_flax.py @@ -56,7 +56,7 @@ # ------------------------------------------------------------------------------ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = logging.getLogger(__name__) diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index c004acc2d850..836a38f96286 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -28,7 +28,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__, log_level="INFO") diff --git a/setup.py b/setup.py index 19cc1dca73bb..ea98b5d10277 100644 --- a/setup.py +++ b/setup.py @@ -226,7 +226,7 @@ def run(self): setup( name="diffusers", - version="0.16.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="0.16.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="Diffusers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index e9d12bdb7cca..d4dbf1145072 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.16.0.dev0" +__version__ = "0.16.0" from .configuration_utils import ConfigMixin from .utils import ( From 9c876a5915fe6621a3d21d7d9146f58be0a8610e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?apolin=C3=A1rio?= Date: Thu, 27 Apr 2023 15:26:58 +0200 Subject: [PATCH 68/71] merge conflict --- docs/source/en/api/pipelines/if.mdx | 34 +++++++++---------- .../pipelines/deepfloyd_if/pipeline_if.py | 2 +- .../deepfloyd_if/pipeline_if_img2img.py | 2 +- .../pipeline_if_img2img_superresolution.py | 2 +- .../deepfloyd_if/pipeline_if_inpainting.py | 2 +- .../pipeline_if_inpainting_superresolution.py | 2 +- .../pipeline_if_superresolution.py | 2 +- tests/pipelines/deepfloyd_if/test_if.py | 2 +- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/source/en/api/pipelines/if.mdx b/docs/source/en/api/pipelines/if.mdx index 5d3b292587f6..d79c7035fb75 100644 --- a/docs/source/en/api/pipelines/if.mdx +++ b/docs/source/en/api/pipelines/if.mdx @@ -28,8 +28,8 @@ Our work underscores the potential of larger UNet architectures in the first sta ## Usage Before you can use IF, you need to accept its usage conditions. To do so: -1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be loggin in -2. Accept the license on the model card of [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0) and [DeepFloyd/IF-II-L-v1.0](https://huggingface.co/DeepFloyd/IF-II-L-v1.0) +1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be logged in +2. Accept the license on the model card of [DeepFloyd/IF-I-XL-v1.0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0). Accepting the license on the stage I model card will auto accept for the other IF models. 3. Make sure to login locally. Install `huggingface_hub` ```sh pip install huggingface_hub --upgrade @@ -62,7 +62,7 @@ The following sections give more in-detail examples of how to use IF. 
Specifical **Available checkpoints** - *Stage-1* - - [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0) + - [DeepFloyd/IF-I-XL-v1.0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0) - [DeepFloyd/IF-I-L-v1.0](https://huggingface.co/DeepFloyd/IF-I-L-v1.0) - [DeepFloyd/IF-I-M-v1.0](https://huggingface.co/DeepFloyd/IF-I-M-v1.0) @@ -90,7 +90,7 @@ from diffusers.utils import pt_to_pil import torch # stage 1 -stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) stage_1.enable_model_cpu_offload() # stage 2 @@ -162,7 +162,7 @@ original_image = Image.open(BytesIO(response.content)).convert("RGB") original_image = original_image.resize((768, 512)) # stage 1 -stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) stage_1.enable_model_cpu_offload() # stage 2 @@ -244,7 +244,7 @@ mask_image = Image.open(BytesIO(response.content)) mask_image = mask_image # stage 1 -stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) stage_1.enable_model_cpu_offload() # stage 2 @@ -305,7 +305,7 @@ In addition to being loaded with `from_pretrained`, Pipelines can also be loaded ```python from diffusers import IFPipeline, IFSuperResolutionPipeline -pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0") +pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0") pipe_2 = IFSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0") @@ -326,7 +326,7 @@ pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components) The simplest optimization to run IF faster is to move all model components to the GPU. ```py -pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.to("cuda") ``` @@ -352,7 +352,7 @@ the input image which also determines how many steps to run in the denoising pro A smaller number will vary the image less but run faster. ```py -pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.to("cuda") image = pipe(image=image, prompt="", strength=0.3).images @@ -364,7 +364,7 @@ with IF and it might not give expected results. 
```py import torch -pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.to("cuda") pipe.text_encoder = torch.compile(pipe.text_encoder) @@ -378,14 +378,14 @@ When optimizing for GPU memory, we can use the standard diffusers cpu offloading Either the model based CPU offloading, ```py -pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.enable_model_cpu_offload() ``` or the more aggressive layer based CPU offloading. ```py -pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.enable_sequential_cpu_offload() ``` @@ -395,13 +395,13 @@ Additionally, T5 can be loaded in 8bit precision from transformers import T5EncoderModel text_encoder = T5EncoderModel.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" + "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" ) from diffusers import DiffusionPipeline pipe = DiffusionPipeline.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", + "DeepFloyd/IF-I-XL-v1.0", text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder unet=None, device_map="auto", @@ -422,13 +422,13 @@ from transformers import T5EncoderModel from diffusers.utils import pt_to_pil text_encoder = T5EncoderModel.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" + "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" ) # text to image pipe = DiffusionPipeline.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", + "DeepFloyd/IF-I-XL-v1.0", text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder unet=None, device_map="auto", @@ -444,7 +444,7 @@ gc.collect() torch.cuda.empty_cache() pipe = IFPipeline.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto" + "DeepFloyd/IF-I-XL-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto" ) generator = torch.Generator().manual_seed(0) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py index a76e51a3ffe9..479ffa9e6635 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -41,7 +41,7 @@ >>> from diffusers.utils import pt_to_pil >>> import torch - >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) >>> pipe.enable_model_cpu_offload() >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index a31748450d4b..fac4adeea463 100644 --- 
a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -70,7 +70,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: >>> original_image = original_image.resize((768, 512)) >>> pipe = IFImg2ImgPipeline.from_pretrained( - ... "DeepFloyd/IF-I-IF-v1.0", + ... "DeepFloyd/IF-I-XL-v1.0", ... variant="fp16", ... torch_dtype=torch.float16, ... ) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 21e280654cf5..eed1bb43e5d8 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -73,7 +73,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: >>> original_image = original_image.resize((768, 512)) >>> pipe = IFImg2ImgPipeline.from_pretrained( - ... "DeepFloyd/IF-I-IF-v1.0", + ... "DeepFloyd/IF-I-XL-v1.0", ... variant="fp16", ... torch_dtype=torch.float16, ... ) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index 95eba1cc7d24..d3651f5169c1 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -76,7 +76,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: >>> mask_image = mask_image >>> pipe = IFInpaintingPipeline.from_pretrained( - ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16 + ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16 ... ) >>> pipe.enable_model_cpu_offload() diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index 4eb0bf300fa5..5ea6a47082ae 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -78,7 +78,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: >>> mask_image = mask_image >>> pipe = IFInpaintingPipeline.from_pretrained( - ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16 + ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16 ... 
) >>> pipe.enable_model_cpu_offload() diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index bb1d4ee4ba66..a62a51b0972f 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -45,7 +45,7 @@ >>> from diffusers.utils import pt_to_pil >>> import torch - >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) >>> pipe.enable_model_cpu_offload() >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py index e2204cb601a6..bf01c2350d22 100644 --- a/tests/pipelines/deepfloyd_if/test_if.py +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -94,7 +94,7 @@ def tearDown(self): def test_all(self): # if - pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe_2 = IFSuperResolutionPipeline.from_pretrained( "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16, text_encoder=None, tokenizer=None From 4c476e99b5cf1cf5d0c84b8f96730f603e6f35cf Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 27 Apr 2023 18:12:08 +0200 Subject: [PATCH 69/71] Fix community pipelines (#3266) --- src/diffusers/utils/dynamic_modules_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index 1951c4fa2623..aa6c9c657a87 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -267,7 +267,7 @@ def get_cached_module_file( # retrieve github version that matches if revision is None: - revision = latest_version if latest_version in available_versions else "main" + revision = latest_version if latest_version[1:] in available_versions else "main" logger.info(f"Defaulting to latest_version: {revision}.") elif revision in available_versions: revision = f"v{revision}" From 23159f4adbbb41eba8c5af0b667de4a31e366500 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 28 Apr 2023 13:31:11 +0200 Subject: [PATCH 70/71] Allow disabling torch 2_0 attention (#3273) * Allow disabling torch 2_0 attention * make style * Update src/diffusers/models/attention.py --- src/diffusers/models/attention.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 8e537c6f3680..fb5f6f48b324 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -71,6 +71,7 @@ def __init__( self.proj_attn = nn.Linear(channels, channels, bias=True) self._use_memory_efficient_attention_xformers = False + self._use_2_0_attn = True self._attention_op = None def reshape_heads_to_batch_dim(self, tensor, merge_head_and_batch=True): @@ -142,9 +143,8 @@ def forward(self, hidden_states): scale = 1 / math.sqrt(self.channels / self.num_heads) - use_torch_2_0_attn = ( - hasattr(F, "scaled_dot_product_attention") and not 
self._use_memory_efficient_attention_xformers - ) + _use_2_0_attn = self._use_2_0_attn and not self._use_memory_efficient_attention_xformers + use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") and _use_2_0_attn query_proj = self.reshape_heads_to_batch_dim(query_proj, merge_head_and_batch=not use_torch_2_0_attn) key_proj = self.reshape_heads_to_batch_dim(key_proj, merge_head_and_batch=not use_torch_2_0_attn) From 9b14ce397e53fc5f5b909b07b6e992a2afe8e3af Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 28 Apr 2023 14:03:50 +0200 Subject: [PATCH 71/71] Release: v0.16.1 --- setup.py | 2 +- src/diffusers/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ea98b5d10277..5c26b246aa01 100644 --- a/setup.py +++ b/setup.py @@ -226,7 +226,7 @@ def run(self): setup( name="diffusers", - version="0.16.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="0.16.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="Diffusers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d4dbf1145072..bb7381d65a54 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.16.0" +__version__ = "0.16.1" from .configuration_utils import ConfigMixin from .utils import (