From e126a82cc5d9afbeb9b476455de24dd3e7dd358a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 12 Apr 2023 20:55:54 +0530 Subject: [PATCH 01/71] [Tests] Speed up panorama tests (#3067) * fix: norm group test for UNet3D. * chore: speed up the panorama tests (fast). * set default value of _test_inference_batch_single_identical. * fix: batch_sizes default value. --- .../test_stable_diffusion_panorama.py | 18 +++++++++++++----- tests/test_pipelines_common.py | 11 ++++++----- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index de9e8a79fb34..752ed6e969c3 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -49,7 +49,7 @@ def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( block_out_channels=(32, 64), - layers_per_block=2, + layers_per_block=1, sample_size=32, in_channels=4, out_channels=4, @@ -101,7 +101,7 @@ def get_dummy_inputs(self, device, seed=0): # Setting height and width to None to prevent OOMs on CPU. "height": None, "width": None, - "num_inference_steps": 2, + "num_inference_steps": 1, "guidance_scale": 6.0, "output_type": "numpy", } @@ -119,10 +119,18 @@ def test_stable_diffusion_panorama_default_case(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4794, 0.5084, 0.4992, 0.3941, 0.3555, 0.4754, 0.5248, 0.5224, 0.4839]) + expected_slice = np.array([0.6186, 0.5374, 0.4915, 0.4135, 0.4114, 0.4563, 0.5128, 0.4977, 0.4757]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + # override to speed the overall test timing up. + def test_inference_batch_consistent(self): + super().test_inference_batch_consistent(batch_sizes=[1, 2]) + + # override to speed the overall test timing up. 
+ def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(batch_size=2) + def test_stable_diffusion_panorama_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() @@ -138,7 +146,7 @@ def test_stable_diffusion_panorama_negative_prompt(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5029, 0.5075, 0.5002, 0.3965, 0.3584, 0.4746, 0.5271, 0.5273, 0.4877]) + expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -158,7 +166,7 @@ def test_stable_diffusion_panorama_euler(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4934, 0.5455, 0.4847, 0.5022, 0.5572, 0.4833, 0.5207, 0.4952, 0.5051]) + expected_slice = np.array([0.4886, 0.5586, 0.4476, 0.5053, 0.6013, 0.4737, 0.5538, 0.5100, 0.4927]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index 13fbe924c799..981bc9061ef9 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -175,8 +175,8 @@ def test_pipeline_call_signature(self): f"Required optional parameters not present: {remaining_required_optional_parameters}", ) - def test_inference_batch_consistent(self): - self._test_inference_batch_consistent() + def test_inference_batch_consistent(self, batch_sizes=[2, 4, 13]): + self._test_inference_batch_consistent(batch_sizes=batch_sizes) def _test_inference_batch_consistent( self, batch_sizes=[2, 4, 13], additional_params_copy_to_batched_inputs=["num_inference_steps"] @@ -235,11 +235,12 @@ def _test_inference_batch_consistent( logger.setLevel(level=diffusers.logging.WARNING) - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical() + def test_inference_batch_single_identical(self, batch_size=3): + self._test_inference_batch_single_identical(batch_size=batch_size) def _test_inference_batch_single_identical( self, + batch_size=3, test_max_difference=None, test_mean_pixel_difference=None, relax_max_difference=False, @@ -267,7 +268,7 @@ def _test_inference_batch_single_identical( # batchify inputs batched_inputs = {} - batch_size = 3 + batch_size = batch_size for name, value in inputs.items(): if name in self.batch_params: # prompt is string From 0a73b4d3cd1dd58e6470cfd7f1e10b7b81c63511 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 12 Apr 2023 18:18:30 +0200 Subject: [PATCH 02/71] [Post release] v0.16.0dev (#3072) --- examples/controlnet/train_controlnet.py | 2 +- examples/controlnet/train_controlnet_flax.py | 2 +- examples/dreambooth/train_dreambooth.py | 2 +- examples/dreambooth/train_dreambooth_flax.py | 2 +- examples/dreambooth/train_dreambooth_lora.py | 2 +- examples/instruct_pix2pix/train_instruct_pix2pix.py | 2 +- examples/text_to_image/train_text_to_image.py | 2 +- examples/text_to_image/train_text_to_image_flax.py | 2 +- examples/text_to_image/train_text_to_image_lora.py | 2 +- examples/textual_inversion/textual_inversion.py | 2 +- examples/textual_inversion/textual_inversion_flax.py | 2 +- examples/unconditional_image_generation/train_unconditional.py | 2 +- setup.py | 2 +- src/diffusers/__init__.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index 
30e43075d809..c0b52291fc9b 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -55,7 +55,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index f5ea3ce84bf3..67fe1b82d0dd 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -58,7 +58,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 141aafb85128..4f731aa1f776 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -56,7 +56,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__) diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py index 8c2faa7ec877..8583f64c6fbd 100644 --- a/examples/dreambooth/train_dreambooth_flax.py +++ b/examples/dreambooth/train_dreambooth_flax.py @@ -36,7 +36,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") # Cache compiled models across invocations of this script. cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache")) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index a117bd394895..d360939c8c0c 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -53,7 +53,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__) diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index b542d01c112a..155c370614dc 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -51,7 +51,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index fde762814b54..4bbf4706f01c 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -50,7 +50,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index cdfc546a8f58..41a02d68f2b1 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -33,7 +33,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index a50ca222a4a0..8dfd96904bd0 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -47,7 +47,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index aebc524bbb36..e157e629df64 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -77,7 +77,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__) diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py index 513548d947a0..1d77753791f9 100644 --- a/examples/textual_inversion/textual_inversion_flax.py +++ b/examples/textual_inversion/textual_inversion_flax.py @@ -56,7 +56,7 @@ # ------------------------------------------------------------------------------ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index f38e908fcef6..c004acc2d850 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -28,7 +28,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.15.0") +check_min_version("0.16.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/setup.py b/setup.py index da75dd1e2a85..19cc1dca73bb 100644 --- a/setup.py +++ b/setup.py @@ -226,7 +226,7 @@ def run(self): setup( name="diffusers", - version="0.15.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="0.16.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="Diffusers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index c7d850d65953..07c17100e0e0 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.15.0" +__version__ = "0.16.0.dev0" from .configuration_utils import ConfigMixin from .utils import ( From d06e06940b8af3567958935bf49d4c42768110c8 Mon Sep 17 00:00:00 2001 From: Andreas Steiner Date: Wed, 12 Apr 2023 20:29:18 +0200 Subject: [PATCH 03/71] Adds profiling flags, computes train metrics average. (#3053) * WIP controlnet training - bugfix --streaming - bugfix running report_to!='wandb' - adds memory profile before validation * Adds final logging statement. * Sets train epochs to 11. Looking at a longer ~16ep run, we see only good validation images after ~11ep: https://wandb.ai/andsteing/controlnet_fill50k/runs/3j2hx6n8 * Removes --logging_dir (it's not used). * Adds --profile flags. * Updates --output_dir=runs/fill-circle-{timestamp}. * Compute mean of `train_metrics`. Previously `train_metrics[-1]` was logged, resulting in very bumpy train metrics. * Improves logging a bit. - adds l2_grads gradient norm logging - adds steps_per_sec - sets walltime as x coordinate of train/step - logs controlnet_params config * Adds --ccache (doesn't really help though). * minor fix in controlnet flax example (#2986) * fix the error when push_to_hub but not log validation * contronet_from_pt & controlnet_revision * add intermediate checkpointing to the guide * Bugfix --profile_steps * Sets `RACKER_PROJECT_NAME='controlnet_fill50k'`. * Logs fractional epoch. * Adds relative `walltime` metric. * Adds `StepTraceAnnotation` and uses `global_step` insetad of `step`. * Applied `black`. * Streamlines commands in README a bit. * Removes `--ccache`. This makes only a very small difference (~1 min) with this model size, so removing the option introduced in cdb3cc. * Re-ran `black`. * Update examples/controlnet/README.md Co-authored-by: Sayak Paul * Converts spaces to tab. * Removes repeated args. * Skips first step (compilation) in profiling * Updates README with profiling instructions. * Unifies tabs/spaces in README. * Re-ran style & quality. 
--------- Co-authored-by: Sayak Paul --- examples/controlnet/README.md | 74 ++++++++++------ examples/controlnet/train_controlnet_flax.py | 90 +++++++++++++++----- 2 files changed, 119 insertions(+), 45 deletions(-) diff --git a/examples/controlnet/README.md b/examples/controlnet/README.md index 4b388d92a195..387755624729 100644 --- a/examples/controlnet/README.md +++ b/examples/controlnet/README.md @@ -284,9 +284,9 @@ TPU_TYPE=v4-8 VM_NAME=hg_flax gcloud alpha compute tpus tpu-vm create $VM_NAME \ - --zone $ZONE \ - --accelerator-type $TPU_TYPE \ - --version tpu-vm-v4-base + --zone $ZONE \ + --accelerator-type $TPU_TYPE \ + --version tpu-vm-v4-base gcloud alpha compute tpus tpu-vm ssh $VM_NAME --zone $ZONE -- \ ``` @@ -326,6 +326,7 @@ If you want to use Weights and Biases logging, you should also install `wandb` n pip install wandb ``` + Now let's downloading two conditioning images that we will use to run validation during the training in order to track our progress ``` @@ -343,8 +344,8 @@ Make sure you have the `MODEL_DIR`,`OUTPUT_DIR` and `HUB_MODEL_ID` environment v ```bash export MODEL_DIR="runwayml/stable-diffusion-v1-5" -export OUTPUT_DIR="control_out" -export HUB_MODEL_ID="fill-circle-controlnet" +export OUTPUT_DIR="runs/fill-circle-{timestamp}" +export HUB_MODEL_ID="controlnet-fill-circle" ``` And finally start the training @@ -363,32 +364,36 @@ python3 train_controlnet_flax.py \ --revision="non-ema" \ --from_pt \ --report_to="wandb" \ - --max_train_steps=10000 \ + --tracker_project_name=$HUB_MODEL_ID \ + --num_train_epochs=11 \ --push_to_hub \ --hub_model_id=$HUB_MODEL_ID ``` Since we passed the `--push_to_hub` flag, it will automatically create a model repo under your huggingface account based on `$HUB_MODEL_ID`. By the end of training, the final checkpoint will be automatically stored on the hub. You can find an example model repo [here](https://huggingface.co/YiYiXu/fill-circle-controlnet). -Our training script also provides limited support for streaming large datasets from the Hugging Face Hub. In order to enable streaming, one must also set `--max_train_samples`. Here is an example command: +Our training script also provides limited support for streaming large datasets from the Hugging Face Hub. In order to enable streaming, one must also set `--max_train_samples`. 
Here is an example command (from [this blog article](https://huggingface.co/blog/train-your-controlnet)): ```bash +export MODEL_DIR="runwayml/stable-diffusion-v1-5" +export OUTPUT_DIR="runs/uncanny-faces-{timestamp}" +export HUB_MODEL_ID="controlnet-uncanny-faces" + python3 train_controlnet_flax.py \ - --pretrained_model_name_or_path=$MODEL_DIR \ - --output_dir=$OUTPUT_DIR \ - --dataset_name=multimodalart/facesyntheticsspigacaptioned \ - --streaming \ - --conditioning_image_column=spiga_seg \ - --image_column=image \ - --caption_column=image_caption \ - --resolution=512 \ - --max_train_samples 50 \ - --max_train_steps 5 \ - --learning_rate=1e-5 \ - --validation_steps=2 \ - --train_batch_size=1 \ - --revision="flax" \ - --report_to="wandb" + --pretrained_model_name_or_path=$MODEL_DIR \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=multimodalart/facesyntheticsspigacaptioned \ + --streaming \ + --conditioning_image_column=spiga_seg \ + --image_column=image \ + --caption_column=image_caption \ + --resolution=512 \ + --max_train_samples 100000 \ + --learning_rate=1e-5 \ + --train_batch_size=1 \ + --revision="flax" \ + --report_to="wandb" \ + --tracker_project_name=$HUB_MODEL_ID ``` Note, however, that the performance of the TPUs might get bottlenecked as streaming with `datasets` is not optimized for images. For ensuring maximum throughput, we encourage you to explore the following options: @@ -400,16 +405,35 @@ Note, however, that the performance of the TPUs might get bottlenecked as stream When work with a larger dataset, you may need to run training process for a long time and it’s useful to save regular checkpoints during the process. You can use the following argument to enable intermediate checkpointing: ```bash - --checkpointing_steps=500 + --checkpointing_steps=500 ``` This will save the trained model in subfolders of your output_dir. Subfolder names is the number of steps performed so far; for example: a checkpoint saved after 500 training steps would be saved in a subfolder named 500 You can then start your training from this saved checkpoint with ```bash - --controlnet_model_name_or_path="./control_out/500" + --controlnet_model_name_or_path="./control_out/500" ``` We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence by rebalancing the loss. To use it, one needs to set the `--snr_gamma` argument. The recommended value when using it is `5.0`. -We also support gradient accumulation - it is a technique that lets you use a bigger batch size than your machine would normally be able to fit into memory. You can use `gradient_accumulation_steps` argument to set gradient accumulation steps. The ControlNet author recommends using gradient accumulation to achieve better convergence. Read more [here](https://github.com/lllyasviel/ControlNet/blob/main/docs/train.md#more-consideration-sudden-converge-phenomenon-and-gradient-accumulation). \ No newline at end of file +We also support gradient accumulation - it is a technique that lets you use a bigger batch size than your machine would normally be able to fit into memory. You can use `gradient_accumulation_steps` argument to set gradient accumulation steps. The ControlNet author recommends using gradient accumulation to achieve better convergence. 
Read more [here](https://github.com/lllyasviel/ControlNet/blob/main/docs/train.md#more-consideration-sudden-converge-phenomenon-and-gradient-accumulation). + +You can **profile your code** with: + +```bash + --profile_steps==5 +``` + +Refer to the [JAX documentation on profiling](https://jax.readthedocs.io/en/latest/profiling.html). To inspect the profile trace, you'll have to install and start Tensorboard with the profile plugin: + +```bash +pip install tensorflow tensorboard-plugin-profile +tensorboard --logdir runs/fill-circle-100steps-20230411_165612/ +``` + +The profile can then be inspected at http://localhost:6006/#profile + +Sometimes you'll get version conflicts (error messages like `Duplicate plugins for name projector`), which means that you have to uninstall and reinstall all versions of Tensorflow/Tensorboard (e.g. with `pip uninstall tensorflow tf-nightly tensorboard tb-nightly tensorboard-plugin-profile && pip install tf-nightly tbp-nightly tensorboard-plugin-profile`). + +Note that the debugging functionality of the Tensorboard `profile` plugin is still under active development. Not all views are fully functional, and for example the `trace_viewer` cuts off events after 1M (which can result in all your device traces getting lost if you for example profile the compilation step by accident). diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index 67fe1b82d0dd..0b413ace09d2 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -18,6 +18,7 @@ import math import os import random +import time from pathlib import Path import jax @@ -220,6 +221,28 @@ def parse_args(): default=None, help="Revision of controlnet model identifier from huggingface.co/models.", ) + parser.add_argument( + "--profile_steps", + type=int, + default=0, + help="How many training steps to profile in the beginning.", + ) + parser.add_argument( + "--profile_validation", + action="store_true", + help="Whether to profile the (last) validation.", + ) + parser.add_argument( + "--profile_memory", + action="store_true", + help="Whether to dump an initial (before training loop) and a final (at program end) memory profile.", + ) + parser.add_argument( + "--ccache", + type=str, + default=None, + help="Enables compilation cache.", + ) parser.add_argument( "--controlnet_from_pt", action="store_true", @@ -234,8 +257,9 @@ def parse_args(): parser.add_argument( "--output_dir", type=str, - default="controlnet-model", - help="The output directory where the model predictions and checkpoints will be written.", + default="runs/{timestamp}", + help="The output directory where the model predictions and checkpoints will be written. " + "Can contain placeholders: {timestamp}.", ) parser.add_argument( "--cache_dir", @@ -317,15 +341,6 @@ def parse_args(): default=None, help="The name of the repository to keep in sync with the local `output_dir`.", ) - parser.add_argument( - "--logging_dir", - type=str, - default="logs", - help=( - "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." 
- ), - ) parser.add_argument( "--logging_steps", type=int, @@ -459,6 +474,8 @@ def parse_args(): parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") args = parser.parse_args() + args.output_dir = args.output_dir.replace("{timestamp}", time.strftime("%Y%m%d_%H%M%S")) + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: args.local_rank = env_local_rank @@ -952,6 +969,11 @@ def cumul_grad_step(grad_idx, loss_grad_rng): metrics = {"loss": loss} metrics = jax.lax.pmean(metrics, axis_name="batch") + def l2(xs): + return jnp.sqrt(sum([jnp.vdot(x, x) for x in jax.tree_util.tree_leaves(xs)])) + + metrics["l2_grads"] = l2(jax.tree_util.tree_leaves(grad)) + return new_state, metrics, new_train_rng # Create parallel version of the train step @@ -983,32 +1005,38 @@ def cumul_grad_step(grad_idx, loss_grad_rng): logger.info(f" Total train batch size (w. parallel & distributed) = {total_train_batch_size}") logger.info(f" Total optimization steps = {args.num_train_epochs * num_update_steps_per_epoch}") - if jax.process_index() == 0: + if jax.process_index() == 0 and args.report_to == "wandb": wandb.define_metric("*", step_metric="train/step") + wandb.define_metric("train/step", step_metric="walltime") wandb.config.update( { "num_train_examples": args.max_train_samples if args.streaming else len(train_dataset), "total_train_batch_size": total_train_batch_size, "total_optimization_step": args.num_train_epochs * num_update_steps_per_epoch, "num_devices": jax.device_count(), + "controlnet_params": sum(np.prod(x.shape) for x in jax.tree_util.tree_leaves(state.params)), } ) - global_step = 0 + global_step = step0 = 0 epochs = tqdm( range(args.num_train_epochs), desc="Epoch ... 
", position=0, disable=jax.process_index() > 0, ) + if args.profile_memory: + jax.profiler.save_device_memory_profile(os.path.join(args.output_dir, "memory_initial.prof")) + t00 = t0 = time.monotonic() for epoch in epochs: # ======================== Training ================================ train_metrics = [] + train_metric = None steps_per_epoch = ( args.max_train_samples // total_train_batch_size - if args.streaming + if args.streaming or args.max_train_samples else len(train_dataset) // total_train_batch_size ) train_step_progress_bar = tqdm( @@ -1020,10 +1048,18 @@ def cumul_grad_step(grad_idx, loss_grad_rng): ) # train for batch in train_dataloader: + if args.profile_steps and global_step == 1: + train_metric["loss"].block_until_ready() + jax.profiler.start_trace(args.output_dir) + if args.profile_steps and global_step == 1 + args.profile_steps: + train_metric["loss"].block_until_ready() + jax.profiler.stop_trace() + batch = shard(batch) - state, train_metric, train_rngs = p_train_step( - state, unet_params, text_encoder_params, vae_params, batch, train_rngs - ) + with jax.profiler.StepTraceAnnotation("train", step_num=global_step): + state, train_metric, train_rngs = p_train_step( + state, unet_params, text_encoder_params, vae_params, batch, train_rngs + ) train_metrics.append(train_metric) train_step_progress_bar.update(1) @@ -1041,13 +1077,19 @@ def cumul_grad_step(grad_idx, loss_grad_rng): if global_step % args.logging_steps == 0 and jax.process_index() == 0: if args.report_to == "wandb": + train_metrics = jax_utils.unreplicate(train_metrics) + train_metrics = jax.tree_util.tree_map(lambda *m: jnp.array(m).mean(), *train_metrics) wandb.log( { + "walltime": time.monotonic() - t00, "train/step": global_step, - "train/epoch": epoch, - "train/loss": jax_utils.unreplicate(train_metric)["loss"], + "train/epoch": global_step / dataset_length, + "train/steps_per_sec": (global_step - step0) / (time.monotonic() - t0), + **{f"train/{k}": v for k, v in train_metrics.items()}, } ) + t0, step0 = time.monotonic(), global_step + train_metrics = [] if global_step % args.checkpointing_steps == 0 and jax.process_index() == 0: controlnet.save_pretrained( f"{args.output_dir}/{global_step}", @@ -1058,10 +1100,14 @@ def cumul_grad_step(grad_idx, loss_grad_rng): train_step_progress_bar.close() epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})") - # Create the pipeline using using the trained modules and save it. + # Final validation & store model. 
if jax.process_index() == 0: if args.validation_prompt is not None: + if args.profile_validation: + jax.profiler.start_trace(args.output_dir) image_logs = log_validation(controlnet, state.params, tokenizer, args, validation_rng, weight_dtype) + if args.profile_validation: + jax.profiler.stop_trace() else: image_logs = None @@ -1084,6 +1130,10 @@ def cumul_grad_step(grad_idx, loss_grad_rng): ignore_patterns=["step_*", "epoch_*"], ) + if args.profile_memory: + jax.profiler.save_device_memory_profile(os.path.join(args.output_dir, "memory_final.prof")) + logger.info("Finished training.") + if __name__ == "__main__": main() From 46c52f9b9607e6ecb29c782c052aea313e6487b7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 13 Apr 2023 00:25:10 +0200 Subject: [PATCH 04/71] [Pipelines] Make sure that None functions are correctly not saved (#3080) --- src/diffusers/pipelines/pipeline_utils.py | 25 +++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 2e20c21aaf38..72c4363da3c6 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -19,6 +19,7 @@ import inspect import os import re +import sys import warnings from dataclasses import dataclass from pathlib import Path @@ -540,11 +541,9 @@ def save_pretrained( variant (`str`, *optional*): If specified, weights are saved in the format pytorch_model..bin. """ - self.save_config(save_directory) - model_index_dict = dict(self.config) - model_index_dict.pop("_class_name") - model_index_dict.pop("_diffusers_version") + model_index_dict.pop("_class_name", None) + model_index_dict.pop("_diffusers_version", None) model_index_dict.pop("_module", None) expected_modules, optional_kwargs = self._get_signature_keys(self) @@ -557,7 +556,6 @@ def is_saveable_module(name, value): return True model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} - for pipeline_component_name in model_index_dict.keys(): sub_model = getattr(self, pipeline_component_name) model_cls = sub_model.__class__ @@ -571,7 +569,13 @@ def is_saveable_module(name, value): save_method_name = None # search for the model's base class in LOADABLE_CLASSES for library_name, library_classes in LOADABLE_CLASSES.items(): - library = importlib.import_module(library_name) + if library_name in sys.modules: + library = importlib.import_module(library_name) + else: + logger.info( + f"{library_name} is not installed. 
Cannot save {pipeline_component_name} as {library_classes} from {library_name}" + ) + for base_class, save_load_methods in library_classes.items(): class_candidate = getattr(library, base_class, None) if class_candidate is not None and issubclass(model_cls, class_candidate): @@ -581,6 +585,12 @@ def is_saveable_module(name, value): if save_method_name is not None: break + if save_method_name is None: + logger.warn(f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved.") + # make sure that unsaveable components are not tried to be loaded afterward + self.register_to_config(**{pipeline_component_name: (None, None)}) + continue + save_method = getattr(sub_model, save_method_name) # Call the save method with the argument safe_serialization only if it's supported @@ -596,6 +606,9 @@ def is_saveable_module(name, value): save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs) + # finally save the config + self.save_config(save_directory) + def to( self, torch_device: Optional[Union[str, torch.device]] = None, From e748b3c6e163ce9a61965eb456704a83b855ccc3 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 12 Apr 2023 21:45:23 -1000 Subject: [PATCH 05/71] doc string example remove from_pt (#3083) --- .../pipeline_flax_stable_diffusion_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py index df3e79a194f8..7035242a0cda 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_controlnet.py @@ -83,7 +83,7 @@ ... "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.float32 ... ) >>> pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( - ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, from_pt=True, dtype=jnp.float32 + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, revision="flax", dtype=jnp.float32 ... 
) >>> params["controlnet"] = controlnet_params From 3a9d7d97588a1bbc906d8a17be77cf382492a7b6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 13 Apr 2023 14:32:57 +0200 Subject: [PATCH 06/71] [Tests] parallelize (#3078) * [Tests] parallelize * finish folder structuring * Parallelize tests more * Correct saving of pipelines * make sure logging level is correct * try again * Apply suggestions from code review Co-authored-by: Pedro Cuenca --------- Co-authored-by: Pedro Cuenca --- .github/workflows/pr_tests.yml | 33 +++++++++++++------ tests/{ => models}/test_layers_utils.py | 0 tests/{ => models}/test_lora_layers.py | 0 tests/{ => models}/test_modeling_common.py | 0 .../{ => models}/test_modeling_common_flax.py | 0 tests/models/test_models_unet_1d.py | 2 +- tests/models/test_models_unet_2d.py | 2 +- tests/models/test_models_unet_2d_condition.py | 2 +- tests/models/test_models_unet_3d_condition.py | 2 +- tests/models/test_models_vae.py | 2 +- tests/models/test_models_vae_flax.py | 2 +- tests/models/test_models_vq.py | 2 +- tests/{ => models}/test_unet_2d_blocks.py | 0 tests/{ => models}/test_unet_blocks_common.py | 0 .../test_check_copies.py | 0 .../test_check_dummies.py | 0 tests/{ => others}/test_config.py | 12 +++++++ tests/{ => others}/test_ema.py | 0 tests/{ => others}/test_hub_utils.py | 0 tests/{ => others}/test_image_processor.py | 0 tests/{ => others}/test_outputs.py | 0 tests/{ => others}/test_training.py | 0 tests/{ => others}/test_utils.py | 2 +- .../altdiffusion/test_alt_diffusion.py | 4 +-- tests/pipelines/audioldm/test_audioldm.py | 4 +-- .../dance_diffusion/test_dance_diffusion.py | 4 +-- tests/pipelines/ddim/test_ddim.py | 4 +-- tests/pipelines/dit/test_dit.py | 4 +-- .../latent_diffusion/test_latent_diffusion.py | 4 +-- .../paint_by_example/test_paint_by_example.py | 4 +-- tests/{ => pipelines}/pipeline_params.py | 0 tests/pipelines/repaint/test_repaint.py | 4 +-- .../test_spectrogram_diffusion.py | 4 +-- .../stable_diffusion/test_cycle_diffusion.py | 4 +-- .../test_onnx_stable_diffusion.py | 2 +- .../test_onnx_stable_diffusion_img2img.py | 2 +- .../test_onnx_stable_diffusion_inpaint.py | 2 +- .../test_onnx_stable_diffusion_upscale.py | 2 +- .../stable_diffusion/test_stable_diffusion.py | 4 +-- .../test_stable_diffusion_controlnet.py | 4 +-- .../test_stable_diffusion_image_variation.py | 4 +-- .../test_stable_diffusion_img2img.py | 4 +-- .../test_stable_diffusion_inpaint.py | 4 +-- ...st_stable_diffusion_instruction_pix2pix.py | 4 +-- .../test_stable_diffusion_model_editing.py | 4 +-- .../test_stable_diffusion_panorama.py | 4 +-- .../test_stable_diffusion_pix2pix_zero.py | 4 +-- .../test_stable_diffusion_sag.py | 4 +-- .../test_stable_diffusion.py | 4 +-- ...test_stable_diffusion_attend_and_excite.py | 4 +-- .../test_stable_diffusion_depth.py | 4 +-- .../test_stable_diffusion_inpaint.py | 4 +-- .../test_stable_diffusion_latent_upscale.py | 4 +-- .../stable_unclip/test_stable_unclip.py | 4 +-- .../test_stable_unclip_img2img.py | 4 +-- tests/{ => pipelines}/test_pipelines.py | 0 .../{ => pipelines}/test_pipelines_common.py | 2 +- tests/{ => pipelines}/test_pipelines_flax.py | 0 .../test_pipelines_onnx_common.py | 0 .../text_to_video/test_text_to_video.py | 4 +-- .../text_to_video/test_text_to_video_zero.py | 2 +- tests/pipelines/unclip/test_unclip.py | 4 +-- .../unclip/test_unclip_image_variation.py | 4 +-- 63 files changed, 109 insertions(+), 84 deletions(-) rename tests/{ => models}/test_layers_utils.py (100%) rename tests/{ => models}/test_lora_layers.py (100%) 
rename tests/{ => models}/test_modeling_common.py (100%) rename tests/{ => models}/test_modeling_common_flax.py (100%) rename tests/{ => models}/test_unet_2d_blocks.py (100%) rename tests/{ => models}/test_unet_blocks_common.py (100%) rename tests/{repo_utils => others}/test_check_copies.py (100%) rename tests/{repo_utils => others}/test_check_dummies.py (100%) rename tests/{ => others}/test_config.py (95%) rename tests/{ => others}/test_ema.py (100%) rename tests/{ => others}/test_hub_utils.py (100%) rename tests/{ => others}/test_image_processor.py (100%) rename tests/{ => others}/test_outputs.py (100%) rename tests/{ => others}/test_training.py (100%) rename tests/{ => others}/test_utils.py (98%) rename tests/{ => pipelines}/pipeline_params.py (100%) rename tests/{ => pipelines}/test_pipelines.py (100%) rename tests/{ => pipelines}/test_pipelines_common.py (99%) rename tests/{ => pipelines}/test_pipelines_flax.py (100%) rename tests/{ => pipelines}/test_pipelines_onnx_common.py (100%) diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 3d5fd84ad949..d06b576fa631 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -21,22 +21,27 @@ jobs: fail-fast: false matrix: config: - - name: Fast PyTorch CPU tests on Ubuntu - framework: pytorch + - name: Fast PyTorch Pipeline CPU tests + framework: pytorch_pipelines runner: docker-cpu image: diffusers/diffusers-pytorch-cpu - report: torch_cpu - - name: Fast Flax CPU tests on Ubuntu + report: torch_cpu_pipelines + - name: Fast PyTorch Models & Schedulers CPU tests + framework: pytorch_models + runner: docker-cpu + image: diffusers/diffusers-pytorch-cpu + report: torch_cpu_models_schedulers + - name: Fast Flax CPU tests framework: flax runner: docker-cpu image: diffusers/diffusers-flax-cpu report: flax_cpu - - name: Fast ONNXRuntime CPU tests on Ubuntu + - name: Fast ONNXRuntime CPU tests framework: onnxruntime runner: docker-cpu image: diffusers/diffusers-onnxruntime-cpu report: onnx_cpu - - name: PyTorch Example CPU tests on Ubuntu + - name: PyTorch Example CPU tests framework: pytorch_examples runner: docker-cpu image: diffusers/diffusers-pytorch-cpu @@ -71,13 +76,21 @@ jobs: run: | python utils/print_env.py - - name: Run fast PyTorch CPU tests - if: ${{ matrix.config.framework == 'pytorch' }} + - name: Run fast PyTorch Pipeline CPU tests + if: ${{ matrix.config.framework == 'pytorch_pipelines' }} run: | python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_${{ matrix.config.report }} \ - tests/ + tests/pipelines + + - name: Run fast PyTorch Model Scheduler CPU tests + if: ${{ matrix.config.framework == 'pytorch_models' }} + run: | + python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + -s -v -k "not Flax and not Onnx" \ + --make-reports=tests_${{ matrix.config.report }} \ + tests/models tests/schedulers tests/others - name: Run fast Flax TPU tests if: ${{ matrix.config.framework == 'flax' }} @@ -85,7 +98,7 @@ jobs: python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Flax" \ --make-reports=tests_${{ matrix.config.report }} \ - tests/ + tests - name: Run fast ONNXRuntime CPU tests if: ${{ matrix.config.framework == 'onnxruntime' }} diff --git a/tests/test_layers_utils.py b/tests/models/test_layers_utils.py similarity index 100% rename from tests/test_layers_utils.py rename to tests/models/test_layers_utils.py diff --git a/tests/test_lora_layers.py b/tests/models/test_lora_layers.py similarity 
index 100% rename from tests/test_lora_layers.py rename to tests/models/test_lora_layers.py diff --git a/tests/test_modeling_common.py b/tests/models/test_modeling_common.py similarity index 100% rename from tests/test_modeling_common.py rename to tests/models/test_modeling_common.py diff --git a/tests/test_modeling_common_flax.py b/tests/models/test_modeling_common_flax.py similarity index 100% rename from tests/test_modeling_common_flax.py rename to tests/models/test_modeling_common_flax.py diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py index d3a3d5cfc9a0..f954d876fa76 100644 --- a/tests/models/test_models_unet_1d.py +++ b/tests/models/test_models_unet_1d.py @@ -20,7 +20,7 @@ from diffusers import UNet1DModel from diffusers.utils import floats_tensor, slow, torch_device -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py index 8f831fcf7cbf..c20b0ef7d0a4 100644 --- a/tests/models/test_models_unet_2d.py +++ b/tests/models/test_models_unet_2d.py @@ -22,7 +22,7 @@ from diffusers import UNet2DModel from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin logger = logging.get_logger(__name__) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 17e08e0a426e..15f77fb8c106 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -34,7 +34,7 @@ ) from diffusers.utils.import_utils import is_xformers_available -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin logger = logging.get_logger(__name__) diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index c552b503af05..f245045bb3bb 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -30,7 +30,7 @@ ) from diffusers.utils.import_utils import is_xformers_available -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin logger = logging.get_logger(__name__) diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py index abd4a078e692..fe0041850bb4 100644 --- a/tests/models/test_models_vae.py +++ b/tests/models/test_models_vae.py @@ -22,7 +22,7 @@ from diffusers import AutoencoderKL from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/models/test_models_vae_flax.py b/tests/models/test_models_vae_flax.py index 8fedb85eccfc..e5c56b61a5a4 100644 --- a/tests/models/test_models_vae_flax.py +++ b/tests/models/test_models_vae_flax.py @@ -4,7 +4,7 @@ from diffusers.utils import is_flax_available from diffusers.utils.testing_utils import require_flax -from ..test_modeling_common_flax import FlaxModelTesterMixin +from .test_modeling_common_flax import FlaxModelTesterMixin if is_flax_available(): diff --git a/tests/models/test_models_vq.py b/tests/models/test_models_vq.py index 66c33e07371e..015d2abfc6fa 100644 --- 
a/tests/models/test_models_vq.py +++ b/tests/models/test_models_vq.py @@ -20,7 +20,7 @@ from diffusers import VQModel from diffusers.utils import floats_tensor, torch_device -from ..test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/test_unet_2d_blocks.py b/tests/models/test_unet_2d_blocks.py similarity index 100% rename from tests/test_unet_2d_blocks.py rename to tests/models/test_unet_2d_blocks.py diff --git a/tests/test_unet_blocks_common.py b/tests/models/test_unet_blocks_common.py similarity index 100% rename from tests/test_unet_blocks_common.py rename to tests/models/test_unet_blocks_common.py diff --git a/tests/repo_utils/test_check_copies.py b/tests/others/test_check_copies.py similarity index 100% rename from tests/repo_utils/test_check_copies.py rename to tests/others/test_check_copies.py diff --git a/tests/repo_utils/test_check_dummies.py b/tests/others/test_check_dummies.py similarity index 100% rename from tests/repo_utils/test_check_dummies.py rename to tests/others/test_check_dummies.py diff --git a/tests/test_config.py b/tests/others/test_config.py similarity index 95% rename from tests/test_config.py rename to tests/others/test_config.py index 95b0cdf9a597..a29190c199ca 100644 --- a/tests/test_config.py +++ b/tests/others/test_config.py @@ -141,6 +141,8 @@ def test_save_load(self): def test_load_ddim_from_pndm(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: ddim = DDIMScheduler.from_pretrained( @@ -153,6 +155,8 @@ def test_load_ddim_from_pndm(self): def test_load_euler_from_pndm(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: euler = EulerDiscreteScheduler.from_pretrained( @@ -165,6 +169,8 @@ def test_load_euler_from_pndm(self): def test_load_euler_ancestral_from_pndm(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: euler = EulerAncestralDiscreteScheduler.from_pretrained( @@ -177,6 +183,8 @@ def test_load_euler_ancestral_from_pndm(self): def test_load_pndm(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: pndm = PNDMScheduler.from_pretrained( @@ -189,6 +197,8 @@ def test_load_pndm(self): def test_overwrite_config_on_load(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: ddpm = DDPMScheduler.from_pretrained( @@ -212,6 +222,8 @@ def test_overwrite_config_on_load(self): def test_load_dpmsolver(self): logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) with CaptureLogger(logger) as cap_logger: dpm = DPMSolverMultistepScheduler.from_pretrained( diff --git a/tests/test_ema.py b/tests/others/test_ema.py similarity index 100% rename from tests/test_ema.py rename to tests/others/test_ema.py diff --git a/tests/test_hub_utils.py b/tests/others/test_hub_utils.py similarity index 100% rename from tests/test_hub_utils.py rename to tests/others/test_hub_utils.py diff --git a/tests/test_image_processor.py b/tests/others/test_image_processor.py similarity index 100% rename from tests/test_image_processor.py rename to 
tests/others/test_image_processor.py diff --git a/tests/test_outputs.py b/tests/others/test_outputs.py similarity index 100% rename from tests/test_outputs.py rename to tests/others/test_outputs.py diff --git a/tests/test_training.py b/tests/others/test_training.py similarity index 100% rename from tests/test_training.py rename to tests/others/test_training.py diff --git a/tests/test_utils.py b/tests/others/test_utils.py similarity index 98% rename from tests/test_utils.py rename to tests/others/test_utils.py index 4fc4e1a06638..6e7cc095f8df 100755 --- a/tests/test_utils.py +++ b/tests/others/test_utils.py @@ -167,4 +167,4 @@ def test_deprecate_stacklevel(self): with self.assertWarns(FutureWarning) as warning: deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False) assert str(warning.warning) == "This message is better!!!" - assert "diffusers/tests/test_utils.py" in warning.filename + assert "diffusers/tests/others/test_utils.py" in warning.filename diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index faa56e18f748..4d19621f0c2c 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -28,8 +28,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 10de5440eb00..ec72108fafc9 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -38,8 +38,8 @@ ) from diffusers.utils import slow, torch_device -from ...pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS +from ..test_pipelines_common import PipelineTesterMixin class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py index bbd4aa694b76..5db90a3aa740 100644 --- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py @@ -23,8 +23,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py index 4d2c4e490d63..319bd778e3b2 100644 --- a/tests/pipelines/ddim/test_ddim.py +++ b/tests/pipelines/ddim/test_ddim.py @@ -21,8 +21,8 @@ from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel from diffusers.utils.testing_utils import require_torch_gpu, slow, 
torch_device -from ...pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/dit/test_dit.py b/tests/pipelines/dit/test_dit.py index 947fd3cbf43d..d8098178f339 100644 --- a/tests/pipelines/dit/test_dit.py +++ b/tests/pipelines/dit/test_dit.py @@ -23,11 +23,11 @@ from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import ( +from ..pipeline_params import ( CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, ) -from ...test_pipelines_common import PipelineTesterMixin +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/tests/pipelines/latent_diffusion/test_latent_diffusion.py index 2ff7feda6317..05ff4162e5c6 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion.py @@ -23,8 +23,8 @@ from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py index 14b045d6c480..17feba59e8e4 100644 --- a/tests/pipelines/paint_by_example/test_paint_by_example.py +++ b/tests/pipelines/paint_by_example/test_paint_by_example.py @@ -27,8 +27,8 @@ from diffusers.utils import floats_tensor, load_image, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipeline_params.py b/tests/pipelines/pipeline_params.py similarity index 100% rename from tests/pipeline_params.py rename to tests/pipelines/pipeline_params.py diff --git a/tests/pipelines/repaint/test_repaint.py b/tests/pipelines/repaint/test_repaint.py index 060e6c9161ba..4f98675bc5af 100644 --- a/tests/pipelines/repaint/test_repaint.py +++ b/tests/pipelines/repaint/test_repaint.py @@ -22,8 +22,8 @@ from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel from diffusers.utils.testing_utils import load_image, load_numpy, nightly, require_torch_gpu, skip_mps, torch_device -from ...pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import 
IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 594d7c598f75..3b64ea2d2fc1 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -24,8 +24,8 @@ from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime -from ...pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py index 5282cfd8dd24..05b72ab6a0fd 100644 --- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py @@ -25,8 +25,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py index 3a5f9379ae50..6c90f0526662 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py @@ -29,7 +29,7 @@ ) from diffusers.utils.testing_utils import is_onnx_available, nightly, require_onnxruntime, require_torch_gpu -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin +from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin if is_onnx_available(): diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py index e1aa2f6dc0a1..9147dc461fc5 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py @@ -35,7 +35,7 @@ require_torch_gpu, ) -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin +from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin if is_onnx_available(): diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py index 16287d64d154..6004067887ea 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py @@ -26,7 +26,7 @@ require_torch_gpu, ) -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin +from ..test_pipelines_onnx_common import 
OnnxPipelineTesterMixin if is_onnx_available(): diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py index d1527a42a1e5..a124c3de60ca 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py @@ -36,7 +36,7 @@ require_torch_gpu, ) -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin +from ..test_pipelines_onnx_common import OnnxPipelineTesterMixin if is_onnx_available(): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 79796afdf597..14421a64b9e8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -40,8 +40,8 @@ from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu from ...models.test_models_unet_2d_condition import create_lora_layers -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index d556e6318f43..d7c5e2b0323a 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -33,8 +33,8 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index 2a07ab64a36d..3bfa5810428a 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -32,8 +32,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 69b92f685f25..127b1c216549 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -34,8 +34,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from 
diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 3553679e0ef6..290d9b0a9134 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -34,8 +34,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index 78e697fbbac3..8915f524d972 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -34,8 +34,8 @@ from diffusers.utils import floats_tensor, load_image, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index 1e11500c72b1..bafad63ec2db 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -31,8 +31,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 752ed6e969c3..3ead4fe55bab 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -32,8 +32,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import 
TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 59c45d603b91..0809a91041ce 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -33,8 +33,8 @@ from diffusers.utils import load_numpy, slow, torch_device from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index abaefbcad011..73859bdbf7d8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -29,8 +29,8 @@ from diffusers.utils import slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 7b607c8fdd36..623dbde99469 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -35,8 +35,8 @@ from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index 90bb1461d351..f153ae08cbb6 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -29,8 +29,8 @@ from diffusers.utils import load_numpy, skip_mps, slow from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin @skip_mps diff --git 
a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 6b0205f3faeb..7a5e02a42af4 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -51,8 +51,8 @@ ) from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index ee059314904f..2fa8b9045f43 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -26,8 +26,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device from diffusers.utils.testing_utils import require_torch_gpu, slow -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index 38f4b053714b..aff1c1cdbde9 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -31,8 +31,8 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py index 368ab21f24a9..891323d22fe0 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -15,8 +15,8 @@ from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference class StableUnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git 
a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index 907853394040..69e3225ced52 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -27,8 +27,8 @@ torch_device, ) -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import ( +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import ( PipelineTesterMixin, assert_mean_pixel_difference, ) diff --git a/tests/test_pipelines.py b/tests/pipelines/test_pipelines.py similarity index 100% rename from tests/test_pipelines.py rename to tests/pipelines/test_pipelines.py diff --git a/tests/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py similarity index 99% rename from tests/test_pipelines_common.py rename to tests/pipelines/test_pipelines_common.py index 981bc9061ef9..d0712bdec8f6 100644 --- a/tests/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -85,7 +85,7 @@ def params(self) -> frozenset: raise NotImplementedError( "You need to set the attribute `params` in the child test class. " "`params` are checked for if all values are present in `__call__`'s signature." - " You can set `params` using one of the common set of parameters defined in`pipeline_params.py`" + " You can set `params` using one of the common set of parameters defined in `pipeline_params.py`" " e.g., `TEXT_TO_IMAGE_PARAMS` defines the common parameters used in text to " "image pipelines, including prompts and prompt embedding overrides." "If your pipeline's set of arguments has minor changes from one of the common sets of arguments, " diff --git a/tests/test_pipelines_flax.py b/tests/pipelines/test_pipelines_flax.py similarity index 100% rename from tests/test_pipelines_flax.py rename to tests/pipelines/test_pipelines_flax.py diff --git a/tests/test_pipelines_onnx_common.py b/tests/pipelines/test_pipelines_onnx_common.py similarity index 100% rename from tests/test_pipelines_onnx_common.py rename to tests/pipelines/test_pipelines_onnx_common.py diff --git a/tests/pipelines/text_to_video/test_text_to_video.py b/tests/pipelines/text_to_video/test_text_to_video.py index 438e685a443c..b59653694616 100644 --- a/tests/pipelines/text_to_video/test_text_to_video.py +++ b/tests/pipelines/text_to_video/test_text_to_video.py @@ -28,8 +28,8 @@ ) from diffusers.utils import load_numpy, skip_mps, slow -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/pipelines/text_to_video/test_text_to_video_zero.py b/tests/pipelines/text_to_video/test_text_to_video_zero.py index 45bb93fbd9c6..8fc7254c52d1 100644 --- a/tests/pipelines/text_to_video/test_text_to_video_zero.py +++ b/tests/pipelines/text_to_video/test_text_to_video_zero.py @@ -20,7 +20,7 @@ from diffusers import DDIMScheduler, TextToVideoZeroPipeline from diffusers.utils import load_pt, require_torch_gpu, slow -from ...test_pipelines_common import assert_mean_pixel_difference +from ..test_pipelines_common import assert_mean_pixel_difference @slow diff --git 
a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index c36fb02b190f..4df3e4d3828b 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -25,8 +25,8 @@ from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py index 3cacb0bcad0b..57d15559cc75 100644 --- a/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/tests/pipelines/unclip/test_unclip_image_variation.py @@ -39,8 +39,8 @@ from diffusers.utils import floats_tensor, load_numpy, slow, torch_device from diffusers.utils.testing_utils import load_image, require_torch_gpu, skip_mps -from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): From 3bf5ce21ad2fd39c0443f8f689e12761c0f67a0f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 13 Apr 2023 14:33:11 +0200 Subject: [PATCH 07/71] Throw deprecation warning for return_cached_folder (#3092) Throw deprecation warning --- src/diffusers/pipelines/pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 72c4363da3c6..c095da1665de 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1059,7 +1059,7 @@ def load_module(name, value): return_cached_folder = kwargs.pop("return_cached_folder", False) if return_cached_folder: message = f"Passing `return_cached_folder=True` is deprecated and will be removed in `diffusers=0.17.0`. Please do the following instead: \n 1. Load the cached_folder via `cached_folder={cls}.download({pretrained_model_name_or_path})`. \n 2. Load the pipeline by loading from the cached folder: `pipeline={cls}.from_pretrained(cached_folder)`." - deprecate("return_cached_folder", "0.17.0", message, take_from=kwargs) + deprecate("return_cached_folder", "0.17.0", message) return model, cached_folder return model From 3eaead0c4a55bf11bdf832eaa61d0e87fe5464df Mon Sep 17 00:00:00 2001 From: Joseph Coffland Date: Thu, 13 Apr 2023 08:54:16 -0700 Subject: [PATCH 08/71] Allow SD attend and excite pipeline to work with any size output images (#2835) Allow stable diffusion attend and excite pipeline to work with any size output image. 
Re: #2476, #2603 --- ...eline_stable_diffusion_attend_and_excite.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 35351bae7116..c81ed5b54f94 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -14,7 +14,7 @@ import inspect import math -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -76,7 +76,7 @@ def get_empty_store(): def __call__(self, attn, is_cross: bool, place_in_unet: str): if self.cur_att_layer >= 0 and is_cross: - if attn.shape[1] == self.attn_res**2: + if attn.shape[1] == np.prod(self.attn_res): self.step_store[place_in_unet].append(attn) self.cur_att_layer += 1 @@ -98,7 +98,7 @@ def aggregate_attention(self, from_where: List[str]) -> torch.Tensor: attention_maps = self.get_average_attention() for location in from_where: for item in attention_maps[location]: - cross_maps = item.reshape(-1, self.attn_res, self.attn_res, item.shape[-1]) + cross_maps = item.reshape(-1, self.attn_res[0], self.attn_res[1], item.shape[-1]) out.append(cross_maps) out = torch.cat(out, dim=0) out = out.sum(0) / out.shape[0] @@ -109,7 +109,7 @@ def reset(self): self.step_store = self.get_empty_store() self.attention_store = {} - def __init__(self, attn_res=16): + def __init__(self, attn_res): """ Initialize an empty AttentionStore :param step_index: used to visualize only a specific step in the diffusion process @@ -724,7 +724,7 @@ def __call__( max_iter_to_alter: int = 25, thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, scale_factor: int = 20, - attn_res: int = 16, + attn_res: Optional[Tuple[int]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -796,8 +796,8 @@ def __call__( Dictionary defining the iterations and desired thresholds to apply iterative latent refinement in. scale_factor (`int`, *optional*, default to 20): Scale factor that controls the step size of each Attend and Excite update. - attn_res (`int`, *optional*, default to 16): - The resolution of most semantic attention map. + attn_res (`tuple`, *optional*, default computed from width and height): + The 2D resolution of the semantic attention map. Examples: @@ -870,7 +870,9 @@ def __call__( # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - self.attention_store = AttentionStore(attn_res=attn_res) + if attn_res is None: + attn_res = int(np.ceil(width / 32)), int(np.ceil(height / 32)) + self.attention_store = AttentionStore(attn_res) self.register_attention_control() # default config for step size from original repo From d0f258206d9cdcfb8685447d18b6881aed63143e Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 13 Apr 2023 13:46:28 -0700 Subject: [PATCH 09/71] [docs] Update community pipeline docs (#2989) * update community pipeline docs * fix formatting * explain sharing workflows --- docs/source/en/_toctree.yml | 6 +- .../using-diffusers/contribute_pipeline.mdx | 166 ++++++++++-------- .../custom_pipeline_examples.mdx | 2 +- .../custom_pipeline_overview.mdx | 95 ++-------- 4 files changed, 106 insertions(+), 163 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d74bd3785343..df41854a9fe7 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -25,7 +25,7 @@ - local: using-diffusers/schedulers title: Load and compare different schedulers - local: using-diffusers/custom_pipeline_overview - title: Load and add custom pipelines + title: Load community pipelines - local: using-diffusers/kerascv title: Load KerasCV Stable Diffusion checkpoints title: Loading & Hub @@ -47,9 +47,9 @@ - local: using-diffusers/reproducibility title: Create reproducible pipelines - local: using-diffusers/custom_pipeline_examples - title: Community Pipelines + title: Community pipelines - local: using-diffusers/contribute_pipeline - title: How to contribute a Pipeline + title: How to contribute a community pipeline - local: using-diffusers/using_safetensors title: Using safetensors - local: using-diffusers/stable_diffusion_jax_how_to diff --git a/docs/source/en/using-diffusers/contribute_pipeline.mdx b/docs/source/en/using-diffusers/contribute_pipeline.mdx index 8ee6d6ae4fb1..2c2b5abedcec 100644 --- a/docs/source/en/using-diffusers/contribute_pipeline.mdx +++ b/docs/source/en/using-diffusers/contribute_pipeline.mdx @@ -10,30 +10,21 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# How to build a community pipeline +# How to contribute a community pipeline -*Note*: this page was built from the GitHub Issue on Community Pipelines [#841](https://github.com/huggingface/diffusers/issues/841). + -Let's make an example! -Say you want to define a pipeline that just does a single forward pass to a U-Net and then calls a scheduler only once (Note, this doesn't make any sense from a scientific point of view, but only represents an example of how things work under the hood). +💡 Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down. -Cool! So you open your favorite IDE and start creating your pipeline 💻. -First, what model weights and configurations do we need? -We have a U-Net and a scheduler, so our pipeline should take a U-Net and a scheduler as an argument. 
-Also, as stated above, you'd like to be able to load weights and the scheduler config for Hub and share your code with others, so we'll inherit from `DiffusionPipeline`: + -```python -from diffusers import DiffusionPipeline -import torch +Community pipelines allow you to add any additional features you'd like on top of the [`DiffusionPipeline`]. The main benefit of building on top of the `DiffusionPipeline` is anyone can load and use your pipeline by only adding one more argument, making it super easy for the community to access. +This guide will show you how to create a community pipeline and explain how they work. To keep things simple, you'll create a "one-step" pipeline where the `UNet` does a single forward pass and calls the scheduler once. -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() -``` +## Initialize the pipeline -Now, we must save the `unet` and `scheduler` in a config file so that you can save your pipeline with `save_pretrained`. -Therefore, make sure you add every component that is save-able to the `register_modules` function: +You should start by creating a `one_step_unet.py` file for your community pipeline. In this file, create a pipeline class that inherits from the [`DiffusionPipeline`] to be able to load model weights and the scheduler configuration from the Hub. The one-step pipeline needs a `UNet` and a scheduler, so you'll need to add these as arguments to the `__init__` function: ```python from diffusers import DiffusionPipeline @@ -43,39 +34,54 @@ import torch class UnetSchedulerOneForwardPipeline(DiffusionPipeline): def __init__(self, unet, scheduler): super().__init__() +``` + +To ensure your pipeline and its components (`unet` and `scheduler`) can be saved with [`~DiffusionPipeline.save_pretrained`], add them to the `register_modules` function: + +```diff + from diffusers import DiffusionPipeline + import torch + + class UnetSchedulerOneForwardPipeline(DiffusionPipeline): + def __init__(self, unet, scheduler): + super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) ++ self.register_modules(unet=unet, scheduler=scheduler) ``` -Cool, the init is done! 🔥 Now, let's go into the forward pass, which we recommend defining as `__call__` . Here you're given all the creative freedom there is. For our amazing "one-step" pipeline, we simply create a random image and call the unet once and the scheduler once: +Cool, the `__init__` step is done and you can move to the forward pass now! 🔥 -```python -from diffusers import DiffusionPipeline -import torch +## Define the forward pass +In the forward pass, which we recommend defining as `__call__`, you have complete creative freedom to add whatever feature you'd like. 
For our amazing one-step pipeline, create a random image and only call the `unet` and `scheduler` once by setting `timestep=1`: -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() +```diff + from diffusers import DiffusionPipeline + import torch + + + class UnetSchedulerOneForwardPipeline(DiffusionPipeline): + def __init__(self, unet, scheduler): + super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) + self.register_modules(unet=unet, scheduler=scheduler) - def __call__(self): - image = torch.randn( - (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), - ) - timestep = 1 ++ def __call__(self): ++ image = torch.randn( ++ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), ++ ) ++ timestep = 1 - model_output = self.unet(image, timestep).sample - scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample ++ model_output = self.unet(image, timestep).sample ++ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample - return scheduler_output ++ return scheduler_output ``` -Cool, that's it! 🚀 You can now run this pipeline by passing a `unet` and a `scheduler` to the init: +That's it! 🚀 You can now run this pipeline by passing a `unet` and `scheduler` to it: ```python -from diffusers import DDPMScheduler, Unet2DModel +from diffusers import DDPMScheduler, UNet2DModel scheduler = DDPMScheduler() unet = UNet2DModel() @@ -85,7 +91,7 @@ pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler) output = pipeline() ``` -But what's even better is that you can load pre-existing weights into the pipeline if they match exactly your pipeline structure. This is e.g. the case for [https://huggingface.co/google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32) so that we can do the following: +But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline: ```python pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32") @@ -93,63 +99,72 @@ pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10- output = pipeline() ``` -We want to share this amazing pipeline with the community, so we would open a PR request to add the following code under `one_step_unet.py` to [https://github.com/huggingface/diffusers/tree/main/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) . - -```python -from diffusers import DiffusionPipeline -import torch - +## Share your pipeline -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() +Open a Pull Request on the 🧨 Diffusers [repository](https://github.com/huggingface/diffusers) to add your awesome pipeline in `one_step_unet.py` to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder. 
- self.register_modules(unet=unet, scheduler=scheduler) +Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipeline magically 🪄 by specifying it in the `custom_pipeline` argument: - def __call__(self): - image = torch.randn( - (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), - ) - timestep = 1 - - model_output = self.unet(image, timestep).sample - scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample +```python +from diffusers import DiffusionPipeline - return scheduler_output +pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet") +pipe() ``` -Our amazing pipeline got merged here: [#840](https://github.com/huggingface/diffusers/pull/840). -Now everybody that has `diffusers >= 0.4.0` installed can use our pipeline magically 🪄 as follows: +Another way to share your community pipeline is to upload the `one_step_unet.py` file directly to your preferred [model repository](https://huggingface.co/docs/hub/models-uploading) on the Hub. Instead of specifying the `one_step_unet.py` file, pass the model repository id to the `custom_pipeline` argument: ```python from diffusers import DiffusionPipeline -pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet") -pipe() +pipeline = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet") ``` -Another way to upload your custom_pipeline, besides sending a PR, is uploading the code that contains it to the Hugging Face Hub, [as exemplified here](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview#loading-custom-pipelines-from-the-hub). +Take a look at the following table to compare the two sharing workflows to help you decide the best option for you: + +| | GitHub community pipeline | HF Hub community pipeline | +|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------| +| usage | same | same | +| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow | +| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility | -**Try it out now - it works!** + -In general, you will want to create much more sophisticated pipelines, so we recommend looking at existing pipelines here: [https://github.com/huggingface/diffusers/tree/main/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community). +💡 You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` because this is automatically detected. -IMPORTANT: -You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` as this will be automatically detected. + ## How do community pipelines work? 
-A community pipeline is a class that has to inherit from ['DiffusionPipeline']: -and that has been added to `examples/community` [files](https://github.com/huggingface/diffusers/tree/main/examples/community). -The community can load the pipeline code via the custom_pipeline argument from DiffusionPipeline. See docs [here](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.custom_pipeline): -This means: -The model weights and configs of the pipeline should be loaded from the `pretrained_model_name_or_path` [argument](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path): -whereas the code that powers the community pipeline is defined in a file added in [`examples/community`](https://github.com/huggingface/diffusers/tree/main/examples/community). +A community pipeline is a class that inherits from [`DiffusionPipeline`] which means: + +- It can be loaded with the [`custom_pipeline`] argument. +- The model weights and scheduler configuration are loaded from [`pretrained_model_name_or_path`]. +- The code that implements a feature in the community pipeline is defined in a `pipeline.py` file. + +Sometimes you can't load all the pipeline components weights from an official repository. In this case, the other components should be passed directly to the pipeline: -Now, it might very well be that only some of your pipeline components weights can be downloaded from an official repo. -The other components should then be passed directly to init as is the case for the ClIP guidance notebook [here](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb#scrollTo=z9Kglma6hjki). +```python +from diffusers import DiffusionPipeline +from transformers import CLIPFeatureExtractor, CLIPModel + +model_id = "CompVis/stable-diffusion-v1-4" +clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" + +feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id) +clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16) + +pipeline = DiffusionPipeline.from_pretrained( + model_id, + custom_pipeline="clip_guided_stable_diffusion", + clip_model=clip_model, + feature_extractor=feature_extractor, + scheduler=scheduler, + torch_dtype=torch.float16, +) +``` -The magic behind all of this is that we load the code directly from GitHub. You can check it out in more detail if you follow the functionality defined here: +The magic behind community pipelines is contained in the following code. It allows the community pipeline to be loaded from GitHub or the Hub, and it'll be available to all 🧨 Diffusers packages. ```python # 2. Load the pipeline class, if using custom module then load it from the hub @@ -164,6 +179,3 @@ else: diffusers_module = importlib.import_module(cls.__module__.split(".")[0]) pipeline_class = getattr(diffusers_module, config_dict["_class_name"]) ``` - -This is why a community pipeline merged to GitHub will be directly available to all `diffusers` packages. 
- diff --git a/docs/source/en/using-diffusers/custom_pipeline_examples.mdx b/docs/source/en/using-diffusers/custom_pipeline_examples.mdx index 2dfa71f0d33c..93ac6d1f782c 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_examples.mdx +++ b/docs/source/en/using-diffusers/custom_pipeline_examples.mdx @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Custom Pipelines +# Community pipelines > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).** diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.mdx b/docs/source/en/using-diffusers/custom_pipeline_overview.mdx index 934e639983d2..3c5df7c0dd6e 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_overview.mdx +++ b/docs/source/en/using-diffusers/custom_pipeline_overview.mdx @@ -10,19 +10,21 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Loading and Adding Custom Pipelines +# Load community pipelines -Diffusers allows you to conveniently load any custom pipeline from the Hugging Face Hub as well as any [official community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community) -via the [`DiffusionPipeline`] class. +Community pipelines are any [`DiffusionPipeline`] class that are different from the original implementation as specified in their paper (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline. -## Loading custom pipelines from the Hub +There are many cool community pipelines like [Speech to Image](https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image) or [Composable Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#composable-stable-diffusion), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community). -Custom pipelines can be easily loaded from any model repository on the Hub that defines a diffusion pipeline in a `pipeline.py` file. -Let's load a dummy pipeline from [hf-internal-testing/diffusers-dummy-pipeline](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline). +To load any community pipeline on the Hub, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [`hf-internal-testing/diffusers-dummy-pipeline`](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32): -All you need to do is pass the custom pipeline repo id with the `custom_pipeline` argument alongside the repo from where you wish to load the pipeline modules. + -```python +🔒 By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically! 
+ + + +```py from diffusers import DiffusionPipeline pipeline = DiffusionPipeline.from_pretrained( @@ -30,25 +32,9 @@ pipeline = DiffusionPipeline.from_pretrained( ) ``` -This will load the custom pipeline as defined in the [model repository](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py). - - - -By loading a custom pipeline from the Hugging Face Hub, you are trusting that the code you are loading -is safe 🔒. Make sure to check out the code online before loading & running it automatically. - - - -## Loading official community pipelines +Loading an official community pipeline is similar, but you can mix loading weights from an official repository id and pass pipeline components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline, and you can pass the CLIP model components directly to it: -Community pipelines are summarized in the [community examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community). - -Similarly, you need to pass both the *repo id* from where you wish to load the weights as well as the `custom_pipeline` argument. Here the `custom_pipeline` argument should consist simply of the filename of the community pipeline excluding the `.py` suffix, *e.g.* `clip_guided_stable_diffusion`. - -Since community pipelines are often more complex, one can mix loading weights from an official *repo id* -and passing pipeline modules directly. - -```python +```py from diffusers import DiffusionPipeline from transformers import CLIPImageProcessor, CLIPModel @@ -65,59 +51,4 @@ pipeline = DiffusionPipeline.from_pretrained( ) ``` -## Adding custom pipelines to the Hub - -To add a custom pipeline to the Hub, all you need to do is to define a pipeline class that inherits -from [`DiffusionPipeline`] in a `pipeline.py` file. -Make sure that the whole pipeline is encapsulated within a single class and that the `pipeline.py` file -has only one such class. - -Let's quickly define an example pipeline. - - -```python -import torch -from diffusers import DiffusionPipeline - - -class MyPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() - - self.register_modules(unet=unet, scheduler=scheduler) - - @torch.no_grad() - def __call__(self, batch_size: int = 1, num_inference_steps: int = 50): - # Sample gaussian noise to begin loop - image = torch.randn( - (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size) - ) - - image = image.to(self.device) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2. predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image, eta).prev_sample - - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).numpy() - - return image -``` - -Now you can upload this short file under the name `pipeline.py` in your preferred [model repository](https://huggingface.co/docs/hub/models-uploading). 
For Stable Diffusion pipelines, you may also [join the community organisation for shared pipelines](https://huggingface.co/organizations/sd-diffusers-pipelines-library/share/BUPyDUuHcciGTOKaExlqtfFcyCZsVFdrjr) to upload yours. -Finally, we can load the custom pipeline by passing the model repository name, *e.g.* `sd-diffusers-pipelines-library/my_custom_pipeline` alongside the model repository from where we want to load the `unet` and `scheduler` components. - -```python -my_pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="patrickvonplaten/my_custom_pipeline" -) -``` +For more information about community pipelines, take a look at the [Community pipelines](custom_pipeline_examples) guide for how to use them and if you're interested in adding a community pipeline check out the [How to contribute a community pipeline](contribute_pipeline) guide! \ No newline at end of file From 5c9dd0af952a92f19a1e672b2a9471ad5674841d Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 14 Apr 2023 12:07:34 +0900 Subject: [PATCH 10/71] Add support for Guess Mode in StableDiffusionControlNetPipeline (#2998) * add guess mode (WIP) * fix uncond/cond order * support guidance_scale=1.0 and batch != 1 * remove magic coeff * add docstring * add integration test * add document to controlnet.mdx * made the comments a bit more explanatory * fix table --- .../pipelines/stable_diffusion/controlnet.mdx | 36 ++++++++++++++++ src/diffusers/models/controlnet.py | 11 ++++- .../pipeline_stable_diffusion_controlnet.py | 42 +++++++++++++++++-- .../test_stable_diffusion_controlnet.py | 32 ++++++++++++++ 4 files changed, 115 insertions(+), 6 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index 5a4cfa41ca43..af859177c002 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -242,6 +242,42 @@ image.save("./multi_controlnet_output.png") +### Guess Mode + +Guess Mode is [a ControlNet feature that was implemented](https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode) after the publication of [the paper](https://arxiv.org/abs/2302.05543). The description states: + +>In this mode, the ControlNet encoder will try best to recognize the content of the input control map, like depth map, edge map, scribbles, etc, even if you remove all prompts. + +#### The core implementation: + +It adjusts the scale of the output residuals from ControlNet by a fixed ratio depending on the block depth. The shallowest DownBlock corresponds to `0.1`. As the blocks get deeper, the scale increases exponentially, and the scale for the output of the MidBlock becomes `1.0`. + +Since the core implementation is just this, **it does not have any impact on prompt conditioning**. While it is common to use it without specifying any prompts, it is also possible to provide prompts if desired. + +#### Usage: + +Just specify `guess_mode=True` in the pipe() function. A `guidance_scale` between 3.0 and 5.0 is [recommended](https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode).
+```py +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel +import torch + +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") +pipe = StableDiffusionControlNetPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", controlnet=controlnet).to( + "cuda" +) +image = pipe("", image=canny_image, guess_mode=True, guidance_scale=3.0).images[0] +image.save("guess_mode_generated.png") +``` + +#### Output image comparison: +Canny Control Example + +|no guess_mode with prompt|guess_mode without prompt| +|---|---| +||| + + + ## Available checkpoints ControlNet requires a *control image* in addition to the text-to-image *prompt*. diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index bb608ad82a7a..4f1ffe604578 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -456,6 +456,7 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, return_dict: bool = True, ) -> Union[ControlNetOutput, Tuple]: # check channel order @@ -556,8 +557,14 @@ def forward( mid_block_res_sample = self.controlnet_mid_block(sample) # 6. scaling - down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] - mid_block_res_sample *= conditioning_scale + if guess_mode: + scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0 + scales *= conditioning_scale + down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)] + mid_block_res_sample *= scales[-1] # last one + else: + down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] + mid_block_res_sample *= conditioning_scale if not return_dict: return (down_block_res_samples, mid_block_res_sample) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 12d21afbfeda..1ebd469f76b3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -118,6 +118,7 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, return_dict: bool = True, ) -> Union[ControlNetOutput, Tuple]: for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): @@ -131,6 +132,7 @@ def forward( timestep_cond, attention_mask, cross_attention_kwargs, + guess_mode, return_dict, ) @@ -627,7 +629,16 @@ def check_image(self, image, prompt, prompt_embeds): ) def prepare_image( - self, image, width, height, batch_size, num_images_per_prompt, device, dtype, do_classifier_free_guidance + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance, + guess_mode, ): if not isinstance(image, torch.Tensor): if isinstance(image, PIL.Image.Image): @@ -664,7 +675,7 @@ def prepare_image( image = image.to(device=device, dtype=dtype) - if do_classifier_free_guidance: + if do_classifier_free_guidance and not guess_mode: image = torch.cat([image] * 2) return image @@ -747,6 +758,7 @@ def __call__( callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = 
None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, ): r""" Function invoked when calling the pipeline for generation. @@ -819,6 +831,10 @@ def __call__( The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added to the residual in the original unet. If multiple ControlNets are specified in init, you can set the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + In this mode, the ControlNet encoder will try best to recognize the content of the input image even if + you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. + Examples: Returns: @@ -883,6 +899,7 @@ def __call__( device=device, dtype=self.controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, ) elif isinstance(self.controlnet, MultiControlNetModel): images = [] @@ -897,6 +914,7 @@ def __call__( device=device, dtype=self.controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, ) images.append(image_) @@ -934,15 +952,31 @@ def __call__( latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # controlnet(s) inference + if guess_mode and do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + controlnet_latent_model_input = latents + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + controlnet_latent_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + down_block_res_samples, mid_block_res_sample = self.controlnet( - latent_model_input, + controlnet_latent_model_input, t, - encoder_hidden_states=prompt_embeds, + encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=image, conditioning_scale=controlnet_conditioning_scale, + guess_mode=guess_mode, return_dict=False, ) + if guess_mode and do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. 
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + # predict the noise residual noise_pred = self.unet( latent_model_input, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index d7c5e2b0323a..70b3652fce77 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -553,6 +553,38 @@ def test_sequential_cpu_offloading(self): # make sure that less than 7 GB is allocated assert mem_bytes < 4 * 10**9 + def test_canny_guess_mode(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ) + + output = pipe( + prompt, + image, + generator=generator, + output_type="np", + num_inference_steps=3, + guidance_scale=3.0, + guess_mode=True, + ) + + image = output.images[0] + assert image.shape == (768, 512, 3) + + image_slice = image[-3:, -3:, -1] + expected_slice = np.array([0.2724, 0.2846, 0.2724, 0.3843, 0.3682, 0.2736, 0.4675, 0.3862, 0.2887]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + @slow @require_torch_gpu From eb2ef316068620ab2f44b6a7d6b13a0cb146088e Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 13 Apr 2023 17:54:54 -1000 Subject: [PATCH 11/71] fix default value for attend-and-excite (#3099) * fix default --- .../pipeline_stable_diffusion_attend_and_excite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index c81ed5b54f94..fba2a4e32f88 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -724,7 +724,7 @@ def __call__( max_iter_to_alter: int = 25, thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, scale_factor: int = 20, - attn_res: Optional[Tuple[int]] = None, + attn_res: Optional[Tuple[int]] = (16, 16), ): r""" Function invoked when calling the pipeline for generation. 
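As a side note on the Guess Mode change above: the residual scaling in `ControlNetModel.forward` comes from `torch.logspace(-1, 0, len(down_block_res_samples) + 1)`, so the scales grow exponentially from `0.1` at the shallowest DownBlock to `1.0` at the MidBlock output. The short sketch below only illustrates that scaling in isolation; it is not part of any patch, and the residual count of 12 is an assumed value for a standard Stable Diffusion UNet.

```py
import torch

# Assumed residual count: 12 down-block residuals plus one mid-block output,
# roughly matching a standard Stable Diffusion UNet (illustrative, not taken from the patch).
down_block_res_samples = [torch.zeros(1) for _ in range(12)]

scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1)
# 13 scales spaced exponentially from 0.1 (shallowest DownBlock) to 1.0 (MidBlock output)
print(scales)
```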
From 1bd4c9e93dcbb31135aa8594aaf28f7b6efd39ab Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 14 Apr 2023 06:39:25 -1000 Subject: [PATCH 12/71] remove one line as requested by gc team (#3077) remove one line --- examples/text_to_image/train_text_to_image_flax.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index 41a02d68f2b1..d44731896c1d 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -340,11 +340,10 @@ def preprocess_train(examples): return examples - if jax.process_index() == 0: - if args.max_train_samples is not None: - dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + if args.max_train_samples is not None: + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) # Set the training transforms - train_dataset = dataset["train"].with_transform(preprocess_train) + train_dataset = dataset["train"].with_transform(preprocess_train) def collate_fn(examples): pixel_values = torch.stack([example["pixel_values"] for example in examples]) From b811964a7b7f3c4cd50dc25a58789a0fed351e09 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 14 Apr 2023 12:39:38 -0700 Subject: [PATCH 13/71] ddpm custom timesteps (#3007) add custom timesteps test add custom timesteps descending order check docs timesteps -> custom_timesteps can only pass one of num_inference_steps and timesteps --- src/diffusers/schedulers/scheduling_ddpm.py | 79 ++++++++++++++++----- tests/schedulers/test_scheduler_ddpm.py | 56 +++++++++++++++ 2 files changed, 119 insertions(+), 16 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index eaaf497f9c1d..2bc34bb8b444 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -162,6 +162,7 @@ def __init__( self.init_noise_sigma = 1.0 # setable values + self.custom_timesteps = False self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) @@ -191,31 +192,62 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = """ return sample - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): """ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. Args: - num_inference_steps (`int`): - the number of diffusion steps used when generating samples with a pre-trained model. + num_inference_steps (`Optional[int]`): + the number of diffusion steps used when generating samples with a pre-trained model. If passed, then + `timesteps` must be `None`. + device (`str` or `torch.device`, optional): + the device to which the timesteps are moved. + custom_timesteps (`List[int]`, optional): + custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps is used. If passed, `num_inference_steps` + must be `None`.
+ """ + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + + if timesteps is not None: + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`custom_timesteps` must be in descending order.") + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.custom_timesteps = True + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) - if num_inference_steps > self.config.num_train_timesteps: - raise ValueError( - f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps." - ) + self.num_inference_steps = num_inference_steps - self.num_inference_steps = num_inference_steps + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + self.custom_timesteps = False - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = torch.from_numpy(timesteps).to(device) def _get_variance(self, t, predicted_variance=None, variance_type=None): - num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - prev_t = t - self.config.num_train_timesteps // num_inference_steps + prev_t = self.previous_timestep(t) + alpha_prod_t = self.alphas_cumprod[t] alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev @@ -314,8 +346,8 @@ def step( """ t = timestep - num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + prev_t = self.previous_timestep(t) if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) @@ -428,3 +460,18 @@ def get_velocity( def __len__(self): return self.config.num_train_timesteps + + def previous_timestep(self, timestep): + if self.custom_timesteps: + index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] + if index == self.timesteps.shape[0] - 1: + prev_t = torch.tensor(-1) + else: + prev_t = self.timesteps[index + 1] + else: + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) + prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + return prev_t diff --git a/tests/schedulers/test_scheduler_ddpm.py b/tests/schedulers/test_scheduler_ddpm.py index b55a39ee2e79..c44ded43e67e 100644 --- a/tests/schedulers/test_scheduler_ddpm.py +++ 
b/tests/schedulers/test_scheduler_ddpm.py @@ -129,3 +129,59 @@ def test_full_loop_with_v_prediction(self): assert abs(result_sum.item() - 202.0296) < 1e-2 assert abs(result_mean.item() - 0.2631) < 1e-3 + + def test_custom_timesteps(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 1, 0] + + scheduler.set_timesteps(timesteps=timesteps) + + scheduler_timesteps = scheduler.timesteps + + for i, timestep in enumerate(scheduler_timesteps): + if i == len(timesteps) - 1: + expected_prev_t = -1 + else: + expected_prev_t = timesteps[i + 1] + + prev_t = scheduler.previous_timestep(timestep) + prev_t = prev_t.item() + + self.assertEqual(prev_t, expected_prev_t) + + def test_custom_timesteps_increasing_order(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 51, 0] + + with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."): + scheduler.set_timesteps(timesteps=timesteps) + + def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 1, 0] + num_inference_steps = len(timesteps) + + with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `custom_timesteps`."): + scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps) + + def test_custom_timesteps_too_large(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [scheduler.config.num_train_timesteps] + + with self.assertRaises( + ValueError, + msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", + ): + scheduler.set_timesteps(timesteps=timesteps) From 807f69b32879a0ea74aa4e58ee007988507d6df8 Mon Sep 17 00:00:00 2001 From: Tommaso De Rossi Date: Sun, 16 Apr 2023 19:04:11 +0200 Subject: [PATCH 14/71] Fix breaking change in `pipeline_stable_diffusion_controlnet.py` (#3118) fix breaking change --- .../stable_diffusion/pipeline_stable_diffusion_controlnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 1ebd469f76b3..3b8889d92b55 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -637,8 +637,8 @@ def prepare_image( num_images_per_prompt, device, dtype, - do_classifier_free_guidance, - guess_mode, + do_classifier_free_guidance=False, + guess_mode=False, ): if not isinstance(image, torch.Tensor): if isinstance(image, PIL.Image.Image): From cfc99adf0f2e45afbddc117671e4faa59ca83ae2 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sun, 16 Apr 2023 19:07:23 +0200 Subject: [PATCH 15/71] Add global pooling to controlnet (#3121) --- src/diffusers/models/controlnet.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 4f1ffe604578..3ffbb04eb222 100644 
--- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -119,6 +119,7 @@ def __init__( projection_class_embeddings_input_dim: Optional[int] = None, controlnet_conditioning_channel_order: str = "rgb", conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), + global_pool_conditions: bool = False, ): super().__init__() @@ -566,6 +567,12 @@ def forward( down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] mid_block_res_sample *= conditioning_scale + if self.config.global_pool_conditions: + down_block_res_samples = [ + torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples + ] + mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True) + if not return_dict: return (down_block_res_samples, mid_block_res_sample) From beb848e2b6cc888bd5039e6f6cac7c932c6c3225 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 17 Apr 2023 11:53:04 +0200 Subject: [PATCH 16/71] [Bug fix] Fix img2img processor with safety checker (#3127) Fix img2img processor with safety checker --- .../pipelines/stable_diffusion/safety_checker.py | 5 ++++- .../test_stable_diffusion_img2img.py | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 84b8aeb7bcde..38c7b22d08d4 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -85,7 +85,10 @@ def forward(self, clip_input, images): for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): if has_nsfw_concept: - images[idx] = np.zeros(images[idx].shape) # black image + if torch.is_tensor(images) or torch.is_tensor(images[0]): + images[idx] = torch.zeros_like(images[idx]) # black image + else: + images[idx] = np.zeros(images[idx].shape) # black image if any(has_nsfw_concepts): logger.warning( diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 127b1c216549..0e2c4acb5484 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -453,6 +453,20 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 + def test_img2img_safety_checker_works(self): + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 20 + # make sure the safety checker is activated + inputs["prompt"] = "naked, sex, porn" + out = sd_pipe(**inputs) + + assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}" + assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros + @nightly @require_torch_gpu From ca783a0f1f4ce8b0a16e6b96a8890edc47489e3a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 17 Apr 2023 12:52:40 +0200 Subject: [PATCH 17/71] [Bug fix] Make sure correct timesteps are chosen for img2img (#3128) Make sure correct timesteps are chosen for img2img --- .../pipeline_alt_diffusion_img2img.py | 2 +- .../pipeline_cycle_diffusion.py | 2 +- .../pipeline_stable_diffusion_depth2img.py | 2 +- 
.../pipeline_stable_diffusion_img2img.py | 2 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 2 +- .../test_stable_diffusion_img2img.py | 28 +++++++++++++++++++ 6 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index bb8116f2f5d5..86fc47f424e9 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -503,7 +503,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index dd8e4f16dfc0..e2accb6d2d2a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -528,7 +528,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 54f00ebc23f2..4fe117ba120b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -390,7 +390,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index a0befdae73c4..5860a53ad528 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -511,7 +511,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index b7a0c942bbe2..6d9cbaf67a07 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ 
b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -507,7 +507,7 @@ def get_timesteps(self, num_inference_steps, strength, device): init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 0e2c4acb5484..4262114c78eb 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -25,6 +25,7 @@ AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, + HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, StableDiffusionImg2ImgPipeline, @@ -416,6 +417,33 @@ def test_stable_diffusion_pipeline_with_model_offloading(self): for module in pipe.text_encoder, pipe.unet, pipe.vae: assert module.device == torch.device("cpu") + def test_img2img_2nd_order(self): + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = HeunDiscreteScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 10 + inputs["strength"] = 0.75 + image = sd_pipe(**inputs).images[0] + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/img2img_heun.npy" + ) + max_diff = np.abs(expected_image - image).max() + assert max_diff < 5e-2 + + inputs = self.get_inputs(torch_device) + inputs["num_inference_steps"] = 11 + inputs["strength"] = 0.75 + image_other = sd_pipe(**inputs).images[0] + + mean_diff = np.abs(image - image_other).mean() + + # images should be very similar + assert mean_diff < 5e-2 + def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" From ed8fd38337c0f75259cae86c4013c6125fe96a61 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 17 Apr 2023 17:19:11 +0200 Subject: [PATCH 18/71] Improve deprecation warnings (#3131) --- src/diffusers/pipelines/pipeline_utils.py | 10 +++++----- tests/models/test_lora_layers.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index c095da1665de..d531d967c3d1 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -201,24 +201,24 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi # .bin, .safetensors, ... 
weight_suffixs = [w.split(".")[-1] for w in weight_names] # -00001-of-00002 - transformers_index_format = "\d{5}-of-\d{5}" + transformers_index_format = r"\d{5}-of-\d{5}" if variant is not None: # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetenstors` variant_file_re = re.compile( - f"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$" + rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$" ) # `text_encoder/pytorch_model.bin.index.fp16.json` variant_index_re = re.compile( - f"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$" + rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$" ) # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetenstors` non_variant_file_re = re.compile( - f"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" + rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" ) # `text_encoder/pytorch_model.bin.index.json` - non_variant_index_re = re.compile(f"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json") + non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json") if variant is not None: variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None} diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 9bcdc5d93301..6f75902d388f 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -71,6 +71,7 @@ def get_dummy_components(self): beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, + steps_offset=1, ) torch.manual_seed(0) vae = AutoencoderKL( From 703307efcc49fbc3f1362344dc5d577e4c4595c8 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 17 Apr 2023 18:16:28 +0200 Subject: [PATCH 19/71] Fix config deprecation (#3129) * Better deprecation message * Better deprecation message * Better doc string * Fixes * fix more * fix more * Improve __getattr__ * correct more * fix more * fix * Improve more * more improvements * fix more * Apply suggestions from code review Co-authored-by: Pedro Cuenca * make style * Fix all rest & add tests & remove old deprecation fns --------- Co-authored-by: Pedro Cuenca --- .../community/unclip_image_interpolation.py | 12 ++-- .../community/unclip_text_interpolation.py | 12 ++-- src/diffusers/configuration_utils.py | 18 +++++ src/diffusers/models/autoencoder_kl.py | 12 +--- src/diffusers/models/modeling_utils.py | 21 +++++- src/diffusers/models/unet_1d.py | 12 +--- src/diffusers/models/unet_2d.py | 12 +--- src/diffusers/models/unet_2d_condition.py | 12 +--- src/diffusers/pipelines/pipeline_utils.py | 69 +++++++++---------- .../pipeline_text_to_video_zero.py | 2 +- .../pipelines/unclip/pipeline_unclip.py | 12 ++-- .../unclip/pipeline_unclip_image_variation.py | 12 ++-- .../versatile_diffusion/modeling_text_unet.py | 15 +--- ...ipeline_versatile_diffusion_dual_guided.py | 2 +- ...ine_versatile_diffusion_image_variation.py | 2 +- ...eline_versatile_diffusion_text_to_image.py | 2 +- src/diffusers/schedulers/scheduling_ddpm.py | 12 +--- src/diffusers/utils/deprecation_utils.py | 4 +- tests/models/test_modeling_common.py | 47 ++++++++++++- tests/pipelines/unclip/test_unclip.py | 8 +-- .../unclip/test_unclip_image_variation.py | 13 ++-- 
tests/schedulers/test_schedulers.py | 44 ++++++++++++ 22 files changed, 209 insertions(+), 146 deletions(-) diff --git a/examples/community/unclip_image_interpolation.py b/examples/community/unclip_image_interpolation.py index d0b54125b688..453ac07af7c6 100644 --- a/examples/community/unclip_image_interpolation.py +++ b/examples/community/unclip_image_interpolation.py @@ -372,9 +372,9 @@ def __call__( self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) decoder_timesteps_tensor = self.decoder_scheduler.timesteps - num_channels_latents = self.decoder.in_channels - height = self.decoder.sample_size - width = self.decoder.sample_size + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + width = self.decoder.config.sample_size decoder_latents = self.prepare_latents( (batch_size, num_channels_latents, height, width), @@ -425,9 +425,9 @@ def __call__( self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) super_res_timesteps_tensor = self.super_res_scheduler.timesteps - channels = self.super_res_first.in_channels // 2 - height = self.super_res_first.sample_size - width = self.super_res_first.sample_size + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size super_res_latents = self.prepare_latents( (batch_size, channels, height, width), diff --git a/examples/community/unclip_text_interpolation.py b/examples/community/unclip_text_interpolation.py index ac6b73d974b6..290f45317004 100644 --- a/examples/community/unclip_text_interpolation.py +++ b/examples/community/unclip_text_interpolation.py @@ -452,9 +452,9 @@ def __call__( self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) decoder_timesteps_tensor = self.decoder_scheduler.timesteps - num_channels_latents = self.decoder.in_channels - height = self.decoder.sample_size - width = self.decoder.sample_size + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + width = self.decoder.config.sample_size decoder_latents = self.prepare_latents( (batch_size, num_channels_latents, height, width), @@ -505,9 +505,9 @@ def __call__( self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) super_res_timesteps_tensor = self.super_res_scheduler.timesteps - channels = self.super_res_first.in_channels // 2 - height = self.super_res_first.sample_size - width = self.super_res_first.sample_size + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size super_res_latents = self.prepare_latents( (batch_size, channels, height, width), diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 45930431351a..772e119fbe97 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -118,6 +118,24 @@ def register_to_config(self, **kwargs): self._internal_dict = FrozenDict(internal_dict) + def __getattr__(self, name: str) -> Any: + """The only reason we overwrite `getattr` here is to gracefully deprecate accessing + config attributes directly. 
See https://github.com/huggingface/diffusers/pull/3129 + + Tihs funtion is mostly copied from PyTorch's __getattr__ overwrite: + https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + """ + + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) + is_attribute = name in self.__dict__ + + if is_in_config and not is_attribute: + deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'scheduler.config.{name}'." + deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) + return self._internal_dict[name] + + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index 5d1c54a9af25..1a8a204d80ce 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -18,7 +18,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, apply_forward_hook, deprecate +from ..utils import BaseOutput, apply_forward_hook from .modeling_utils import ModelMixin from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder @@ -123,16 +123,6 @@ def __init__( self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) self.tile_overlap_factor = 0.25 - @property - def block_out_channels(self): - deprecate( - "block_out_channels", - "1.0.0", - "Accessing `block_out_channels` directly via vae.block_out_channels is deprecated. Please use `vae.config.block_out_channels instead`", - standard_warn=False, - ) - return self.config.block_out_channels - def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (Encoder, Decoder)): module.gradient_checkpointing = value diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 6a849f6f0e45..5363e6330623 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -17,7 +17,7 @@ import inspect import os from functools import partial -from typing import Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union import torch from torch import Tensor, device @@ -32,6 +32,7 @@ WEIGHTS_NAME, _add_variant, _get_model_file, + deprecate, is_accelerate_available, is_safetensors_available, is_torch_version, @@ -156,6 +157,24 @@ class ModelMixin(torch.nn.Module): def __init__(self): super().__init__() + def __getattr__(self, name: str) -> Any: + """The only reason we overwrite `getattr` here is to gracefully deprecate accessing + config attributes directly. 
See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite + __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__': + https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + """ + + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) + is_attribute = name in self.__dict__ + + if is_in_config and not is_attribute: + deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'." + deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3) + return self._internal_dict[name] + + # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + return super().__getattr__(name) + @property def is_gradient_checkpointing(self) -> bool: """ diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py index c7755bb3ed45..34a1d2b5160e 100644 --- a/src/diffusers/models/unet_1d.py +++ b/src/diffusers/models/unet_1d.py @@ -19,7 +19,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, deprecate +from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block @@ -190,16 +190,6 @@ def __init__( fc_dim=block_out_channels[-1] // 4, ) - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - "Accessing `in_channels` directly via unet.in_channels is deprecated. Please use `unet.config.in_channels` instead", - standard_warn=False, - ) - return self.config.in_channels - def forward( self, sample: torch.FloatTensor, diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index a83e4917c143..2a6a1b9de5f2 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -18,7 +18,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, deprecate +from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block @@ -216,16 +216,6 @@ def __init__( self.conv_act = nn.SiLU() self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1) - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - "Accessing `in_channels` directly via unet.in_channels is deprecated. 
Please use `unet.config.in_channels` instead", - standard_warn=False, - ) - return self.config.in_channels - def forward( self, sample: torch.FloatTensor, diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 1b982aedc5de..b2814356939b 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -21,7 +21,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin -from ..utils import BaseOutput, deprecate, logging +from ..utils import BaseOutput, logging from .attention_processor import AttentionProcessor, AttnProcessor from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin @@ -447,16 +447,6 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding ) - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - "Accessing `in_channels` directly via unet.in_channels is deprecated. Please use `unet.config.in_channels` instead", - standard_warn=False, - ) - return self.config.in_channels - @property def attn_processors(self) -> Dict[str, AttentionProcessor]: r""" diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index d531d967c3d1..2d61f1a3700f 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -508,7 +508,7 @@ def register_modules(self, **kwargs): setattr(self, name, module) def __setattr__(self, name: str, value: Any): - if hasattr(self, name) and hasattr(self.config, name): + if name in self.__dict__ and hasattr(self.config, name): # We need to overwrite the config if name exists in config if isinstance(getattr(self.config, name), (tuple, list)): if value is not None and self.config[name][0] is not None: @@ -648,26 +648,25 @@ def module_is_offloaded(module): ) module_names, _ = self._get_signature_keys(self) - module_names = [m for m in module_names if hasattr(self, m)] + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] is_offloaded = pipeline_is_offloaded or pipeline_is_sequentially_offloaded - for name in module_names: - module = getattr(self, name) - if isinstance(module, torch.nn.Module): - module.to(torch_device, torch_dtype) - if ( - module.dtype == torch.float16 - and str(torch_device) in ["cpu"] - and not silence_dtype_warnings - and not is_offloaded - ): - logger.warning( - "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It" - " is not recommended to move them to `cpu` as running them will fail. Please make" - " sure to use an accelerator to run the pipeline in inference, due to the lack of" - " support for`float16` operations on this device in PyTorch. Please, remove the" - " `torch_dtype=torch.float16` argument, or use another device for inference." - ) + for module in modules: + module.to(torch_device, torch_dtype) + if ( + module.dtype == torch.float16 + and str(torch_device) in ["cpu"] + and not silence_dtype_warnings + and not is_offloaded + ): + logger.warning( + "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It" + " is not recommended to move them to `cpu` as running them will fail. Please make" + " sure to use an accelerator to run the pipeline in inference, due to the lack of" + " support for`float16` operations on this device in PyTorch. 
Please, remove the" + " `torch_dtype=torch.float16` argument, or use another device for inference." + ) return self @property @@ -677,12 +676,12 @@ def device(self) -> torch.device: `torch.device`: The torch device on which the pipeline is located. """ module_names, _ = self._get_signature_keys(self) - module_names = [m for m in module_names if hasattr(self, m)] + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] + + for module in modules: + return module.device - for name in module_names: - module = getattr(self, name) - if isinstance(module, torch.nn.Module): - return module.device return torch.device("cpu") @classmethod @@ -1451,13 +1450,12 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): for child in module.children(): fn_recursive_set_mem_eff(child) - module_names, _, _ = self.extract_init_dict(dict(self.config)) - module_names = [m for m in module_names if hasattr(self, m)] + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] - for module_name in module_names: - module = getattr(self, module_name) - if isinstance(module, torch.nn.Module): - fn_recursive_set_mem_eff(module) + for module in modules: + fn_recursive_set_mem_eff(module) def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" @@ -1484,10 +1482,9 @@ def disable_attention_slicing(self): self.enable_attention_slicing(None) def set_attention_slice(self, slice_size: Optional[int]): - module_names, _, _ = self.extract_init_dict(dict(self.config)) - module_names = [m for m in module_names if hasattr(self, m)] + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module) and hasattr(m, "set_attention_slice")] - for module_name in module_names: - module = getattr(self, module_name) - if isinstance(module, torch.nn.Module) and hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size) + for module in modules: + module.set_attention_slice(slice_size) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index cf5e6e399a77..5b163bbbc8f5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -441,7 +441,7 @@ def __call__( timesteps = self.scheduler.timesteps # Prepare latent variables - num_channels_latents = self.unet.in_channels + num_channels_latents = self.unet.config.in_channels latents = self.prepare_latents( batch_size * num_videos_per_prompt, num_channels_latents, diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py index 3aac39b3a3b0..abbb48ce8f46 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -413,9 +413,9 @@ def __call__( self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) decoder_timesteps_tensor = self.decoder_scheduler.timesteps - num_channels_latents = self.decoder.in_channels - height = self.decoder.sample_size - width = self.decoder.sample_size + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + 
width = self.decoder.config.sample_size decoder_latents = self.prepare_latents( (batch_size, num_channels_latents, height, width), @@ -466,9 +466,9 @@ def __call__( self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) super_res_timesteps_tensor = self.super_res_scheduler.timesteps - channels = self.super_res_first.in_channels // 2 - height = self.super_res_first.sample_size - width = self.super_res_first.sample_size + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size super_res_latents = self.prepare_latents( (batch_size, channels, height, width), diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py index 56d522354d9a..30d74cd36bb0 100644 --- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -339,9 +339,9 @@ def __call__( self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) decoder_timesteps_tensor = self.decoder_scheduler.timesteps - num_channels_latents = self.decoder.in_channels - height = self.decoder.sample_size - width = self.decoder.sample_size + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + width = self.decoder.config.sample_size if decoder_latents is None: decoder_latents = self.prepare_latents( @@ -393,9 +393,9 @@ def __call__( self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) super_res_timesteps_tensor = self.super_res_scheduler.timesteps - channels = self.super_res_first.in_channels // 2 - height = self.super_res_first.sample_size - width = self.super_res_first.sample_size + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size if super_res_latents is None: super_res_latents = self.prepare_latents( diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 35ddfcadc3cb..4377be1181a8 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -18,7 +18,7 @@ from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from ...models.transformer_2d import Transformer2DModel from ...models.unet_2d_condition import UNet2DConditionOutput -from ...utils import deprecate, logging +from ...utils import logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -544,19 +544,6 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding ) - @property - def in_channels(self): - deprecate( - "in_channels", - "1.0.0", - ( - "Accessing `in_channels` directly via unet.in_channels is deprecated. 
Please use" - " `unet.config.in_channels` instead" - ), - standard_warn=False, - ) - return self.config.in_channels - @property def attn_processors(self) -> Dict[str, AttentionProcessor]: r""" diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index 0f385ed6612c..661a1bd3cf73 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -533,7 +533,7 @@ def __call__( timesteps = self.scheduler.timesteps # 5. Prepare latent variables - num_channels_latents = self.image_unet.in_channels + num_channels_latents = self.image_unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 2b47184d7773..e3a2ee370362 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -378,7 +378,7 @@ def __call__( timesteps = self.scheduler.timesteps # 5. Prepare latent variables - num_channels_latents = self.image_unet.in_channels + num_channels_latents = self.image_unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index fdca625fd99d..26b9be2bfa76 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -452,7 +452,7 @@ def __call__( timesteps = self.scheduler.timesteps # 5. Prepare latent variables - num_channels_latents = self.image_unet.in_channels + num_channels_latents = self.image_unet.config.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 2bc34bb8b444..a8a71fe420aa 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -22,7 +22,7 @@ import torch from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, deprecate, randn_tensor +from ..utils import BaseOutput, randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin @@ -168,16 +168,6 @@ def __init__( self.variance_type = variance_type - @property - def num_train_timesteps(self): - deprecate( - "num_train_timesteps", - "1.0.0", - "Accessing `num_train_timesteps` directly via scheduler.num_train_timesteps is deprecated. 
Please use `scheduler.config.num_train_timesteps instead`", - standard_warn=False, - ) - return self.config.num_train_timesteps - def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the diff --git a/src/diffusers/utils/deprecation_utils.py b/src/diffusers/utils/deprecation_utils.py index 6bdda664e102..f482deddd2f4 100644 --- a/src/diffusers/utils/deprecation_utils.py +++ b/src/diffusers/utils/deprecation_utils.py @@ -5,7 +5,7 @@ from packaging import version -def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn=True): +def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn=True, stacklevel=2): from .. import __version__ deprecated_kwargs = take_from @@ -32,7 +32,7 @@ def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn if warning is not None: warning = warning + " " if standard_warn else "" - warnings.warn(warning + message, FutureWarning, stacklevel=2) + warnings.warn(warning + message, FutureWarning, stacklevel=stacklevel) if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0: call_frame = inspect.getouterframes(inspect.currentframe())[1] diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 40aba3b24967..4a94a77fcabb 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -26,8 +26,8 @@ from diffusers.models import UNet2DConditionModel from diffusers.training_utils import EMAModel -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils import logging, torch_device +from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu class ModelUtilsTest(unittest.TestCase): @@ -155,6 +155,49 @@ def test_from_save_pretrained(self): max_diff = (image - new_image).abs().sum().item() self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") + def test_getattr_is_correct(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + + # save some things to test + model.dummy_attribute = 5 + model.register_to_config(test_attribute=5) + + logger = logging.get_logger("diffusers.models.modeling_utils") + # 30 for warning + logger.setLevel(30) + with CaptureLogger(logger) as cap_logger: + assert hasattr(model, "dummy_attribute") + assert getattr(model, "dummy_attribute") == 5 + assert model.dummy_attribute == 5 + + # no warning should be thrown + assert cap_logger.out == "" + + logger = logging.get_logger("diffusers.models.modeling_utils") + # 30 for warning + logger.setLevel(30) + with CaptureLogger(logger) as cap_logger: + assert hasattr(model, "save_pretrained") + fn = model.save_pretrained + fn_1 = getattr(model, "save_pretrained") + + assert fn == fn_1 + # no warning should be thrown + assert cap_logger.out == "" + + # warning should be thrown + with self.assertWarns(FutureWarning): + assert model.test_attribute == 5 + + with self.assertWarns(FutureWarning): + assert getattr(model, "test_attribute") == 5 + + with self.assertRaises(AttributeError) as error: + model.does_not_exist + + assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'" + def test_from_save_pretrained_variant(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() diff 
--git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index 4df3e4d3828b..d2c699ea501d 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -293,16 +293,16 @@ class DummyScheduler: prior_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() ) - shape = (batch_size, decoder.in_channels, decoder.sample_size, decoder.sample_size) + shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size) decoder_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() ) shape = ( batch_size, - super_res_first.in_channels // 2, - super_res_first.sample_size, - super_res_first.sample_size, + super_res_first.config.in_channels // 2, + super_res_first.config.sample_size, + super_res_first.config.sample_size, ) super_res_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py index 57d15559cc75..c1b8be9cd49e 100644 --- a/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/tests/pipelines/unclip/test_unclip_image_variation.py @@ -379,16 +379,21 @@ class DummyScheduler: dtype = pipe.decoder.dtype batch_size = 1 - shape = (batch_size, pipe.decoder.in_channels, pipe.decoder.sample_size, pipe.decoder.sample_size) + shape = ( + batch_size, + pipe.decoder.config.in_channels, + pipe.decoder.config.sample_size, + pipe.decoder.config.sample_size, + ) decoder_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() ) shape = ( batch_size, - pipe.super_res_first.in_channels // 2, - pipe.super_res_first.sample_size, - pipe.super_res_first.sample_size, + pipe.super_res_first.config.in_channels // 2, + pipe.super_res_first.config.sample_size, + pipe.super_res_first.config.sample_size, ) super_res_latents = pipe.prepare_latents( shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() diff --git a/tests/schedulers/test_schedulers.py b/tests/schedulers/test_schedulers.py index bfbf5cbc798f..69cddb36dde2 100755 --- a/tests/schedulers/test_schedulers.py +++ b/tests/schedulers/test_schedulers.py @@ -596,3 +596,47 @@ def test_trained_betas(self): new_scheduler = scheduler_class.from_pretrained(tmpdirname) assert scheduler.betas.tolist() == new_scheduler.betas.tolist() + + def test_getattr_is_correct(self): + for scheduler_class in self.scheduler_classes: + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + # save some things to test + scheduler.dummy_attribute = 5 + scheduler.register_to_config(test_attribute=5) + + logger = logging.get_logger("diffusers.configuration_utils") + # 30 for warning + logger.setLevel(30) + with CaptureLogger(logger) as cap_logger: + assert hasattr(scheduler, "dummy_attribute") + assert getattr(scheduler, "dummy_attribute") == 5 + assert scheduler.dummy_attribute == 5 + + # no warning should be thrown + assert cap_logger.out == "" + + logger = logging.get_logger("diffusers.schedulers.schedulering_utils") + # 30 for warning + logger.setLevel(30) + with CaptureLogger(logger) as cap_logger: + assert hasattr(scheduler, "save_pretrained") + fn = scheduler.save_pretrained + fn_1 = 
getattr(scheduler, "save_pretrained") + + assert fn == fn_1 + # no warning should be thrown + assert cap_logger.out == "" + + # warning should be thrown + with self.assertWarns(FutureWarning): + assert scheduler.test_attribute == 5 + + with self.assertWarns(FutureWarning): + assert getattr(scheduler, "test_attribute") == 5 + + with self.assertRaises(AttributeError) as error: + scheduler.does_not_exist + + assert str(error.exception) == f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'" From 3b641eabe9876e7c48977b35331fda54ce972b4a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 18 Apr 2023 08:36:13 +0530 Subject: [PATCH 20/71] feat: verfication of multi-gpu support for select examples. (#3126) * feat: verfication of multi-gpu support for select examples. * add: multi-gpu training sections to the relvant doc pages. --- docs/source/en/training/controlnet.mdx | 23 +++++++++++++++++ docs/source/en/training/instructpix2pix.mdx | 21 ++++++++++++++++ docs/source/en/training/text2image.mdx | 25 +++++++++++++++++++ .../en/training/unconditional_training.mdx | 20 +++++++++++++++ examples/controlnet/README.md | 23 +++++++++++++++++ examples/instruct_pix2pix/README.md | 21 ++++++++++++++++ examples/text_to_image/README.md | 25 +++++++++++++++++++ examples/text_to_image/train_text_to_image.py | 4 +-- .../unconditional_image_generation/README.md | 23 ++++++++++++++++- 9 files changed, 182 insertions(+), 3 deletions(-) diff --git a/docs/source/en/training/controlnet.mdx b/docs/source/en/training/controlnet.mdx index 6b7539b89b07..7a5454107b83 100644 --- a/docs/source/en/training/controlnet.mdx +++ b/docs/source/en/training/controlnet.mdx @@ -113,6 +113,29 @@ accelerate launch train_controlnet.py \ --gradient_accumulation_steps=4 ``` +## Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +export MODEL_DIR="runwayml/stable-diffusion-v1-5" +export OUTPUT_DIR="path to save model" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_controlnet.py \ + --pretrained_model_name_or_path=$MODEL_DIR \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=fusing/fill50k \ + --resolution=512 \ + --learning_rate=1e-5 \ + --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --train_batch_size=4 \ + --mixed_precision="fp16" \ + --tracker_project_name="controlnet-demo" \ + --report_to=wandb +``` + ## Example results #### After 300 steps with batch size 8 diff --git a/docs/source/en/training/instructpix2pix.mdx b/docs/source/en/training/instructpix2pix.mdx index e6f050b34acf..c485db6d6b20 100644 --- a/docs/source/en/training/instructpix2pix.mdx +++ b/docs/source/en/training/instructpix2pix.mdx @@ -126,6 +126,27 @@ accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \ ***Note: In the original paper, the authors observed that even when the model is trained with an image resolution of 256x256, it generalizes well to bigger resolutions such as 512x512. This is likely because of the larger dataset they used during training.*** + ## Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. 
Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +accelerate launch --mixed_precision="fp16" --multi_gpu train_instruct_pix2pix.py \ + --pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5 \ + --dataset_name=sayakpaul/instructpix2pix-1000-samples \ + --use_ema \ + --enable_xformers_memory_efficient_attention \ + --resolution=512 --random_flip \ + --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \ + --max_train_steps=15000 \ + --checkpointing_steps=5000 --checkpoints_total_limit=1 \ + --learning_rate=5e-05 --lr_warmup_steps=0 \ + --conditioning_dropout_prob=0.05 \ + --mixed_precision=fp16 \ + --seed=42 +``` + ## Inference Once training is complete, we can perform inference: diff --git a/docs/source/en/training/text2image.mdx b/docs/source/en/training/text2image.mdx index 4f57ccf94de0..70f8c003a787 100644 --- a/docs/source/en/training/text2image.mdx +++ b/docs/source/en/training/text2image.mdx @@ -106,6 +106,31 @@ accelerate launch train_text_to_image.py \ --lr_scheduler="constant" --lr_warmup_steps=0 \ --output_dir=${OUTPUT_DIR} ``` + +#### Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export dataset_name="lambdalabs/pokemon-blip-captions" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$dataset_name \ + --use_ema \ + --resolution=512 --center_crop --random_flip \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --max_train_steps=15000 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model" +``` + With Flax, it's possible to train a Stable Diffusion model faster on TPUs and GPUs thanks to [@duongna211](https://github.com/duongna21). This is very efficient on TPU hardware but works great on GPUs too. The Flax training script doesn't support features like gradient checkpointing or gradient accumulation yet, so you'll need a GPU with at least 30GB of memory or a TPU v3. diff --git a/docs/source/en/training/unconditional_training.mdx b/docs/source/en/training/unconditional_training.mdx index 26517fd1fcf8..514932d4b22d 100644 --- a/docs/source/en/training/unconditional_training.mdx +++ b/docs/source/en/training/unconditional_training.mdx @@ -122,6 +122,26 @@ accelerate launch train_unconditional.py \ +### Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. 
Here is an example command: + +```bash +accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \ + --dataset_name="huggan/pokemon" \ + --resolution=64 --center_crop --random_flip \ + --output_dir="ddpm-ema-pokemon-64" \ + --train_batch_size=16 \ + --num_epochs=100 \ + --gradient_accumulation_steps=1 \ + --use_ema \ + --learning_rate=1e-4 \ + --lr_warmup_steps=500 \ + --mixed_precision="fp16" \ + --logger="wandb" +``` + ## Finetuning with your own data There are two ways to finetune a model on your own dataset: diff --git a/examples/controlnet/README.md b/examples/controlnet/README.md index 387755624729..571e9e708cf2 100644 --- a/examples/controlnet/README.md +++ b/examples/controlnet/README.md @@ -96,6 +96,29 @@ accelerate launch train_controlnet.py \ --gradient_accumulation_steps=4 ``` +## Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +export MODEL_DIR="runwayml/stable-diffusion-v1-5" +export OUTPUT_DIR="path to save model" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_controlnet.py \ + --pretrained_model_name_or_path=$MODEL_DIR \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=fusing/fill50k \ + --resolution=512 \ + --learning_rate=1e-5 \ + --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --train_batch_size=4 \ + --mixed_precision="fp16" \ + --tracker_project_name="controlnet-demo" \ + --report_to=wandb +``` + ## Example results #### After 300 steps with batch size 8 diff --git a/examples/instruct_pix2pix/README.md b/examples/instruct_pix2pix/README.md index 02f0fed04299..94a7bd2a98f6 100644 --- a/examples/instruct_pix2pix/README.md +++ b/examples/instruct_pix2pix/README.md @@ -113,6 +113,27 @@ accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \ ***Note: In the original paper, the authors observed that even when the model is trained with an image resolution of 256x256, it generalizes well to bigger resolutions such as 512x512. This is likely because of the larger dataset they used during training.*** + ## Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. 
Here is an example command: + +```bash +accelerate launch --mixed_precision="fp16" --multi_gpu train_instruct_pix2pix.py \ + --pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5 \ + --dataset_name=sayakpaul/instructpix2pix-1000-samples \ + --use_ema \ + --enable_xformers_memory_efficient_attention \ + --resolution=512 --random_flip \ + --train_batch_size=4 --gradient_accumulation_steps=4 --gradient_checkpointing \ + --max_train_steps=15000 \ + --checkpointing_steps=5000 --checkpoints_total_limit=1 \ + --learning_rate=5e-05 --lr_warmup_steps=0 \ + --conditioning_dropout_prob=0.05 \ + --mixed_precision=fp16 \ + --seed=42 +``` + ## Inference Once training is complete, we can perform inference: diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index c84db0ceee64..406a64b3759f 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -111,6 +111,31 @@ image = pipe(prompt="yoda").images[0] image.save("yoda-pokemon.png") ``` +#### Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export dataset_name="lambdalabs/pokemon-blip-captions" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$dataset_name \ + --use_ema \ + --resolution=512 --center_crop --random_flip \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --max_train_steps=15000 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model" +``` + + #### Training with Min-SNR weighting We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 4bbf4706f01c..67724698c099 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -64,8 +64,8 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight pipeline = StableDiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, - vae=vae, - text_encoder=text_encoder, + vae=accelerator.unwrap_model(vae), + text_encoder=accelerator.unwrap_model(text_encoder), tokenizer=tokenizer, unet=accelerator.unwrap_model(unet), safety_checker=None, diff --git a/examples/unconditional_image_generation/README.md b/examples/unconditional_image_generation/README.md index db06d9011681..d83dc928c7a1 100644 --- a/examples/unconditional_image_generation/README.md +++ b/examples/unconditional_image_generation/README.md @@ -1,4 +1,4 @@ -## Training examples +## Training an unconditional diffusion model Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets). @@ -76,6 +76,27 @@ A full training run takes 2 hours on 4xV100 GPUs. +### Training with multiple GPUs + +`accelerate` allows for seamless multi-GPU training. 
Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) +for running distributed training with `accelerate`. Here is an example command: + +```bash +accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \ + --dataset_name="huggan/pokemon" \ + --resolution=64 --center_crop --random_flip \ + --output_dir="ddpm-ema-pokemon-64" \ + --train_batch_size=16 \ + --num_epochs=100 \ + --gradient_accumulation_steps=1 \ + --use_ema \ + --learning_rate=1e-4 \ + --lr_warmup_steps=500 \ + --mixed_precision="fp16" \ + --logger="wandb" +``` + +To be able to use Weights and Biases (`wandb`) as a logger you need to install the library: `pip install wandb`. ### Using your own data From cd8b7507c2c674046be921a3954f64a9d1e83d0f Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 18 Apr 2023 02:02:25 -1000 Subject: [PATCH 21/71] speed up attend-and-excite fast tests (#3079) --- .../test_stable_diffusion_attend_and_excite.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index f153ae08cbb6..846e251f3ce2 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -44,7 +44,7 @@ def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( block_out_channels=(32, 64), - layers_per_block=2, + layers_per_block=1, sample_size=32, in_channels=4, out_channels=4, @@ -111,7 +111,7 @@ def get_dummy_inputs(self, device, seed=0): "prompt": "a cat and a frog", "token_indices": [2, 5], "generator": generator, - "num_inference_steps": 2, + "num_inference_steps": 1, "guidance_scale": 6.0, "output_type": "numpy", "max_iter_to_alter": 2, @@ -132,13 +132,18 @@ def test_inference(self): image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 64, 64, 3)) - expected_slice = np.array([0.5743, 0.6081, 0.4975, 0.5021, 0.5441, 0.4699, 0.4988, 0.4841, 0.4851]) + expected_slice = np.array( + [0.63905364, 0.62897307, 0.48599017, 0.5133624, 0.5550048, 0.45769516, 0.50326973, 0.5023139, 0.45384496] + ) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 1e-3) def test_inference_batch_consistent(self): # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches - self._test_inference_batch_consistent(batch_sizes=[2, 4]) + self._test_inference_batch_consistent(batch_sizes=[1, 2]) + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(batch_size=2) @require_torch_gpu From 8ecdd3ef657b168a8058a99772871cae91a21b63 Mon Sep 17 00:00:00 2001 From: Cristian Garcia Date: Tue, 18 Apr 2023 07:03:00 -0500 Subject: [PATCH 22/71] Optimize log_validation in train_controlnet_flax (#3110) extract pipeline from log_validation --- examples/controlnet/train_controlnet_flax.py | 35 +++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index 0b413ace09d2..24b32e7f4301 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -76,20 +76,11 @@ def image_grid(imgs, rows, cols): return grid -def log_validation(controlnet, controlnet_params, tokenizer, args, rng, weight_dtype): - 
logger.info("Running validation... ") +def log_validation(pipeline, pipeline_params, controlnet_params, tokenizer, args, rng, weight_dtype): + logger.info("Running validation...") - pipeline, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( - args.pretrained_model_name_or_path, - tokenizer=tokenizer, - controlnet=controlnet, - safety_checker=None, - dtype=weight_dtype, - revision=args.revision, - from_pt=args.from_pt, - ) - params = jax_utils.replicate(params) - params["controlnet"] = controlnet_params + pipeline_params = pipeline_params.copy() + pipeline_params["controlnet"] = controlnet_params num_samples = jax.device_count() prng_seed = jax.random.split(rng, jax.device_count()) @@ -121,7 +112,7 @@ def log_validation(controlnet, controlnet_params, tokenizer, args, rng, weight_d images = pipeline( prompt_ids=prompt_ids, image=processed_image, - params=params, + params=pipeline_params, prng_seed=prng_seed, num_inference_steps=50, jit=True, @@ -176,6 +167,7 @@ def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=N - text-to-image - diffusers - controlnet +- jax-diffusers-event inference: true --- """ @@ -800,6 +792,17 @@ def main(): ]: controlnet_params[key] = unet_params[key] + pipeline, pipeline_params = FlaxStableDiffusionControlNetPipeline.from_pretrained( + args.pretrained_model_name_or_path, + tokenizer=tokenizer, + controlnet=controlnet, + safety_checker=None, + dtype=weight_dtype, + revision=args.revision, + from_pt=args.from_pt, + ) + pipeline_params = jax_utils.replicate(pipeline_params) + # Optimization if args.scale_lr: args.learning_rate = args.learning_rate * total_train_batch_size @@ -1073,7 +1076,7 @@ def l2(xs): and global_step % args.validation_steps == 0 and jax.process_index() == 0 ): - _ = log_validation(controlnet, state.params, tokenizer, args, validation_rng, weight_dtype) + _ = log_validation(pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype) if global_step % args.logging_steps == 0 and jax.process_index() == 0: if args.report_to == "wandb": @@ -1105,7 +1108,7 @@ def l2(xs): if args.validation_prompt is not None: if args.profile_validation: jax.profiler.start_trace(args.output_dir) - image_logs = log_validation(controlnet, state.params, tokenizer, args, validation_rng, weight_dtype) + image_logs = log_validation(pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype) if args.profile_validation: jax.profiler.stop_trace() else: From f2df39fa0e6246d13aea03364366b2d53a4ab5f9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 18 Apr 2023 14:03:17 +0200 Subject: [PATCH 23/71] make style --- examples/controlnet/train_controlnet_flax.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index 24b32e7f4301..b25f9325403f 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -1076,7 +1076,9 @@ def l2(xs): and global_step % args.validation_steps == 0 and jax.process_index() == 0 ): - _ = log_validation(pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype) + _ = log_validation( + pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype + ) if global_step % args.logging_steps == 0 and jax.process_index() == 0: if args.report_to == "wandb": @@ -1108,7 +1110,9 @@ def l2(xs): if args.validation_prompt is not None: if 
args.profile_validation: jax.profiler.start_trace(args.output_dir) - image_logs = log_validation(pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype) + image_logs = log_validation( + pipeline, pipeline_params, state.params, tokenizer, args, validation_rng, weight_dtype + ) if args.profile_validation: jax.profiler.stop_trace() else: From 4bc157ffa90a2a967247952a82ea76bea5c5d990 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 18 Apr 2023 17:35:12 +0200 Subject: [PATCH 24/71] Correct textual inversion readme (#3145) * Update README.md * Apply suggestions from code review --- examples/textual_inversion/README.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/textual_inversion/README.md b/examples/textual_inversion/README.md index 3a7c96be69fb..4d420b284f38 100644 --- a/examples/textual_inversion/README.md +++ b/examples/textual_inversion/README.md @@ -39,29 +39,31 @@ accelerate config ### Cat toy example -You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree. - -You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens). - -Run the following command to authenticate your token +First, let's login so that we can upload the checkpoint to the Hub during training: ```bash huggingface-cli login ``` -If you have already cloned the repo, then you won't need to go through these steps. +Now let's get our dataset. For this example we will use some cat images: https://huggingface.co/datasets/diffusers/cat_toy_example . -
+Let's first download it locally: -Now let's get our dataset.Download 3-4 images from [here](https://drive.google.com/drive/folders/1fmJMs25nxS_rSNqS5hTcRdLem_YQXbq5) and save them in a directory. This will be our training data. +```py +from huggingface_hub import snapshot_download -And launch the training using +local_dir = "./cat" +snapshot_download("diffusers/cat_toy_example", local_dir=local_dir, repo_type="dataset", ignore_patterns=".gitattributes") +``` + +This will be our training data. +Now we can launch the training using **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export DATA_DIR="path-to-dir-containing-images" +export DATA_DIR="./cat" accelerate launch textual_inversion.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -75,6 +77,7 @@ accelerate launch textual_inversion.py \ --learning_rate=5.0e-04 --scale_lr \ --lr_scheduler="constant" \ --lr_warmup_steps=0 \ + --push_to_hub \ --output_dir="textual_inversion_cat" ``` From f0c74e9a756daf5295105444470655aacce5cd9c Mon Sep 17 00:00:00 2001 From: Will Berman Date: Tue, 18 Apr 2023 14:13:16 -0700 Subject: [PATCH 25/71] Add unet act fn to other model components (#3136) Adding act fn config to the unet timestep class embedding and conv activation. The custom activation defaults to silu which is the default activation function for both the conv act and the timestep class embeddings so default behavior is not changed. The only unet which use the custom activation is the stable diffusion latent upscaler https://huggingface.co/stabilityai/sd-x2-latent-upscaler/blob/main/unet/config.json (I ran a script against the hub to confirm). The latent upscaler does not use the conv activation nor the timestep class embeddings so we don't change its behavior. 
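For reference, a minimal standalone sketch of the activation lookup that the hunks below introduce. The helper name `resolve_conv_act` is mine and not part of the patch; it only covers the four names handled in the diff, and `"silu"` remains the default, so existing configs keep their behavior:

```py
import torch
import torch.nn as nn
import torch.nn.functional as F


def resolve_conv_act(act_fn: str):
    # Mirrors the if/elif chain added to the UNet constructors in the hunks below.
    if act_fn == "swish":
        return lambda x: F.silu(x)
    elif act_fn == "mish":
        return nn.Mish()
    elif act_fn == "silu":
        return nn.SiLU()
    elif act_fn == "gelu":
        return nn.GELU()
    raise ValueError(f"Unsupported activation function: {act_fn}")


conv_act = resolve_conv_act("silu")
print(conv_act(torch.randn(2, 4)).shape)  # torch.Size([2, 4])
```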
--- src/diffusers/models/unet_2d_condition.py | 15 +++++++++++++-- .../versatile_diffusion/modeling_text_unet.py | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index b2814356939b..29de8734d4e7 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -248,7 +248,7 @@ def __init__( if class_embed_type is None and num_class_embeds is not None: self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -437,7 +437,18 @@ def __init__( self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps ) - self.conv_act = nn.SiLU() + + if act_fn == "swish": + self.conv_act = lambda x: F.silu(x) + elif act_fn == "mish": + self.conv_act = nn.Mish() + elif act_fn == "silu": + self.conv_act = nn.SiLU() + elif act_fn == "gelu": + self.conv_act = nn.GELU() + else: + raise ValueError(f"Unsupported activation function: {act_fn}") + else: self.conv_norm_out = None self.conv_act = None diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 4377be1181a8..b20f18c485d0 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -345,7 +345,7 @@ def __init__( if class_embed_type is None and num_class_embeds is not None: self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -534,7 +534,18 @@ def __init__( self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps ) - self.conv_act = nn.SiLU() + + if act_fn == "swish": + self.conv_act = lambda x: F.silu(x) + elif act_fn == "mish": + self.conv_act = nn.Mish() + elif act_fn == "silu": + self.conv_act = nn.SiLU() + elif act_fn == "gelu": + self.conv_act = nn.GELU() + else: + raise ValueError(f"Unsupported activation function: {act_fn}") + else: self.conv_norm_out = None self.conv_act = None From fc1883918ff73564e088a7c655a96f52ff915045 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Tue, 18 Apr 2023 15:05:41 -0700 Subject: [PATCH 26/71] class labels timestep embeddings projection dtype cast (#3137) This mimics the dtype cast for the standard time embeddings --- src/diffusers/models/unet_2d_condition.py | 6 +++++- .../pipelines/versatile_diffusion/modeling_text_unet.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 29de8734d4e7..b4997a257643 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -659,7 +659,7 @@ def 
forward( t_emb = self.time_proj(timesteps) - # timesteps does not contain any weights and will always return f32 tensors + # `Timesteps` does not contain any weights and will always return f32 tensors # but time_embedding might actually be running in fp16. so we need to cast here. # there might be better ways to encapsulate this. t_emb = t_emb.to(dtype=self.dtype) @@ -673,6 +673,10 @@ def forward( if self.config.class_embed_type == "timestep": class_labels = self.time_proj(class_labels) + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. + class_labels = class_labels.to(dtype=sample.dtype) + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) if self.config.class_embeddings_concat: diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index b20f18c485d0..2a7b80d01da7 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -756,7 +756,7 @@ def forward( t_emb = self.time_proj(timesteps) - # timesteps does not contain any weights and will always return f32 tensors + # `Timesteps` does not contain any weights and will always return f32 tensors # but time_embedding might actually be running in fp16. so we need to cast here. # there might be better ways to encapsulate this. t_emb = t_emb.to(dtype=self.dtype) @@ -770,6 +770,10 @@ def forward( if self.config.class_embed_type == "timestep": class_labels = self.time_proj(class_labels) + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. + class_labels = class_labels.to(dtype=sample.dtype) + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) if self.config.class_embeddings_concat: From bdeff4d64a57e556c2b62f887da03a2c37c54d54 Mon Sep 17 00:00:00 2001 From: cmdr2 Date: Wed, 19 Apr 2023 18:07:07 +0530 Subject: [PATCH 27/71] [ckpt loader] Allow loading the Inpaint and Img2Img pipelines, while loading a ckpt model (#2705) * [ckpt loader] Allow loading the Inpaint and Img2Img pipelines, while loading a ckpt model * Address review comment from PR * PyLint formatting * Some more pylint fixes, unrelated to our change * Another pylint fix * Styling fix --- .../stable_diffusion/convert_from_ckpt.py | 97 +++++++++++++++---- 1 file changed, 78 insertions(+), 19 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index a16213639526..dbc1b27e88be 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -45,6 +45,8 @@ PNDMScheduler, PriorTransformer, StableDiffusionControlNetPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, @@ -979,6 +981,7 @@ def download_from_original_stable_diffusion_ckpt( image_size: int = 512, prediction_type: str = None, model_type: str = None, + is_img2img: bool = False, extract_ema: bool = False, scheduler_type: str = "pndm", num_in_channels: Optional[int] = None, @@ -1018,6 +1021,8 @@ def download_from_original_stable_diffusion_ckpt( model_type (`str`, *optional*, defaults to `None`): The pipeline type. 
`None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder", "FrozenCLIPEmbedder", "PaintByExample"]`. + is_img2img (`bool`, *optional*, defaults to `False`): + Whether the model should be loaded as an img2img pipeline. extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for @@ -1193,16 +1198,44 @@ def download_from_original_stable_diffusion_ckpt( requires_safety_checker=False, ) else: - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) + if ( + hasattr(original_config, "model") + and hasattr(original_config.model, "target") + and "LatentInpaintDiffusion" in original_config.model.target + ): + pipe = StableDiffusionInpaintPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + else: + if is_img2img: + pipe = StableDiffusionImg2ImgPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + else: + pipe = StableDiffusionPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) else: image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components( original_config, clip_stats_path=clip_stats_path, device=device @@ -1293,15 +1326,41 @@ def download_from_original_stable_diffusion_ckpt( feature_extractor=feature_extractor, ) else: - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) + if ( + hasattr(original_config, "model") + and hasattr(original_config.model, "target") + and "LatentInpaintDiffusion" in original_config.model.target + ): + pipe = StableDiffusionInpaintPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + else: + if is_img2img: + pipe = StableDiffusionImg2ImgPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + else: + pipe = StableDiffusionPipeline( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) From 86ecd4b795f865b5b615b8c54991c177bb3dbef5 Mon Sep 17 00:00:00 2001 From: 1lint <105617163+1lint@users.noreply.github.com> Date: Wed, 19 Apr 2023 11:07:36 -0500 Subject: [PATCH 28/71] add from_ckpt method as Mixin (#2318) * add mixin class for pipeline from original sd ckpt * Improve * make style * merge main into * Improve more * fix more * up * Apply suggestions from code review * finish docs * rename * make style 
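For orientation, a short usage sketch of the entry point this commit adds. The checkpoint link is taken from the docstring example added later in this patch; running it requires network access and a CUDA device, so treat it as illustrative rather than a test:

```py
import torch

from diffusers import StableDiffusionPipeline

# Build a full pipeline directly from an original CompVis-style checkpoint on the Hub.
pipe = StableDiffusionPipeline.from_ckpt(
    "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
    torch_dtype=torch.float16,
)
pipe.to("cuda")
```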
--------- Co-authored-by: Patrick von Platen --- docs/source/en/api/loaders.mdx | 4 + .../pipelines/stable_diffusion/controlnet.mdx | 1 + .../pipelines/stable_diffusion/depth2img.mdx | 5 +- .../pipelines/stable_diffusion/img2img.mdx | 6 +- .../pipelines/stable_diffusion/inpaint.mdx | 5 +- .../pipelines/stable_diffusion/pix2pix.mdx | 3 + .../pipelines/stable_diffusion/text2img.mdx | 4 + src/diffusers/__init__.py | 1 - src/diffusers/loaders.py | 198 ++++++++++++++++++ .../alt_diffusion/pipeline_alt_diffusion.py | 8 + .../pipeline_alt_diffusion_img2img.py | 8 + .../stable_diffusion/convert_from_ckpt.py | 146 +++++-------- .../pipeline_stable_diffusion.py | 12 +- .../pipeline_stable_diffusion_controlnet.py | 3 + .../pipeline_stable_diffusion_depth2img.py | 11 +- .../pipeline_stable_diffusion_img2img.py | 12 +- .../pipeline_stable_diffusion_inpaint.py | 11 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 14 +- ...eline_stable_diffusion_instruct_pix2pix.py | 11 +- .../dummy_torch_and_transformers_objects.py | 15 -- .../stable_diffusion/test_stable_diffusion.py | 57 +++++ 21 files changed, 410 insertions(+), 125 deletions(-) diff --git a/docs/source/en/api/loaders.mdx b/docs/source/en/api/loaders.mdx index 8cbf21b8e0cf..20134a0afe66 100644 --- a/docs/source/en/api/loaders.mdx +++ b/docs/source/en/api/loaders.mdx @@ -36,3 +36,7 @@ API to load such adapter neural networks via the [`loaders.py` module](https://g ### LoraLoaderMixin [[autodoc]] loaders.LoraLoaderMixin + +### FromCkptMixin + +[[autodoc]] loaders.FromCkptMixin diff --git a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx index af859177c002..dabd3ded31ce 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/controlnet.mdx @@ -308,6 +308,7 @@ All checkpoints can be found under the authors' namespace [lllyasviel](https://h - disable_vae_slicing - enable_xformers_memory_efficient_attention - disable_xformers_memory_efficient_attention + - load_textual_inversion ## FlaxStableDiffusionControlNetPipeline [[autodoc]] FlaxStableDiffusionControlNetPipeline diff --git a/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx b/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx index c46576ff2887..a91167bac58c 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/depth2img.mdx @@ -30,4 +30,7 @@ Available Checkpoints are: - enable_attention_slicing - disable_attention_slicing - enable_xformers_memory_efficient_attention - - disable_xformers_memory_efficient_attention \ No newline at end of file + - disable_xformers_memory_efficient_attention + - load_textual_inversion + - load_lora_weights + - save_lora_weights diff --git a/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx b/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx index 09bfb853f9c9..7959c588608b 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx @@ -30,7 +30,11 @@ proposed by Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan - disable_attention_slicing - enable_xformers_memory_efficient_attention - disable_xformers_memory_efficient_attention + - load_textual_inversion + - from_ckpt + - load_lora_weights + - save_lora_weights [[autodoc]] FlaxStableDiffusionImg2ImgPipeline - all - - __call__ \ No newline at end of file + - __call__ diff --git 
a/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx b/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx index 33e84a63261f..39e5ae0fd37d 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/inpaint.mdx @@ -31,7 +31,10 @@ Available checkpoints are: - disable_attention_slicing - enable_xformers_memory_efficient_attention - disable_xformers_memory_efficient_attention + - load_textual_inversion + - load_lora_weights + - save_lora_weights [[autodoc]] FlaxStableDiffusionInpaintPipeline - all - - __call__ \ No newline at end of file + - __call__ diff --git a/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx b/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx index 42cd4b896b2e..d01f1df23385 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx @@ -68,3 +68,6 @@ images[0].save("snowy_mountains.png") [[autodoc]] StableDiffusionInstructPix2PixPipeline - __call__ - all + - load_textual_inversion + - load_lora_weights + - save_lora_weights diff --git a/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx b/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx index 6b8d53bf6510..ce78434fdbaa 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx @@ -39,6 +39,10 @@ Available Checkpoints are: - disable_xformers_memory_efficient_attention - enable_vae_tiling - disable_vae_tiling + - load_textual_inversion + - from_ckpt + - load_lora_weights + - save_lora_weights [[autodoc]] FlaxStableDiffusionPipeline - all diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 07c17100e0e0..40029fcecfd1 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -109,7 +109,6 @@ except OptionalDependencyNotAvailable: from .utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .loaders import TextualInversionLoaderMixin from .pipelines import ( AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index e814981a85c9..3133da117390 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -13,9 +13,11 @@ # limitations under the License. 
import os from collections import defaultdict +from pathlib import Path from typing import Callable, Dict, List, Optional, Union import torch +from huggingface_hub import hf_hub_download from .models.attention_processor import LoRAAttnProcessor from .utils import ( @@ -431,6 +433,7 @@ def load_textual_inversion( Example: To load a textual inversion embedding vector in `diffusers` format: + ```py from diffusers import StableDiffusionPipeline import torch @@ -463,6 +466,7 @@ def load_textual_inversion( image = pipe(prompt, num_inference_steps=50).images[0] image.save("character.png") ``` + """ if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PreTrainedTokenizer): raise ValueError( @@ -1051,3 +1055,197 @@ def save_function(weights, filename): save_function(state_dict, os.path.join(save_directory, weight_name)) logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") + + +class FromCkptMixin: + """This helper class allows to directly load .ckpt stable diffusion file_extension + into the respective classes.""" + + @classmethod + def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): + r""" + Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights saved in the original .ckpt format. + + The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). + + Parameters: + pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + - A link to the .ckpt file on the Hub. Should be in the format + `"https://huggingface.co//blob/main/"` + - A path to a *file* containing all pipeline weights. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype + will be automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + use_safetensors (`bool`, *optional* ): + If set to `True`, the pipeline will be loaded from `safetensors` weights. If set to `None` (the + default). 
The pipeline will load using `safetensors` if the safetensors weights are available *and* if + `safetensors` is installed. If the to `False` the pipeline will *not* use `safetensors`. + extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for + checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults + to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for + inference. Non-EMA weights are usually better to continue fine-tuning. + upcast_attention (`bool`, *optional*, defaults to `None`): + Whether the attention computation should always be upcasted. This is necessary when running stable + image_size (`int`, *optional*, defaults to 512): + The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2 + Base. Use 768 for Stable Diffusion v2. + prediction_type (`str`, *optional*): + The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable + Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2. + num_in_channels (`int`, *optional*, defaults to None): + The number of input channels. If `None`, it will be automatically inferred. + scheduler_type (`str`, *optional*, defaults to 'pndm'): + Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", + "ddim"]`. + load_safety_checker (`bool`, *optional*, defaults to `True`): + Whether to load the safety checker or not. Defaults to `True`. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the + specific pipeline class. The overwritten components are then directly passed to the pipelines + `__init__` method. See example below for more information. + + Examples: + + ```py + >>> from diffusers import StableDiffusionPipeline + + >>> # Download pipeline from huggingface.co and cache. + >>> pipeline = StableDiffusionPipeline.from_ckpt( + ... "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors" + ... ) + + >>> # Download pipeline from local file + >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt + >>> pipeline = StableDiffusionPipeline.from_ckpt("./v1-5-pruned-emaonly") + + >>> # Enable float16 and move to GPU + >>> pipeline = StableDiffusionPipeline.from_ckpt( + ... "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt", + ... torch_dtype=torch.float16, + ... 
) + >>> pipeline.to("cuda") + ``` + """ + # import here to avoid circular dependency + from .pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt + + cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + extract_ema = kwargs.pop("extract_ema", False) + image_size = kwargs.pop("image_size", 512) + scheduler_type = kwargs.pop("scheduler_type", "pndm") + num_in_channels = kwargs.pop("num_in_channels", None) + upcast_attention = kwargs.pop("upcast_attention", None) + load_safety_checker = kwargs.pop("load_safety_checker", True) + prediction_type = kwargs.pop("prediction_type", None) + + torch_dtype = kwargs.pop("torch_dtype", None) + + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) + + pipeline_name = cls.__name__ + file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1] + from_safetensors = file_extension == "safetensors" + + if from_safetensors and use_safetensors is True: + raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.") + + # TODO: For now we only support stable diffusion + stable_unclip = None + controlnet = False + + if pipeline_name == "StableDiffusionControlNetPipeline": + model_type = "FrozenCLIPEmbedder" + controlnet = True + elif "StableDiffusion" in pipeline_name: + model_type = "FrozenCLIPEmbedder" + elif pipeline_name == "StableUnCLIPPipeline": + model_type == "FrozenOpenCLIPEmbedder" + stable_unclip = "txt2img" + elif pipeline_name == "StableUnCLIPImg2ImgPipeline": + model_type == "FrozenOpenCLIPEmbedder" + stable_unclip = "img2img" + elif pipeline_name == "PaintByExamplePipeline": + model_type == "PaintByExample" + elif pipeline_name == "LDMTextToImagePipeline": + model_type == "LDMTextToImage" + else: + raise ValueError(f"Unhandled pipeline class: {pipeline_name}") + + # remove huggingface url + for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]: + if pretrained_model_link_or_path.startswith(prefix): + pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :] + + # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained + ckpt_path = Path(pretrained_model_link_or_path) + if not ckpt_path.is_file(): + # get repo_id and (potentially nested) file path of ckpt in repo + repo_id = str(Path().joinpath(*ckpt_path.parts[:2])) + file_path = str(Path().joinpath(*ckpt_path.parts[2:])) + + if file_path.startswith("blob/"): + file_path = file_path[len("blob/") :] + + if file_path.startswith("main/"): + file_path = file_path[len("main/") :] + + pretrained_model_link_or_path = hf_hub_download( + repo_id, + filename=file_path, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + ) + + pipe = download_from_original_stable_diffusion_ckpt( + pretrained_model_link_or_path, + pipeline_class=cls, + model_type=model_type, + stable_unclip=stable_unclip, + controlnet=controlnet, + from_safetensors=from_safetensors, + extract_ema=extract_ema, + image_size=image_size, + scheduler_type=scheduler_type, + 
num_in_channels=num_in_channels, + upcast_attention=upcast_attention, + load_safety_checker=load_safety_checker, + prediction_type=prediction_type, + ) + + if torch_dtype is not None: + pipe.to(torch_dtype=torch_dtype) + + return pipe diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index bf314b91116e..ff9474ffd43a 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -57,6 +57,14 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 86fc47f424e9..dee4a91924f7 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -96,6 +96,14 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
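As a side note, the string handling in the `from_ckpt` body above can be checked in isolation. This is a pure-Python sketch with no network access; the link uses the same format as the docstring example, with the Hub prefix (`https://huggingface.co/`) assumed to be stripped already, as `from_ckpt` does:

```py
from pathlib import Path

link = "runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"

ckpt_path = Path(link)
repo_id = str(Path().joinpath(*ckpt_path.parts[:2]))    # "runwayml/stable-diffusion-v1-5"
file_path = str(Path().joinpath(*ckpt_path.parts[2:]))  # "blob/main/v1-5-pruned-emaonly.ckpt"

# Drop the web-UI path segments so only the repo-relative filename is left.
if file_path.startswith("blob/"):
    file_path = file_path[len("blob/") :]
if file_path.startswith("main/"):
    file_path = file_path[len("main/") :]

print(repo_id, file_path)  # runwayml/stable-diffusion-v1-5 v1-5-pruned-emaonly.ckpt
```

The resulting `repo_id` and `file_path` are what get passed to `hf_hub_download` in the mixin.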
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index dbc1b27e88be..5961636dd197 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -31,35 +31,30 @@ CLIPVisionModelWithProjection, ) -from diffusers import ( +from ...models import ( AutoencoderKL, ControlNetModel, + PriorTransformer, + UNet2DConditionModel, +) +from ...schedulers import ( DDIMScheduler, DDPMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, HeunDiscreteScheduler, - LDMTextToImagePipeline, LMSDiscreteScheduler, PNDMScheduler, - PriorTransformer, - StableDiffusionControlNetPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionPipeline, - StableUnCLIPImg2ImgPipeline, - StableUnCLIPPipeline, UnCLIPScheduler, - UNet2DConditionModel, ) -from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel -from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder, PaintByExamplePipeline -from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer - from ...utils import is_omegaconf_available, is_safetensors_available, logging from ...utils.import_utils import BACKENDS_MAPPING +from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel +from ..paint_by_example import PaintByExampleImageEncoder +from ..pipeline_utils import DiffusionPipeline +from .safety_checker import StableDiffusionSafetyChecker +from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -981,7 +976,6 @@ def download_from_original_stable_diffusion_ckpt( image_size: int = 512, prediction_type: str = None, model_type: str = None, - is_img2img: bool = False, extract_ema: bool = False, scheduler_type: str = "pndm", num_in_channels: Optional[int] = None, @@ -993,7 +987,8 @@ def download_from_original_stable_diffusion_ckpt( clip_stats_path: Optional[str] = None, controlnet: Optional[bool] = None, load_safety_checker: bool = True, -) -> StableDiffusionPipeline: + pipeline_class: DiffusionPipeline = None, +) -> DiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` config file. @@ -1031,12 +1026,29 @@ def download_from_original_stable_diffusion_ckpt( Whether the attention computation should always be upcasted. This is necessary when running stable diffusion 2.1. device (`str`, *optional*, defaults to `None`): - The device to use. Pass `None` to determine automatically. :param from_safetensors: If `checkpoint_path` is - in `safetensors` format, load checkpoint with safetensors instead of PyTorch. :return: A - StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. + The device to use. Pass `None` to determine automatically. + from_safetensors (`str`, *optional*, defaults to `False`): + If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. load_safety_checker (`bool`, *optional*, defaults to `True`): Whether to load the safety checker or not. Defaults to `True`. + pipeline_class (`str`, *optional*, defaults to `None`): + The pipeline class to use. 
Pass `None` to determine automatically. + return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ + + # import pipelines here to avoid circular import error when using from_ckpt method + from diffusers import ( + LDMTextToImagePipeline, + PaintByExamplePipeline, + StableDiffusionControlNetPipeline, + StableDiffusionPipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) + + if pipeline_class is None: + pipeline_class = StableDiffusionPipeline + if prediction_type == "v-prediction": prediction_type = "v_prediction" @@ -1198,44 +1210,16 @@ def download_from_original_stable_diffusion_ckpt( requires_safety_checker=False, ) else: - if ( - hasattr(original_config, "model") - and hasattr(original_config.model, "target") - and "LatentInpaintDiffusion" in original_config.model.target - ): - pipe = StableDiffusionInpaintPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - else: - if is_img2img: - pipe = StableDiffusionImg2ImgPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - else: - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) else: image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components( original_config, clip_stats_path=clip_stats_path, device=device @@ -1326,41 +1310,15 @@ def download_from_original_stable_diffusion_ckpt( feature_extractor=feature_extractor, ) else: - if ( - hasattr(original_config, "model") - and hasattr(original_config.model, "target") - and "LatentInpaintDiffusion" in original_config.model.target - ): - pipe = StableDiffusionInpaintPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - else: - if is_img2img: - pipe = StableDiffusionImg2ImgPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - else: - pipe = StableDiffusionPipeline( - vae=vae, - text_encoder=text_model, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) @@ -1379,7 +1337,7 @@ def download_controlnet_from_original_ckpt( upcast_attention: Optional[bool] = None, device: str = None, from_safetensors: bool = False, -) -> StableDiffusionPipeline: +) -> DiffusionPipeline: if not is_omegaconf_available(): raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 689febe3e891..7347d70c4023 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -20,7 +20,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -53,13 +53,21 @@ """ -class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index 3b8889d92b55..322f2232fc8a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -156,6 +156,9 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoade This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
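To make the loader mixins listed in these docstrings concrete, here is an illustrative combination; the textual-inversion repo id and the LoRA path are placeholders chosen for the example, not part of this patch:

```py
import torch

from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe.load_textual_inversion("sd-concepts-library/cat-toy")  # TextualInversionLoaderMixin
pipe.load_lora_weights("path/to/lora-weights")              # LoraLoaderMixin (placeholder path)
```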
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 4fe117ba120b..c4f9ae59a4e9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -23,7 +23,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation from ...configuration_utils import FrozenDict -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, logging, randn_tensor @@ -55,13 +55,20 @@ def preprocess(image): return image -class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 5860a53ad528..c26ddf06cadc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -23,7 +23,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -92,13 +92,21 @@ def preprocess(image): return image -class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 8e0ea5a8d079..fb2e5dc424e3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor @@ -138,13 +138,20 @@ def prepare_mask_and_masked_image(image, mask): return mask, masked_image -class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 6d9cbaf67a07..1c8377c7e54e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import TextualInversionLoaderMixin +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -82,13 +82,23 @@ def preprocess_mask(mask, scale_factor=8): return mask -class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionInpaintPipelineLegacy( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin +): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index f7999a08dc9b..49944cdcd636 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -20,7 +20,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -61,13 +61,20 @@ def preprocess(image): return image -class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 8a521457f2e3..bda56d2ae8ae 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -2,21 +2,6 @@ from ..utils import DummyObject, requires_backends -class TextualInversionLoaderMixin(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class AltDiffusionImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 14421a64b9e8..fcfcd84c5d48 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -36,6 +36,7 @@ UNet2DConditionModel, logging, ) +from diffusers.models.attention_processor import AttnProcessor from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu @@ -865,6 +866,62 @@ def test_stable_diffusion_textual_inversion(self): assert max_diff < 5e-2 +@slow +@require_torch_gpu +class StableDiffusionPipelineCkptTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_download_from_hub(self): + ckpt_paths = [ + "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt", + "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix_base.ckpt", + ] + + for ckpt_path in ckpt_paths: + pipe = StableDiffusionPipeline.from_ckpt(ckpt_path, torch_dtype=torch.float16) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.to("cuda") + + image_out = pipe("test", num_inference_steps=1, output_type="np").images[0] + + assert image_out.shape == (512, 512, 3) + + def test_download_local(self): + filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.ckpt") + + pipe = StableDiffusionPipeline.from_ckpt(filename, torch_dtype=torch.float16) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.to("cuda") + + image_out = pipe("test", num_inference_steps=1, output_type="np").images[0] + + assert image_out.shape == (512, 512, 3) + + def test_download_ckpt_diff_format_is_same(self): + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt" + + pipe = StableDiffusionPipeline.from_ckpt(ckpt_path) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.unet.set_attn_processor(AttnProcessor()) + pipe.to("cuda") + + generator = 
torch.Generator(device="cpu").manual_seed(0) + image_ckpt = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0] + + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.unet.set_attn_processor(AttnProcessor()) + pipe.to("cuda") + + generator = torch.Generator(device="cpu").manual_seed(0) + image = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0] + + assert np.max(np.abs(image - image_ckpt)) < 1e-4 + + @nightly @require_torch_gpu class StableDiffusionPipelineNightlyTests(unittest.TestCase): From bba1c1de151bf0ff0b47a7b81a8251c3bed1db1f Mon Sep 17 00:00:00 2001 From: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Date: Wed, 19 Apr 2023 09:51:03 -0700 Subject: [PATCH 29/71] Add TensorRT SD/txt2img Community Pipeline to diffusers along with TensorRT utils (#2974) * Add SD/txt2img Community Pipeline to diffusers along with TensorRT utils Signed-off-by: Asfiya Baig * update installation command Signed-off-by: Asfiya Baig * update tensorrt installation Signed-off-by: Asfiya Baig * changes 1. Update setting of cache directory 2. Address comments: merge utils and pipeline code. 3. Address comments: Add section in README Signed-off-by: Asfiya Baig * apply make style Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig Co-authored-by: Patrick von Platen --- examples/community/README.md | 33 +- .../stable_diffusion_tensorrt_txt2img.py | 926 ++++++++++++++++++ 2 files changed, 958 insertions(+), 1 deletion(-) create mode 100644 examples/community/stable_diffusion_tensorrt_txt2img.py diff --git a/examples/community/README.md b/examples/community/README.md index 11da90764579..8b5b1743203d 100644 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -31,7 +31,7 @@ MagicMix | Diffusion Pipeline for semantic mixing of an image and a text prompt | UnCLIP Image Interpolation Pipeline | Diffusion Pipeline that allows passing two images/image_embeddings and produces images while interpolating between their image-embeddings | [UnCLIP Image Interpolation Pipeline](#unclip-image-interpolation-pipeline) | - | [Naga Sai Abhinay Devarinti](https://github.com/Abhinay1997/) | | DDIM Noise Comparative Analysis Pipeline | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227)) | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline) | - |[Aengus (Duc-Anh)](https://github.com/aengusng8) | | CLIP Guided Img2Img Stable Diffusion Pipeline | Doing CLIP guidance for image to image generation with Stable Diffusion | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion) | - | [Nipun Jindal](https://github.com/nipunjindal/) | - +| TensorRT Stable Diffusion Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - |[Asfiya Baig](https://github.com/asfiyab-nvidia) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. 
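In practice, loading any of the community pipelines listed in the table above comes down to a single `from_pretrained` call with the `custom_pipeline` argument. A minimal sketch follows; the base model and the community pipeline names used here are only examples, not requirements of this patch.

```python
from diffusers import DiffusionPipeline

# `custom_pipeline` names a file in diffusers/examples/community
# (both identifiers below are illustrative).
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    custom_pipeline="lpw_stable_diffusion",
)
pipe = pipe.to("cuda")
```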
@@ -1130,3 +1130,34 @@ Init Image Output Image ![img2img_clip_guidance](https://huggingface.co/datasets/njindal/images/resolve/main/clip_guided_img2img.jpg) + +### TensorRT Text2Image Stable Diffusion Pipeline + +The TensorRT Pipeline can be used to accelerate the Text2Image Stable Diffusion Inference run. + +NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes. + +```python +import torch +from diffusers import DDIMScheduler +from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline + +# Use the DDIMScheduler scheduler here instead +scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1", + subfolder="scheduler") + +pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", + custom_pipeline="stable_diffusion_tensorrt_txt2img", + revision='fp16', + torch_dtype=torch.float16, + scheduler=scheduler,) + +# re-use cached folder to save ONNX models and TensorRT Engines +pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", revision='fp16',) + +pipe = pipe.to("cuda") + +prompt = "a beautiful photograph of Mt. Fuji during cherry blossom" +image = pipe(prompt).images[0] +image.save('tensorrt_mt_fuji.png') +``` diff --git a/examples/community/stable_diffusion_tensorrt_txt2img.py b/examples/community/stable_diffusion_tensorrt_txt2img.py new file mode 100644 index 000000000000..7aef2bec743f --- /dev/null +++ b/examples/community/stable_diffusion_tensorrt_txt2img.py @@ -0,0 +1,926 @@ +# +# Copyright 2023 The HuggingFace Inc. team. +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import os +from collections import OrderedDict +from copy import copy +from typing import List, Optional, Union + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +import tensorrt as trt +import torch +from huggingface_hub import snapshot_download +from onnx import shape_inference +from polygraphy import cuda +from polygraphy.backend.common import bytes_from_path +from polygraphy.backend.onnx.loader import fold_constants +from polygraphy.backend.trt import ( + CreateConfig, + Profile, + engine_from_bytes, + engine_from_network, + network_from_onnx_path, + save_engine, +) +from polygraphy.backend.trt import util as trt_util +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.stable_diffusion import ( + StableDiffusionPipeline, + StableDiffusionPipelineOutput, + StableDiffusionSafetyChecker, +) +from diffusers.schedulers import DDIMScheduler +from diffusers.utils import DIFFUSERS_CACHE, logging + + +""" +Installation instructions +python3 -m pip install --upgrade tensorrt +python3 -m pip install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com +python3 -m pip install onnxruntime +""" + +TRT_LOGGER = trt.Logger(trt.Logger.ERROR) +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Map of numpy dtype -> torch dtype +numpy_to_torch_dtype_dict = { + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.complex64: torch.complex64, + np.complex128: torch.complex128, +} +if np.version.full_version >= "1.24.0": + numpy_to_torch_dtype_dict[np.bool_] = torch.bool +else: + numpy_to_torch_dtype_dict[np.bool] = torch.bool + +# Map of torch dtype -> numpy dtype +torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()} + + +def device_view(t): + return cuda.DeviceView(ptr=t.data_ptr(), shape=t.shape, dtype=torch_to_numpy_dtype_dict[t.dtype]) + + +class Engine: + def __init__(self, engine_path): + self.engine_path = engine_path + self.engine = None + self.context = None + self.buffers = OrderedDict() + self.tensors = OrderedDict() + + def __del__(self): + [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)] + del self.engine + del self.context + del self.buffers + del self.tensors + + def build( + self, + onnx_path, + fp16, + input_profile=None, + enable_preview=False, + enable_all_tactics=False, + timing_cache=None, + workspace_size=0, + ): + logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}") + p = Profile() + if input_profile: + for name, dims in input_profile.items(): + assert len(dims) == 3 + p.add(name, min=dims[0], opt=dims[1], max=dims[2]) + + config_kwargs = {} + + config_kwargs["preview_features"] = [trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805] + if enable_preview: + # Faster dynamic shapes made optional since it increases engine build time. 
+ config_kwargs["preview_features"].append(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805) + if workspace_size > 0: + config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size} + if not enable_all_tactics: + config_kwargs["tactic_sources"] = [] + + engine = engine_from_network( + network_from_onnx_path(onnx_path), + config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs), + save_timing_cache=timing_cache, + ) + save_engine(engine, path=self.engine_path) + + def load(self): + logger.warning(f"Loading TensorRT engine: {self.engine_path}") + self.engine = engine_from_bytes(bytes_from_path(self.engine_path)) + + def activate(self): + self.context = self.engine.create_execution_context() + + def allocate_buffers(self, shape_dict=None, device="cuda"): + for idx in range(trt_util.get_bindings_per_profile(self.engine)): + binding = self.engine[idx] + if shape_dict and binding in shape_dict: + shape = shape_dict[binding] + else: + shape = self.engine.get_binding_shape(binding) + dtype = trt.nptype(self.engine.get_binding_dtype(binding)) + if self.engine.binding_is_input(binding): + self.context.set_binding_shape(idx, shape) + tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device) + self.tensors[binding] = tensor + self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype) + + def infer(self, feed_dict, stream): + start_binding, end_binding = trt_util.get_active_profile_bindings(self.context) + # shallow copy of ordered dict + device_buffers = copy(self.buffers) + for name, buf in feed_dict.items(): + assert isinstance(buf, cuda.DeviceView) + device_buffers[name] = buf + bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()] + noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr) + if not noerror: + raise ValueError("ERROR: inference failed.") + + return self.tensors + + +class Optimizer: + def __init__(self, onnx_graph): + self.graph = gs.import_onnx(onnx_graph) + + def cleanup(self, return_onnx=False): + self.graph.cleanup().toposort() + if return_onnx: + return gs.export_onnx(self.graph) + + def select_outputs(self, keep, names=None): + self.graph.outputs = [self.graph.outputs[o] for o in keep] + if names: + for i, name in enumerate(names): + self.graph.outputs[i].name = name + + def fold_constants(self, return_onnx=False): + onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True) + self.graph = gs.import_onnx(onnx_graph) + if return_onnx: + return onnx_graph + + def infer_shapes(self, return_onnx=False): + onnx_graph = gs.export_onnx(self.graph) + if onnx_graph.ByteSize() > 2147483648: + raise TypeError("ERROR: model size exceeds supported 2GB limit") + else: + onnx_graph = shape_inference.infer_shapes(onnx_graph) + + self.graph = gs.import_onnx(onnx_graph) + if return_onnx: + return onnx_graph + + +class BaseModel: + def __init__(self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77): + self.model = model + self.name = "SD Model" + self.fp16 = fp16 + self.device = device + + self.min_batch = 1 + self.max_batch = max_batch_size + self.min_image_shape = 256 # min image resolution: 256x256 + self.max_image_shape = 1024 # max image resolution: 1024x1024 + self.min_latent_shape = self.min_image_shape // 8 + self.max_latent_shape = self.max_image_shape // 8 + + self.embedding_dim = embedding_dim + self.text_maxlen = text_maxlen + + def 
get_model(self): + return self.model + + def get_input_names(self): + pass + + def get_output_names(self): + pass + + def get_dynamic_axes(self): + return None + + def get_sample_input(self, batch_size, image_height, image_width): + pass + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + return None + + def get_shape_dict(self, batch_size, image_height, image_width): + return None + + def optimize(self, onnx_graph): + opt = Optimizer(onnx_graph) + opt.cleanup() + opt.fold_constants() + opt.infer_shapes() + onnx_opt_graph = opt.cleanup(return_onnx=True) + return onnx_opt_graph + + def check_dims(self, batch_size, image_height, image_width): + assert batch_size >= self.min_batch and batch_size <= self.max_batch + assert image_height % 8 == 0 or image_width % 8 == 0 + latent_height = image_height // 8 + latent_width = image_width // 8 + assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape + assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape + return (latent_height, latent_width) + + def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape): + min_batch = batch_size if static_batch else self.min_batch + max_batch = batch_size if static_batch else self.max_batch + latent_height = image_height // 8 + latent_width = image_width // 8 + min_image_height = image_height if static_shape else self.min_image_shape + max_image_height = image_height if static_shape else self.max_image_shape + min_image_width = image_width if static_shape else self.min_image_shape + max_image_width = image_width if static_shape else self.max_image_shape + min_latent_height = latent_height if static_shape else self.min_latent_shape + max_latent_height = latent_height if static_shape else self.max_latent_shape + min_latent_width = latent_width if static_shape else self.min_latent_shape + max_latent_width = latent_width if static_shape else self.max_latent_shape + return ( + min_batch, + max_batch, + min_image_height, + max_image_height, + min_image_width, + max_image_width, + min_latent_height, + max_latent_height, + min_latent_width, + max_latent_width, + ) + + +def getOnnxPath(model_name, onnx_dir, opt=True): + return os.path.join(onnx_dir, model_name + (".opt" if opt else "") + ".onnx") + + +def getEnginePath(model_name, engine_dir): + return os.path.join(engine_dir, model_name + ".plan") + + +def build_engines( + models: dict, + engine_dir, + onnx_dir, + onnx_opset, + opt_image_height, + opt_image_width, + opt_batch_size=1, + force_engine_rebuild=False, + static_batch=False, + static_shape=True, + enable_preview=False, + enable_all_tactics=False, + timing_cache=None, + max_workspace_size=0, +): + built_engines = {} + if not os.path.isdir(onnx_dir): + os.makedirs(onnx_dir) + if not os.path.isdir(engine_dir): + os.makedirs(engine_dir) + + # Export models to ONNX + for model_name, model_obj in models.items(): + engine_path = getEnginePath(model_name, engine_dir) + if force_engine_rebuild or not os.path.exists(engine_path): + logger.warning("Building Engines...") + logger.warning("Engine build can take a while to complete") + onnx_path = getOnnxPath(model_name, onnx_dir, opt=False) + onnx_opt_path = getOnnxPath(model_name, onnx_dir) + if force_engine_rebuild or not os.path.exists(onnx_opt_path): + if force_engine_rebuild or not os.path.exists(onnx_path): + logger.warning(f"Exporting model: {onnx_path}") + model = model_obj.get_model() + with torch.inference_mode(), 
torch.autocast("cuda"): + inputs = model_obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width) + torch.onnx.export( + model, + inputs, + onnx_path, + export_params=True, + opset_version=onnx_opset, + do_constant_folding=True, + input_names=model_obj.get_input_names(), + output_names=model_obj.get_output_names(), + dynamic_axes=model_obj.get_dynamic_axes(), + ) + del model + torch.cuda.empty_cache() + gc.collect() + else: + logger.warning(f"Found cached model: {onnx_path}") + + # Optimize onnx + if force_engine_rebuild or not os.path.exists(onnx_opt_path): + logger.warning(f"Generating optimizing model: {onnx_opt_path}") + onnx_opt_graph = model_obj.optimize(onnx.load(onnx_path)) + onnx.save(onnx_opt_graph, onnx_opt_path) + else: + logger.warning(f"Found cached optimized model: {onnx_opt_path} ") + + # Build TensorRT engines + for model_name, model_obj in models.items(): + engine_path = getEnginePath(model_name, engine_dir) + engine = Engine(engine_path) + onnx_path = getOnnxPath(model_name, onnx_dir, opt=False) + onnx_opt_path = getOnnxPath(model_name, onnx_dir) + + if force_engine_rebuild or not os.path.exists(engine.engine_path): + engine.build( + onnx_opt_path, + fp16=True, + input_profile=model_obj.get_input_profile( + opt_batch_size, + opt_image_height, + opt_image_width, + static_batch=static_batch, + static_shape=static_shape, + ), + enable_preview=enable_preview, + timing_cache=timing_cache, + workspace_size=max_workspace_size, + ) + built_engines[model_name] = engine + + # Load and activate TensorRT engines + for model_name, model_obj in models.items(): + engine = built_engines[model_name] + engine.load() + engine.activate() + + return built_engines + + +def runEngine(engine, feed_dict, stream): + return engine.infer(feed_dict, stream) + + +class CLIP(BaseModel): + def __init__(self, model, device, max_batch_size, embedding_dim): + super(CLIP, self).__init__( + model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim + ) + self.name = "CLIP" + + def get_input_names(self): + return ["input_ids"] + + def get_output_names(self): + return ["text_embeddings", "pooler_output"] + + def get_dynamic_axes(self): + return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}} + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + self.check_dims(batch_size, image_height, image_width) + min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims( + batch_size, image_height, image_width, static_batch, static_shape + ) + return { + "input_ids": [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)] + } + + def get_shape_dict(self, batch_size, image_height, image_width): + self.check_dims(batch_size, image_height, image_width) + return { + "input_ids": (batch_size, self.text_maxlen), + "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim), + } + + def get_sample_input(self, batch_size, image_height, image_width): + self.check_dims(batch_size, image_height, image_width) + return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device) + + def optimize(self, onnx_graph): + opt = Optimizer(onnx_graph) + opt.select_outputs([0]) # delete graph output#1 + opt.cleanup() + opt.fold_constants() + opt.infer_shapes() + opt.select_outputs([0], names=["text_embeddings"]) # rename network output + opt_onnx_graph = opt.cleanup(return_onnx=True) + return opt_onnx_graph + + +def make_CLIP(model, device, max_batch_size, embedding_dim, 
inpaint=False): + return CLIP(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim) + + +class UNet(BaseModel): + def __init__( + self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77, unet_dim=4 + ): + super(UNet, self).__init__( + model=model, + fp16=fp16, + device=device, + max_batch_size=max_batch_size, + embedding_dim=embedding_dim, + text_maxlen=text_maxlen, + ) + self.unet_dim = unet_dim + self.name = "UNet" + + def get_input_names(self): + return ["sample", "timestep", "encoder_hidden_states"] + + def get_output_names(self): + return ["latent"] + + def get_dynamic_axes(self): + return { + "sample": {0: "2B", 2: "H", 3: "W"}, + "encoder_hidden_states": {0: "2B"}, + "latent": {0: "2B", 2: "H", 3: "W"}, + } + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + ( + min_batch, + max_batch, + _, + _, + _, + _, + min_latent_height, + max_latent_height, + min_latent_width, + max_latent_width, + ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + return { + "sample": [ + (2 * min_batch, self.unet_dim, min_latent_height, min_latent_width), + (2 * batch_size, self.unet_dim, latent_height, latent_width), + (2 * max_batch, self.unet_dim, max_latent_height, max_latent_width), + ], + "encoder_hidden_states": [ + (2 * min_batch, self.text_maxlen, self.embedding_dim), + (2 * batch_size, self.text_maxlen, self.embedding_dim), + (2 * max_batch, self.text_maxlen, self.embedding_dim), + ], + } + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return { + "sample": (2 * batch_size, self.unet_dim, latent_height, latent_width), + "encoder_hidden_states": (2 * batch_size, self.text_maxlen, self.embedding_dim), + "latent": (2 * batch_size, 4, latent_height, latent_width), + } + + def get_sample_input(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + dtype = torch.float16 if self.fp16 else torch.float32 + return ( + torch.randn( + 2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device + ), + torch.tensor([1.0], dtype=torch.float32, device=self.device), + torch.randn(2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device), + ) + + +def make_UNet(model, device, max_batch_size, embedding_dim, inpaint=False): + return UNet( + model, + fp16=True, + device=device, + max_batch_size=max_batch_size, + embedding_dim=embedding_dim, + unet_dim=(9 if inpaint else 4), + ) + + +class VAE(BaseModel): + def __init__(self, model, device, max_batch_size, embedding_dim): + super(VAE, self).__init__( + model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim + ) + self.name = "VAE decoder" + + def get_input_names(self): + return ["latent"] + + def get_output_names(self): + return ["images"] + + def get_dynamic_axes(self): + return {"latent": {0: "B", 2: "H", 3: "W"}, "images": {0: "B", 2: "8H", 3: "8W"}} + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + ( + min_batch, + max_batch, + _, + _, + _, + _, + min_latent_height, + max_latent_height, + min_latent_width, 
+ max_latent_width, + ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + return { + "latent": [ + (min_batch, 4, min_latent_height, min_latent_width), + (batch_size, 4, latent_height, latent_width), + (max_batch, 4, max_latent_height, max_latent_width), + ] + } + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return { + "latent": (batch_size, 4, latent_height, latent_width), + "images": (batch_size, 3, image_height, image_width), + } + + def get_sample_input(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device) + + +def make_VAE(model, device, max_batch_size, embedding_dim, inpaint=False): + return VAE(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim) + + +class TensorRTStableDiffusionPipeline(StableDiffusionPipeline): + r""" + Pipeline for text-to-image generation using TensorRT accelerated Stable Diffusion. + + This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
+ """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + stages=["clip", "unet", "vae"], + image_height: int = 768, + image_width: int = 768, + max_batch_size: int = 16, + # ONNX export parameters + onnx_opset: int = 17, + onnx_dir: str = "onnx", + # TensorRT engine build parameters + engine_dir: str = "engine", + force_engine_rebuild: bool = False, + timing_cache: str = "timing_cache", + ): + super().__init__( + vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker + ) + + self.vae.forward = self.vae.decode + + self.stages = stages + self.image_height, self.image_width = image_height, image_width + self.inpaint = False + self.onnx_opset = onnx_opset + self.onnx_dir = onnx_dir + self.engine_dir = engine_dir + self.force_engine_rebuild = force_engine_rebuild + self.timing_cache = timing_cache + self.build_static_batch = False + self.build_dynamic_shape = False + self.build_preview_features = False + + self.max_batch_size = max_batch_size + # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation. + if self.build_dynamic_shape or self.image_height > 512 or self.image_width > 512: + self.max_batch_size = 4 + + self.stream = None # loaded in loadResources() + self.models = {} # loaded in __loadModels() + self.engine = {} # loaded in build_engines() + + def __loadModels(self): + # Load pipeline models + self.embedding_dim = self.text_encoder.config.hidden_size + models_args = { + "device": self.torch_device, + "max_batch_size": self.max_batch_size, + "embedding_dim": self.embedding_dim, + "inpaint": self.inpaint, + } + if "clip" in self.stages: + self.models["clip"] = make_CLIP(self.text_encoder, **models_args) + if "unet" in self.stages: + self.models["unet"] = make_UNet(self.unet, **models_args) + if "vae" in self.stages: + self.models["vae"] = make_VAE(self.vae, **models_args) + + @classmethod + def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + + cls.cached_folder = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else snapshot_download( + pretrained_model_name_or_path, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + ) + ) + + def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False): + super().to(torch_device, silence_dtype_warnings) + + self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir) + self.engine_dir = os.path.join(self.cached_folder, self.engine_dir) + self.timing_cache = os.path.join(self.cached_folder, self.timing_cache) + + # set device + self.torch_device = self._execution_device + logger.warning(f"Running inference on device: {self.torch_device}") + + # load models + self.__loadModels() + + # build engines + self.engine = build_engines( + self.models, + self.engine_dir, + 
self.onnx_dir, + self.onnx_opset, + opt_image_height=self.image_height, + opt_image_width=self.image_width, + force_engine_rebuild=self.force_engine_rebuild, + static_batch=self.build_static_batch, + static_shape=not self.build_dynamic_shape, + enable_preview=self.build_preview_features, + timing_cache=self.timing_cache, + ) + + return self + + def __encode_prompt(self, prompt, negative_prompt): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + """ + # Tokenize prompt + text_input_ids = ( + self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + .input_ids.type(torch.int32) + .to(self.torch_device) + ) + + text_input_ids_inp = device_view(text_input_ids) + # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt + text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids_inp}, self.stream)[ + "text_embeddings" + ].clone() + + # Tokenize negative prompt + uncond_input_ids = ( + self.tokenizer( + negative_prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + .input_ids.type(torch.int32) + .to(self.torch_device) + ) + uncond_input_ids_inp = device_view(uncond_input_ids) + uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids_inp}, self.stream)[ + "text_embeddings" + ] + + # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype=torch.float16) + + return text_embeddings + + def __denoise_latent( + self, latents, text_embeddings, timesteps=None, step_offset=0, mask=None, masked_image_latents=None + ): + if not isinstance(timesteps, torch.Tensor): + timesteps = self.scheduler.timesteps + for step_index, timestep in enumerate(timesteps): + # Expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) + if isinstance(mask, torch.Tensor): + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + + # Predict the noise residual + timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep + + sample_inp = device_view(latent_model_input) + timestep_inp = device_view(timestep_float) + embeddings_inp = device_view(text_embeddings) + noise_pred = runEngine( + self.engine["unet"], + {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp}, + self.stream, + )["latent"] + + # Perform guidance + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample + + latents = 1.0 / 0.18215 * latents + return latents + + def __decode_latent(self, latents): + images = runEngine(self.engine["vae"], 
{"latent": device_view(latents)}, self.stream)["images"] + images = (images / 2 + 0.5).clamp(0, 1) + return images.cpu().permute(0, 2, 3, 1).float().numpy() + + def __loadResources(self, image_height, image_width, batch_size): + self.stream = cuda.Stream() + + # Allocate buffers for TensorRT engine bindings + for model_name, obj in self.models.items(): + self.engine[model_name].allocate_buffers( + shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.torch_device + ) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + + """ + self.generator = generator + self.denoising_steps = num_inference_steps + self.guidance_scale = guidance_scale + + # Pre-compute latent input scales and linear multistep coefficients + self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device) + + # Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + prompt = [prompt] + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}") + + if negative_prompt is None: + negative_prompt = [""] * batch_size + + if negative_prompt is not None and isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + assert len(prompt) == len(negative_prompt) + + if batch_size > self.max_batch_size: + raise ValueError( + f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. 
If dynamic shape is used, then maximum batch size is 4" + ) + + # load resources + self.__loadResources(self.image_height, self.image_width, batch_size) + + with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER): + # CLIP text encoder + text_embeddings = self.__encode_prompt(prompt, negative_prompt) + + # Pre-initialize latents + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size, + num_channels_latents, + self.image_height, + self.image_width, + torch.float32, + self.torch_device, + generator, + ) + + # UNet denoiser + latents = self.__denoise_latent(latents, text_embeddings) + + # VAE decode latent + images = self.__decode_latent(latents) + + images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype) + images = self.numpy_to_pil(images) + return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) From c8fdfe457229d647d6019e449f3eb6fafb4b6e92 Mon Sep 17 00:00:00 2001 From: Chanchana Sornsoontorn Date: Wed, 19 Apr 2023 23:51:58 +0700 Subject: [PATCH 30/71] Correct `Transformer2DModel.forward` docstring (#3074) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ⚙️chore(transformer_2d) update function signature for encoder_hidden_states --- src/diffusers/models/transformer_2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 23364bfa1d16..fde1014bd2e7 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -225,7 +225,7 @@ def forward( hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input hidden_states - encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. 
timestep ( `torch.long`, *optional*): From 3becd368b14d74ca361eada8408627234996e4d1 Mon Sep 17 00:00:00 2001 From: hwuebben Date: Wed, 19 Apr 2023 18:58:13 +0200 Subject: [PATCH 31/71] Update pipeline_stable_diffusion_inpaint_legacy.py (#2903) * Update pipeline_stable_diffusion_inpaint_legacy.py * fix preprocessing of Pil images with adequate batch size * revert map * add tests * reformat * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * next try to fix the style * wth is this * Update testing_utils.py * Update testing_utils.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py --------- Co-authored-by: Patrick von Platen --- ...ipeline_stable_diffusion_inpaint_legacy.py | 20 ++-- src/diffusers/utils/testing_utils.py | 10 ++ .../test_stable_diffusion_inpaint_legacy.py | 93 ++++++++++++++++++- 3 files changed, 108 insertions(+), 15 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 1c8377c7e54e..3ad1d5e92273 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -41,17 +41,17 @@ logger = logging.get_logger(__name__) -def preprocess_image(image): +def preprocess_image(image, batch_size): w, h = image.size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) image = np.array(image).astype(np.float32) / 255.0 - image = image[None].transpose(0, 3, 1, 2) + image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size) image = torch.from_numpy(image) return 2.0 * image - 1.0 -def preprocess_mask(mask, scale_factor=8): +def preprocess_mask(mask, batch_size, scale_factor=8): if not isinstance(mask, torch.FloatTensor): mask = mask.convert("L") w, h = mask.size @@ -59,7 +59,7 @@ def preprocess_mask(mask, scale_factor=8): mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) - mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? 
+ mask = np.vstack([mask[None]] * batch_size) mask = 1 - mask # repaint white, keep black mask = torch.from_numpy(mask) return mask @@ -521,14 +521,14 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps, num_inference_steps - t_start - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator): + def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, device, generator): image = image.to(device=self.device, dtype=dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = self.vae.config.scaling_factor * init_latents # Expand init_latents for batch_size and num_images_per_prompt - init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0) + init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0) init_latents_orig = init_latents # add noise to latents using the timesteps @@ -659,9 +659,9 @@ def __call__( # 4. Preprocess image and mask if not isinstance(image, torch.FloatTensor): - image = preprocess_image(image) + image = preprocess_image(image, batch_size) - mask_image = preprocess_mask(mask_image, self.vae_scale_factor) + mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -671,12 +671,12 @@ def __call__( # 6. Prepare latent variables # encode the init image into latents and scale the latents latents, init_latents_orig, noise = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + image, latent_timestep, num_images_per_prompt, prompt_embeds.dtype, device, generator ) # 7. Prepare mask latent mask = mask_image.to(device=self.device, dtype=latents.dtype) - mask = torch.cat([mask] * batch_size * num_images_per_prompt) + mask = torch.cat([mask] * num_images_per_prompt) # 8. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index afea0540b765..d8fed5dec1c8 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -279,6 +279,16 @@ def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image: return image +def preprocess_image(image: PIL.Image, batch_size: int): + w, h = image.size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = image.resize((w, h), resample=PIL.Image.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str: if is_opencv_available(): import cv2 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py index 15d94414ea2f..f56fa31a9601 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py @@ -34,7 +34,7 @@ VQModel, ) from diffusers.utils import floats_tensor, load_image, nightly, slow, torch_device -from diffusers.utils.testing_utils import load_numpy, require_torch_gpu +from diffusers.utils.testing_utils import load_numpy, preprocess_image, require_torch_gpu torch.backends.cuda.matmul.allow_tf32 = False @@ -217,6 +217,55 @@ def test_stable_diffusion_inpaint_legacy(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_inpaint_legacy_batched(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + unet = self.dummy_cond_unet + scheduler = PNDMScheduler(skip_prk_steps=True) + vae = self.dummy_vae + bert = self.dummy_text_encoder + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB") + init_images_tens = preprocess_image(init_image, batch_size=2) + init_masks_tens = init_images_tens + 4 + + # make sure here that pndm scheduler skips prk + sd_pipe = StableDiffusionInpaintPipelineLegacy( + unet=unet, + scheduler=scheduler, + vae=vae, + text_encoder=bert, + tokenizer=tokenizer, + safety_checker=None, + feature_extractor=self.dummy_extractor, + ) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + prompt = "A painting of a squirrel eating a burger" + generator = torch.Generator(device=device).manual_seed(0) + images = sd_pipe( + [prompt] * 2, + generator=generator, + guidance_scale=6.0, + num_inference_steps=2, + output_type="np", + image=init_images_tens, + mask_image=init_masks_tens, + ).images + + assert images.shape == (2, 32, 32, 3) + + image_slice_0 = images[0, -3:, -3:, -1].flatten() + image_slice_1 = images[1, -3:, -3:, -1].flatten() + + expected_slice_0 = np.array([0.4697, 0.3770, 0.4096, 0.4653, 0.4497, 0.4183, 0.3950, 0.4668, 0.4672]) + expected_slice_1 = np.array([0.4105, 0.4987, 0.5771, 0.4921, 0.4237, 0.5684, 0.5496, 0.4645, 0.5272]) + + assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-2 + assert np.abs(expected_slice_1 
- image_slice_1).max() < 1e-2 + def test_stable_diffusion_inpaint_legacy_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator unet = self.dummy_cond_unet @@ -349,7 +398,7 @@ def tearDown(self): gc.collect() torch.cuda.empty_cache() - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + def get_inputs(self, generator_device="cpu", seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) init_image = load_image( "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" @@ -379,7 +428,7 @@ def test_stable_diffusion_inpaint_legacy_pndm(self): pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device) + inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() @@ -388,6 +437,40 @@ def test_stable_diffusion_inpaint_legacy_pndm(self): assert np.abs(expected_slice - image_slice).max() < 1e-4 + def test_stable_diffusion_inpaint_legacy_batched(self): + pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs() + inputs["prompt"] = [inputs["prompt"]] * 2 + inputs["image"] = preprocess_image(inputs["image"], batch_size=2) + + mask = inputs["mask_image"].convert("L") + mask = np.array(mask).astype(np.float32) / 255.0 + mask = torch.from_numpy(1 - mask) + masks = torch.vstack([mask[None][None]] * 2) + inputs["mask_image"] = masks + + image = pipe(**inputs).images + assert image.shape == (2, 512, 512, 3) + + image_slice_0 = image[0, 253:256, 253:256, -1].flatten() + image_slice_1 = image[1, 253:256, 253:256, -1].flatten() + + expected_slice_0 = np.array( + [0.52093095, 0.4176447, 0.32752383, 0.6175223, 0.50563973, 0.36470804, 0.65460044, 0.5775188, 0.44332123] + ) + expected_slice_1 = np.array( + [0.3592432, 0.4233033, 0.3914635, 0.31014425, 0.3702293, 0.39412856, 0.17526966, 0.2642669, 0.37480092] + ) + + assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-4 + assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-4 + def test_stable_diffusion_inpaint_legacy_k_lms(self): pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None @@ -397,7 +480,7 @@ def test_stable_diffusion_inpaint_legacy_k_lms(self): pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device) + inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() @@ -437,7 +520,7 @@ def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) + inputs = self.get_inputs() pipe(**inputs, callback=callback_fn, callback_steps=1) assert callback_fn.has_been_called assert number_of_steps == 2 From a4c91be73b871e2b1b0e934d893001978415e547 Mon Sep 17 00:00:00 2001 From: superhero-7 <57797766+superhero-7@users.noreply.github.com> Date: Thu, 20 Apr 2023 01:00:29 +0800 Subject: [PATCH 32/71] Modified altdiffusion pipline to support altdiffusion-m18 (#2993) * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support 
altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 --------- Co-authored-by: root --- .../alt_diffusion/modeling_roberta_series.py | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py b/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py index 637d6dd18698..f73ef15d7de7 100644 --- a/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py +++ b/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py @@ -56,7 +56,7 @@ def __init__( class RobertaSeriesModelWithTransformation(RobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_unexpected = [r"pooler", r"logit_scale"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] base_model_prefix = "roberta" config_class = RobertaSeriesConfig @@ -65,6 +65,10 @@ def __init__(self, config): super().__init__(config) self.roberta = XLMRobertaModel(config) self.transformation = nn.Linear(config.hidden_size, config.project_dim) + self.has_pre_transformation = getattr(config, "has_pre_transformation", False) + if self.has_pre_transformation: + self.transformation_pre = nn.Linear(config.hidden_size, config.project_dim) + self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.post_init() def forward( @@ -95,15 +99,26 @@ def forward( encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_hidden_states=True if self.has_pre_transformation else output_hidden_states, return_dict=return_dict, ) - projection_state = self.transformation(outputs.last_hidden_state) - - return TransformationModelOutput( - projection_state=projection_state, - last_hidden_state=outputs.last_hidden_state, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) + if self.has_pre_transformation: + sequence_output2 = outputs["hidden_states"][-2] + sequence_output2 = self.pre_LN(sequence_output2) + projection_state2 = self.transformation_pre(sequence_output2) + + return TransformationModelOutput( + projection_state=projection_state2, + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + else: + projection_state = self.transformation(outputs.last_hidden_state) + return TransformationModelOutput( + projection_state=projection_state, + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) From 7e6886f5e93ca9bb1e6d4beece46fe1e43b819c2 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 19 Apr 2023 10:46:51 -0700 Subject: [PATCH 33/71] controlnet training resize inputs to multiple of 8 (#3135) controlnet training center crop input images to multiple of 8 The pipeline code resizes inputs to multiples of 8. Not doing this resizing in the training script is causing the encoded image to have different height/width dimensions than the encoded conditioning image (which uses a separate encoder that's part of the controlnet model). We resize and center crop the inputs to make sure they're the same size (as well as all other images in the batch). 
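In torchvision terms, the change amounts to roughly the following sketch (a minimal illustration rather than the exact training-script code; the example `resolution` value is an assumption standing in for the script's `--resolution` argument):

```python
# Minimal sketch of the preprocessing described above (not the exact script code).
# Both the target image and the conditioning image get the same resize + center
# crop, so every encoded image in a batch ends up with identical height/width.
# `resolution` must also be a multiple of 8 (see the check added below).
from torchvision import transforms

resolution = 512  # assumed example value for the script's `--resolution` argument

image_transforms = transforms.Compose(
    [
        transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.CenterCrop(resolution),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)

conditioning_image_transforms = transforms.Compose(
    [
        transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.CenterCrop(resolution),
        transforms.ToTensor(),
    ]
)
```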
We also check that the initial resolution is a multiple of 8. --- examples/controlnet/train_controlnet.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index c0b52291fc9b..d52e610ca52d 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -525,6 +525,11 @@ def parse_args(input_args=None): " or the same number of `--validation_prompt`s and `--validation_image`s" ) + if args.resolution % 8 != 0: + raise ValueError( + "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder." + ) + return args @@ -607,6 +612,7 @@ def tokenize_captions(examples, is_train=True): image_transforms = transforms.Compose( [ transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution), transforms.ToTensor(), transforms.Normalize([0.5], [0.5]), ] @@ -615,6 +621,7 @@ def tokenize_captions(examples, is_train=True): conditioning_image_transforms = transforms.Compose( [ transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution), transforms.ToTensor(), ] ) From 3979aac996213fe48e03bd95384b9858dd69a2f0 Mon Sep 17 00:00:00 2001 From: nupurkmr9 Date: Thu, 20 Apr 2023 03:31:42 -0400 Subject: [PATCH 34/71] adding custom diffusion training to diffusers examples (#3031) * diffusers==0.14.0 update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion * custom diffusion * custom diffusion * custom diffusion * custom diffusion * apply formatting and get rid of bare except. * refactor readme and other minor changes. * misc refactor. * fix: repo_id issue and loaders logging bug. * fix: save_model_card. * fix: save_model_card. * fix: save_model_card. * add: doc entry. * refactor doc,. * custom diffusion * custom diffusion * custom diffusion * apply style. * remove tralining whitespace. * fix: toctree entry. * remove unnecessary print. 
* custom diffusion * custom diffusion * custom diffusion test * custom diffusion xformer update * custom diffusion xformer update * custom diffusion xformer update --------- Co-authored-by: Nupur Kumari Co-authored-by: Sayak Paul Co-authored-by: Patrick von Platen Co-authored-by: Nupur Kumari --- docs/source/en/_toctree.yml | 2 + docs/source/en/training/custom_diffusion.mdx | 287 ++++ docs/source/en/training/overview.mdx | 4 + examples/custom_diffusion/README.md | 280 ++++ examples/custom_diffusion/requirements.txt | 6 + examples/custom_diffusion/retrieve.py | 87 ++ .../train_custom_diffusion.py | 1289 +++++++++++++++++ examples/test_examples.py | 24 + src/diffusers/loaders.py | 70 +- src/diffusers/models/attention_processor.py | 189 +++ tests/models/test_models_unet_2d_condition.py | 141 +- 11 files changed, 2369 insertions(+), 10 deletions(-) create mode 100644 docs/source/en/training/custom_diffusion.mdx create mode 100644 examples/custom_diffusion/README.md create mode 100644 examples/custom_diffusion/requirements.txt create mode 100644 examples/custom_diffusion/retrieve.py create mode 100644 examples/custom_diffusion/train_custom_diffusion.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index df41854a9fe7..de33ba616d0a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -74,6 +74,8 @@ title: ControlNet - local: training/instructpix2pix title: InstructPix2Pix Training + - local: training/custom_diffusion + title: Custom Diffusion title: Training - sections: - local: using-diffusers/rl diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx new file mode 100644 index 000000000000..1e1958e1c946 --- /dev/null +++ b/docs/source/en/training/custom_diffusion.mdx @@ -0,0 +1,287 @@ + + +# Custom Diffusion training example + +[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject. +The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion. + +## Running locally with PyTorch + +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: + +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install -e . +``` + +Then cd in the example folder and run + +```bash +pip install -r requirements.txt +pip install clip-retrieval +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Or for a default accelerate configuration without answering questions about your environment + +```bash +accelerate config default +``` + +Or if your environment doesn't support an interactive shell e.g. a notebook + +```python +from accelerate.utils import write_basic_config + +write_basic_config() +``` +### Cat example 😺 + +Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. 
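For reference, a purely illustrative way to do the download and extraction from Python with only the standard library; the target directory and the `data/` layout inside the archive are assumptions based on the `INSTANCE_DIR="./data/cat"` path used below:

```python
# Illustrative sketch only: fetch the example archive and unpack it locally.
import io
import urllib.request
import zipfile

url = "https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip"
with urllib.request.urlopen(url) as response:
    archive = zipfile.ZipFile(io.BytesIO(response.read()))
archive.extractall(".")  # assumed to produce e.g. ./data/cat
```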
+ +We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`. +The `class_prompt` should be the category name same as target image. The collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images use this command first before training. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --num_class_images 200 +``` + +**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" +export INSTANCE_DIR="./data/cat" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_cat/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="cat" --num_class_images=200 \ + --instance_prompt="photo of a cat" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=250 \ + --scale_lr --hflip \ + --modifier_token "" +``` + +**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.** + +To track your experiments using Weights and Biases (`wandb`) and to save intermediate results (whcih we HIGHLY recommend), follow these steps: + +* Install `wandb`: `pip install wandb`. +* Authorize: `wandb login`. +* Then specify a `validation_prompt` and set `report_to` to `wandb` while launching training. You can also configure the following related arguments: + * `num_validation_images` + * `validation_steps` + +Here is an example command: + +```bash +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_cat/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="cat" --num_class_images=200 \ + --instance_prompt="photo of a cat" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=250 \ + --scale_lr --hflip \ + --modifier_token "" \ + --validation_prompt=" cat sitting in a bucket" \ + --report_to="wandb" +``` + +Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/26ghrcau) where you can check out the intermediate results along with other training details. + +If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub. Here is an [example repository](https://huggingface.co/sayakpaul/custom-diffusion-cat). 
+ +### Training on multiple concepts 🐱🪵 + +Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py). + +To collect the real images run this command for each concept in the json file. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt {} --class_data_dir {} --num_class_images 200 +``` + +And then we're ready to start training! + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --output_dir=$OUTPUT_DIR \ + --concepts_list=./concept_list.json \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=500 \ + --num_class_images=200 \ + --scale_lr --hflip \ + --modifier_token "+" +``` + +Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/3990tzkg) where you can check out the intermediate results along with other training details. + +### Training on human faces + +For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images. + +To collect the real images use this command first before training. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt person --class_data_dir real_reg/samples_person --num_class_images 200 +``` + +Then start training! + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" +export INSTANCE_DIR="path-to-images" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_person/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="person" --num_class_images=200 \ + --instance_prompt="photo of a person" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=5e-6 \ + --lr_warmup_steps=0 \ + --max_train_steps=1000 \ + --scale_lr --hflip --noaug \ + --freeze_model crossattn \ + --modifier_token "" \ + --enable_xformers_memory_efficient_attention +``` + +## Inference + +Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \ in above example) in your prompt. 
+ +```python +import torch +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda") +pipe.unet.load_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion("path-to-save-model", weight_name=".bin") + +image = pipe( + " cat sitting in a bucket", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("cat.png") +``` + +It's possible to directly load these parameters from a Hub repository: + +```python +import torch +from huggingface_hub.repocard import RepoCard +from diffusers import DiffusionPipeline + +model_id = "sayakpaul/custom-diffusion-cat" +card = RepoCard.load(model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda") +pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") + +image = pipe( + " cat sitting in a bucket", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("cat.png") +``` + +Here is an example of performing inference with multiple concepts: + +```python +import torch +from huggingface_hub.repocard import RepoCard +from diffusers import DiffusionPipeline + +model_id = "sayakpaul/custom-diffusion-cat-wooden-pot" +card = RepoCard.load(model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda") +pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") + +image = pipe( + "the cat sculpture in the style of a wooden pot", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("multi-subject.png") +``` + +Here, `cat` and `wooden pot` refer to the multiple concepts. + +### Inference from a training checkpoint + +You can also perform inference from one of the complete checkpoint saved during the training process, if you used the `--checkpointing_steps` argument. + +TODO. + +## Set grads to none +To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument. + +More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html + +## Experimental results +You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. diff --git a/docs/source/en/training/overview.mdx b/docs/source/en/training/overview.mdx index 5ad3a1f06cc1..c5cea3bb0a96 100644 --- a/docs/source/en/training/overview.mdx +++ b/docs/source/en/training/overview.mdx @@ -39,6 +39,8 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie - [Dreambooth](./dreambooth) - [LoRA Support](./lora) - [ControlNet](./controlnet) +- [InstructPix2Pix](./instructpix2pix) +- [Custom Diffusion](./custom_diffusion) If possible, please [install xFormers](../optimization/xformers) for memory efficient attention. This could help make your training faster and less memory intensive. 
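As an aside on the xFormers recommendation above: enabling it on an already loaded pipeline is a one-liner. A rough sketch, assuming the `xformers` package is installed and a CUDA GPU is available (the checkpoint id is only an example):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
).to("cuda")
# Replace the default attention processors with xFormers' memory-efficient ones.
pipe.enable_xformers_memory_efficient_attention()
```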
@@ -50,6 +52,8 @@ If possible, please [install xFormers](../optimization/xformers) for memory effi | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb) | [**Training with LoRA**](./lora) | ✅ | - | - | | [**ControlNet**](./controlnet) | ✅ | ✅ | - | +| [**InstructPix2Pix**](./instructpix2pix) | ✅ | ✅ | - | +| [**Custom Diffusion**](./custom_diffusion) | ✅ | ✅ | - | ## Community diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md new file mode 100644 index 000000000000..ecd972737bc3 --- /dev/null +++ b/examples/custom_diffusion/README.md @@ -0,0 +1,280 @@ +# Custom Diffusion training example + +[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject. +The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion. + +## Running locally with PyTorch + +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: + +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install -e . +``` + +Then cd in the example folder and run + +```bash +pip install -r requirements.txt +pip install clip-retrieval +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Or for a default accelerate configuration without answering questions about your environment + +```bash +accelerate config default +``` + +Or if your environment doesn't support an interactive shell e.g. a notebook + +```python +from accelerate.utils import write_basic_config +write_basic_config() +``` +### Cat example 😺 + +Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. + +We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`. +The `class_prompt` should be the category name same as target image. The collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images use this command first before training. 
+ +```bash +pip install clip-retrieval +python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --num_class_images 200 +``` + +**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" +export INSTANCE_DIR="./data/cat" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_cat/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="cat" --num_class_images=200 \ + --instance_prompt="photo of a cat" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=250 \ + --scale_lr --hflip \ + --modifier_token "" +``` + +**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.** + +To track your experiments using Weights and Biases (`wandb`) and to save intermediate results (whcih we HIGHLY recommend), follow these steps: + +* Install `wandb`: `pip install wandb`. +* Authorize: `wandb login`. +* Then specify a `validation_prompt` and set `report_to` to `wandb` while launching training. You can also configure the following related arguments: + * `num_validation_images` + * `validation_steps` + +Here is an example command: + +```bash +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_cat/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="cat" --num_class_images=200 \ + --instance_prompt="photo of a cat" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=250 \ + --scale_lr --hflip \ + --modifier_token "" \ + --validation_prompt=" cat sitting in a bucket" \ + --report_to="wandb" +``` + +Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/26ghrcau) where you can check out the intermediate results along with other training details. + +If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub. Here is an [example repository](https://huggingface.co/sayakpaul/custom-diffusion-cat). + +### Training on multiple concepts 🐱🪵 + +Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py). + +To collect the real images run this command for each concept in the json file. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt {} --class_data_dir {} --num_class_images 200 +``` + +And then we're ready to start training! 
+ +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --output_dir=$OUTPUT_DIR \ + --concepts_list=./concept_list.json \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=1e-5 \ + --lr_warmup_steps=0 \ + --max_train_steps=500 \ + --num_class_images=200 \ + --scale_lr --hflip \ + --modifier_token "+" +``` + +Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/3990tzkg) where you can check out the intermediate results along with other training details. + +### Training on human faces + +For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images. + +To collect the real images use this command first before training. + +```bash +pip install clip-retrieval +python retrieve.py --class_prompt person --class_data_dir real_reg/samples_person --num_class_images 200 +``` + +Then start training! + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export OUTPUT_DIR="path-to-save-model" +export INSTANCE_DIR="path-to-images" + +accelerate launch train_custom_diffusion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --class_data_dir=./real_reg/samples_person/ \ + --with_prior_preservation --real_prior --prior_loss_weight=1.0 \ + --class_prompt="person" --num_class_images=200 \ + --instance_prompt="photo of a person" \ + --resolution=512 \ + --train_batch_size=2 \ + --learning_rate=5e-6 \ + --lr_warmup_steps=0 \ + --max_train_steps=1000 \ + --scale_lr --hflip --noaug \ + --freeze_model crossattn \ + --modifier_token "" \ + --enable_xformers_memory_efficient_attention +``` + +## Inference + +Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \ in above example) in your prompt. 
+ +```python +import torch +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 +).to("cuda") +pipe.unet.load_attn_procs( + "path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin" +) +pipe.load_textual_inversion("path-to-save-model", weight_name=".bin") + +image = pipe( + " cat sitting in a bucket", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("cat.png") +``` + +It's possible to directly load these parameters from a Hub repository: + +```python +import torch +from huggingface_hub.repocard import RepoCard +from diffusers import DiffusionPipeline + +model_id = "sayakpaul/custom-diffusion-cat" +card = RepoCard.load(model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to( +"cuda") +pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") + +image = pipe( + " cat sitting in a bucket", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("cat.png") +``` + +Here is an example of performing inference with multiple concepts: + +```python +import torch +from huggingface_hub.repocard import RepoCard +from diffusers import DiffusionPipeline + +model_id = "sayakpaul/custom-diffusion-cat-wooden-pot" +card = RepoCard.load(model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to( +"cuda") +pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") +pipe.load_textual_inversion(model_id, weight_name=".bin") + +image = pipe( + "the cat sculpture in the style of a wooden pot", + num_inference_steps=100, + guidance_scale=6.0, + eta=1.0, +).images[0] +image.save("multi-subject.png") +``` + +Here, `cat` and `wooden pot` refer to the multiple concepts. + +### Inference from a training checkpoint + +You can also perform inference from one of the complete checkpoint saved during the training process, if you used the `--checkpointing_steps` argument. + +TODO. + +## Set grads to none +To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument. + +More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html + +## Experimental results +You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. \ No newline at end of file diff --git a/examples/custom_diffusion/requirements.txt b/examples/custom_diffusion/requirements.txt new file mode 100644 index 000000000000..7d93f3d03bd8 --- /dev/null +++ b/examples/custom_diffusion/requirements.txt @@ -0,0 +1,6 @@ +accelerate +torchvision +transformers>=4.25.1 +ftfy +tensorboard +Jinja2 diff --git a/examples/custom_diffusion/retrieve.py b/examples/custom_diffusion/retrieve.py new file mode 100644 index 000000000000..7b7635c1887d --- /dev/null +++ b/examples/custom_diffusion/retrieve.py @@ -0,0 +1,87 @@ +# Copyright 2023 Custom Diffusion authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from io import BytesIO +from pathlib import Path + +import requests +from clip_retrieval.clip_client import ClipClient +from PIL import Image +from tqdm import tqdm + + +def retrieve(class_prompt, class_data_dir, num_class_images): + factor = 1.5 + num_images = int(factor * num_class_images) + client = ClipClient( + url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1 + ) + + os.makedirs(f"{class_data_dir}/images", exist_ok=True) + if len(list(Path(f"{class_data_dir}/images").iterdir())) >= num_class_images: + return + + while True: + class_images = client.query(text=class_prompt) + if len(class_images) >= factor * num_class_images or num_images > 1e4: + break + else: + num_images = int(factor * num_images) + client = ClipClient( + url="https://knn.laion.ai/knn-service", + indice_name="laion_400m", + num_images=num_images, + aesthetic_weight=0.1, + ) + + count = 0 + total = 0 + pbar = tqdm(desc="downloading real regularization images", total=num_class_images) + + with open(f"{class_data_dir}/caption.txt", "w") as f1, open(f"{class_data_dir}/urls.txt", "w") as f2, open( + f"{class_data_dir}/images.txt", "w" + ) as f3: + while total < num_class_images: + images = class_images[count] + count += 1 + try: + img = requests.get(images["url"]) + if img.status_code == 200: + _ = Image.open(BytesIO(img.content)) + with open(f"{class_data_dir}/images/{total}.jpg", "wb") as f: + f.write(img.content) + f1.write(images["caption"] + "\n") + f2.write(images["url"] + "\n") + f3.write(f"{class_data_dir}/images/{total}.jpg" + "\n") + total += 1 + pbar.update(1) + else: + continue + except Exception: + continue + return + + +def parse_args(): + parser = argparse.ArgumentParser("", add_help=False) + parser.add_argument("--class_prompt", help="text prompt to retrieve images", required=True, type=str) + parser.add_argument("--class_data_dir", help="path to save images", required=True, type=str) + parser.add_argument("--num_class_images", help="number of images to download", default=200, type=int) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + retrieve(args.class_prompt, args.class_data_dir, args.num_class_images) diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py new file mode 100644 index 000000000000..49b05e6b5db3 --- /dev/null +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -0,0 +1,1289 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 Custom Diffusion authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import hashlib +import itertools +import json +import logging +import math +import os +import random +import warnings +from pathlib import Path + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from huggingface_hub import HfApi, create_repo +from packaging import version +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +import diffusers +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.loaders import AttnProcsLayers +from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.import_utils import is_xformers_available + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.15.0.dev0") + +logger = get_logger(__name__) + + +def freeze_params(params): + for param in params: + param.requires_grad = False + + +def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_folder=None): + img_str = "" + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + yaml = f""" +--- +license: creativeml-openrail-m +base_model: {base_model} +instance_prompt: {prompt} +tags: +- stable-diffusion +- stable-diffusion-diffusers +- text-to-image +- diffusers +- custom-diffusion +inference: true +--- + """ + model_card = f""" +# Custom Diffusion - {repo_id} + +These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n +{img_str} + +\nFor more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
+""" + with open(os.path.join(repo_folder, "README.md"), "w") as f: + f.write(yaml + model_card) + + +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, + subfolder="text_encoder", + revision=revision, + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "RobertaSeriesModelWithTransformation": + from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation + + return RobertaSeriesModelWithTransformation + else: + raise ValueError(f"{model_class} is not supported.") + + +def collate_fn(examples, with_prior_preservation): + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + mask = [example["mask"] for example in examples] + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. + if with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + mask += [example["class_mask"] for example in examples] + + input_ids = torch.cat(input_ids, dim=0) + pixel_values = torch.stack(pixel_values) + mask = torch.stack(mask) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + mask = mask.to(memory_format=torch.contiguous_format).float() + + batch = {"input_ids": input_ids, "pixel_values": pixel_values, "mask": mask.unsqueeze(1)} + return batch + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +class CustomDiffusionDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. 
+ """ + + def __init__( + self, + concepts_list, + tokenizer, + size=512, + mask_size=64, + center_crop=False, + with_prior_preservation=False, + num_class_images=200, + hflip=False, + aug=True, + ): + self.size = size + self.mask_size = mask_size + self.center_crop = center_crop + self.tokenizer = tokenizer + self.interpolation = Image.BILINEAR + self.aug = aug + + self.instance_images_path = [] + self.class_images_path = [] + self.with_prior_preservation = with_prior_preservation + for concept in concepts_list: + inst_img_path = [ + (x, concept["instance_prompt"]) for x in Path(concept["instance_data_dir"]).iterdir() if x.is_file() + ] + self.instance_images_path.extend(inst_img_path) + + if with_prior_preservation: + class_data_root = Path(concept["class_data_dir"]) + if os.path.isdir(class_data_root): + class_images_path = list(class_data_root.iterdir()) + class_prompt = [concept["class_prompt"] for _ in range(len(class_images_path))] + else: + with open(class_data_root, "r") as f: + class_images_path = f.read().splitlines() + with open(concept["class_prompt"], "r") as f: + class_prompt = f.read().splitlines() + + class_img_path = [(x, y) for (x, y) in zip(class_images_path, class_prompt)] + self.class_images_path.extend(class_img_path[:num_class_images]) + + random.shuffle(self.instance_images_path) + self.num_instance_images = len(self.instance_images_path) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.flip = transforms.RandomHorizontalFlip(0.5 * hflip) + + self.image_transforms = transforms.Compose( + [ + self.flip, + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def preprocess(self, image, scale, resample): + outer, inner = self.size, scale + factor = self.size // self.mask_size + if scale > self.size: + outer, inner = scale, self.size + top, left = np.random.randint(0, outer - inner + 1), np.random.randint(0, outer - inner + 1) + image = image.resize((scale, scale), resample=resample) + image = np.array(image).astype(np.uint8) + image = (image / 127.5 - 1.0).astype(np.float32) + instance_image = np.zeros((self.size, self.size, 3), dtype=np.float32) + mask = np.zeros((self.size // factor, self.size // factor)) + if scale > self.size: + instance_image = image[top : top + inner, left : left + inner, :] + mask = np.ones((self.size // factor, self.size // factor)) + else: + instance_image[top : top + inner, left : left + inner, :] = image + mask[ + top // factor + 1 : (top + scale) // factor - 1, left // factor + 1 : (left + scale) // factor - 1 + ] = 1.0 + return instance_image, mask + + def __getitem__(self, index): + example = {} + instance_image, instance_prompt = self.instance_images_path[index % self.num_instance_images] + instance_image = Image.open(instance_image) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + instance_image = self.flip(instance_image) + + # apply resize augmentation and create a valid image region mask + random_scale = self.size + if self.aug: + random_scale = ( + np.random.randint(self.size // 3, self.size + 1) + if np.random.uniform() < 0.66 + else np.random.randint(int(1.2 * self.size), int(1.4 * self.size)) + ) + instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation) + + 
if random_scale < 0.6 * self.size: + instance_prompt = np.random.choice(["a far away ", "very small "]) + instance_prompt + elif random_scale > self.size: + instance_prompt = np.random.choice(["zoomed in ", "close up "]) + instance_prompt + + example["instance_images"] = torch.from_numpy(instance_image).permute(2, 0, 1) + example["mask"] = torch.from_numpy(mask) + example["instance_prompt_ids"] = self.tokenizer( + instance_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + if self.with_prior_preservation: + class_image, class_prompt = self.class_images_path[index % self.num_class_images] + class_image = Image.open(class_image) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_mask"] = torch.ones_like(example["mask"]) + example["class_prompt_ids"] = self.tokenizer( + class_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + return example + + +def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_dir): + """Saves the new token embeddings from the text encoder.""" + logger.info("Saving embeddings") + learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight + for x, y in zip(modifier_token_id, args.modifier_token): + learned_embeds_dict = {} + learned_embeds_dict[y] = learned_embeds[x] + torch.save(learned_embeds_dict, f"{output_dir}/{y}.bin") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Custom Diffusion training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=2, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=50, + help=( + "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." 
+ ), + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument( + "--real_prior", + default=False, + action="store_true", + help="real images as prior.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=200, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="custom-diffusion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=250, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' 
+ ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=2, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument( + "--freeze_model", + type=str, + default="crossattn_kv", + choices=["crossattn_kv", "crossattn"], + help="crossattn to enable fine-tuning of all params in the cross attention", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). 
Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32." + ), + ) + parser.add_argument( + "--concepts_list", + type=str, + default=None, + help="Path to json containing multiple concepts, will overwrite parameters like instance_prompt, class_prompt, etc.", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." + ) + parser.add_argument( + "--set_grads_to_none", + action="store_true", + help=( + "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain" + " behaviors, so disable this argument if it causes any problems. More info:" + " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" + ), + ) + parser.add_argument( + "--modifier_token", + type=str, + default=None, + help="A token to use as a modifier for the concept.", + ) + parser.add_argument( + "--initializer_token", type=str, default="ktn+pll+ucd", help="A token to use as initializer word." + ) + parser.add_argument("--hflip", action="store_true", help="Apply horizontal flip data augmentation.") + parser.add_argument( + "--noaug", + action="store_true", + help="Dont apply augmentation during data augmentation when this flag is enabled.", + ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.with_prior_preservation: + if args.concepts_list is None: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + # logger is not available yet + if args.class_data_dir is not None: + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") + if args.class_prompt is not None: + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + + return args + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + logging_dir=logging_dir, + project_config=accelerator_project_config, + ) + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + import wandb + + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. 
For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("custom-diffusion", config=vars(args)) + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + if args.concepts_list is None: + args.concepts_list = [ + { + "instance_prompt": args.instance_prompt, + "class_prompt": args.class_prompt, + "instance_data_dir": args.instance_data_dir, + "class_data_dir": args.class_data_dir, + } + ] + else: + with open(args.concepts_list, "r") as f: + args.concepts_list = json.load(f) + + # Generate class images if prior preservation is enabled. + if args.with_prior_preservation: + for i, concept in enumerate(args.concepts_list): + class_images_dir = Path(concept["class_data_dir"]) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True, exist_ok=True) + if args.real_prior: + assert ( + class_images_dir / "images" + ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}" + assert ( + len(list((class_images_dir / "images").iterdir())) == args.num_class_images + ), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}" + assert ( + class_images_dir / "caption.txt" + ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}" + assert ( + class_images_dir / "images.txt" + ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}" + concept["class_prompt"] = os.path.join(class_images_dir, "caption.txt") + concept["class_data_dir"] = os.path.join(class_images_dir, "images.txt") + args.concepts_list[i] = concept + accelerator.wait_for_everyone() + else: + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of 
class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, + desc="Generating class images", + disable=not accelerator.is_local_main_process, + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = ( + class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + ) + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name, + revision=args.revision, + use_fast=False, + ) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + # Adding a modifier token which is optimized #### + # Code taken from https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py + modifier_token_id = [] + initializer_token_id = [] + if args.modifier_token is not None: + args.modifier_token = args.modifier_token.split("+") + args.initializer_token = args.initializer_token.split("+") + if len(args.modifier_token) > len(args.initializer_token): + raise ValueError("You must specify + separated initializer token for each modifier token.") + for modifier_token, initializer_token in zip( + args.modifier_token, args.initializer_token[: len(args.modifier_token)] + ): + # Add the placeholder token in tokenizer + num_added_tokens = tokenizer.add_tokens(modifier_token) + if num_added_tokens == 0: + raise ValueError( + f"The tokenizer already contains the token {modifier_token}. Please pass a different" + " `modifier_token` that is not already in the tokenizer." 
+ ) + + # Convert the initializer_token, placeholder_token to ids + token_ids = tokenizer.encode([initializer_token], add_special_tokens=False) + print(token_ids) + # Check if initializer_token is a single token or a sequence of tokens + if len(token_ids) > 1: + raise ValueError("The initializer token must be a single token.") + + initializer_token_id.append(token_ids[0]) + modifier_token_id.append(tokenizer.convert_tokens_to_ids(modifier_token)) + + # Resize the token embeddings as we are adding new special tokens to the tokenizer + text_encoder.resize_token_embeddings(len(tokenizer)) + + # Initialise the newly added placeholder token with the embeddings of the initializer token + token_embeds = text_encoder.get_input_embeddings().weight.data + for x, y in zip(modifier_token_id, initializer_token_id): + token_embeds[x] = token_embeds[y] + + # Freeze all parameters except for the token embeddings in text encoder + params_to_freeze = itertools.chain( + text_encoder.text_model.encoder.parameters(), + text_encoder.text_model.final_layer_norm.parameters(), + text_encoder.text_model.embeddings.position_embedding.parameters(), + ) + freeze_params(params_to_freeze) + ######################################################## + ######################################################## + + vae.requires_grad_(False) + if args.modifier_token is None: + text_encoder.requires_grad_(False) + unet.requires_grad_(False) + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move unet, vae and text_encoder to device and cast to weight_dtype + if accelerator.mixed_precision != "fp16" and args.modifier_token is not None: + text_encoder.to(accelerator.device, dtype=weight_dtype) + unet.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=weight_dtype) + + attention_class = CustomDiffusionAttnProcessor + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warn( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) + attention_class = CustomDiffusionXFormersAttnProcessor + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + # now we will add new Custom Diffusion weights to the attention layers + # It's important to realize here how many attention weights will be added and of which sizes + # The sizes of the attention layers consist only of two different variables: + # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`. + # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`. + + # Let's first see how many attention processors we will have to set. 
+ # For Stable Diffusion, it should be equal to: + # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12 + # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2 + # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18 + # => 32 layers + + # Only train key, value projection layers if freeze_model = 'crossattn_kv' else train all params in the cross attention layer + train_kv = True + train_q_out = False if args.freeze_model == "crossattn_kv" else True + custom_diffusion_attn_procs = {} + + st = unet.state_dict() + for name, _ in unet.attn_processors.items(): + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + layer_name = name.split(".processor")[0] + weights = { + "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"], + "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"], + } + if train_q_out: + weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"] + weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"] + weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"] + if cross_attention_dim is not None: + custom_diffusion_attn_procs[name] = attention_class( + train_kv=train_kv, + train_q_out=train_q_out, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ).to(unet.device) + custom_diffusion_attn_procs[name].load_state_dict(weights) + else: + custom_diffusion_attn_procs[name] = attention_class( + train_kv=False, + train_q_out=False, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ) + del st + unet.set_attn_processor(custom_diffusion_attn_procs) + custom_diffusion_layers = AttnProcsLayers(unet.attn_processors) + + accelerator.register_for_checkpointing(custom_diffusion_layers) + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + if args.modifier_token is not None: + text_encoder.gradient_checkpointing_enable() + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + if args.with_prior_preservation: + args.learning_rate = args.learning_rate * 2.0 + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
+ ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + optimizer = optimizer_class( + itertools.chain(text_encoder.get_input_embeddings().parameters(), custom_diffusion_layers.parameters()) + if args.modifier_token is not None + else custom_diffusion_layers.parameters(), + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Dataset and DataLoaders creation: + train_dataset = CustomDiffusionDataset( + concepts_list=args.concepts_list, + tokenizer=tokenizer, + with_prior_preservation=args.with_prior_preservation, + size=args.resolution, + mask_size=vae.encode( + torch.randn(1, 3, args.resolution, args.resolution).to(dtype=weight_dtype).to(accelerator.device) + ) + .latent_dist.sample() + .size()[-1], + center_crop=args.center_crop, + num_class_images=args.num_class_images, + hflip=args.hflip, + aug=not args.noaug, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + num_workers=args.dataloader_num_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + if args.modifier_token is not None: + custom_diffusion_layers, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + custom_diffusion_layers, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + custom_diffusion_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + custom_diffusion_layers, optimizer, train_dataloader, lr_scheduler + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + + if path is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + args.resume_from_checkpoint = None + else: + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = global_step // num_update_steps_per_epoch + resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps) + + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + if args.modifier_token is not None: + text_encoder.train() + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + continue + + with accelerator.accumulate(unet), accelerator.accumulate(text_encoder): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * vae.config.scaling_factor + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. 
+ model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + mask = torch.chunk(batch["mask"], 2, dim=0)[0] + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean() + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + mask = batch["mask"] + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean() + accelerator.backward(loss) + # Zero out the gradients for all token embeddings except the newly added + # embeddings for the concept, as we only want to optimize the concept embeddings + if args.modifier_token is not None: + if accelerator.num_processes > 1: + grads_text_encoder = text_encoder.module.get_input_embeddings().weight.grad + else: + grads_text_encoder = text_encoder.get_input_embeddings().weight.grad + # Get the index for tokens that we want to zero the grads for + index_grads_to_zero = torch.arange(len(tokenizer)) != modifier_token_id[0] + for i in range(len(modifier_token_id[1:])): + index_grads_to_zero = index_grads_to_zero & ( + torch.arange(len(tokenizer)) != modifier_token_id[i] + ) + grads_text_encoder.data[index_grads_to_zero, :] = grads_text_encoder.data[ + index_grads_to_zero, : + ].fill_(0) + + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(text_encoder.parameters(), custom_diffusion_layers.parameters()) + if args.modifier_token is not None + else custom_diffusion_layers.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad(set_to_none=args.set_grads_to_none) + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + if global_step % args.checkpointing_steps == 0: + if accelerator.is_main_process: + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + accelerator.save_state(save_path) + logger.info(f"Saved state to {save_path}") + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if global_step >= args.max_train_steps: + break + + if accelerator.is_main_process: + if args.validation_prompt is not None and global_step % args.validation_steps == 0: + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." 
+ ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + tokenizer=tokenizer, + revision=args.revision, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + images = [ + pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.0).images[0] + for _ in range(args.num_validation_images) + ] + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + + # Save the custom diffusion layers + accelerator.wait_for_everyone() + if accelerator.is_main_process: + unet = unet.to(torch.float32) + unet.save_attn_procs(args.output_dir) + save_new_embed(text_encoder, modifier_token_id, accelerator, args, args.output_dir) + + # Final inference + # Load previous pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + + # load attention processors + pipeline.unet.load_attn_procs(args.output_dir, weight_name="pytorch_custom_diffusion_weights.bin") + for token in args.modifier_token: + pipeline.load_textual_inversion(args.output_dir, weight_name=f"{token}.bin") + + # run inference + if args.validation_prompt and args.num_validation_images > 0: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + images = [ + pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.0).images[0] + for _ in range(args.num_validation_images) + ] + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "test": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + if args.push_to_hub: + save_model_card( + repo_id, + images=images, + base_model=args.pretrained_model_name_or_path, + prompt=args.instance_prompt, + repo_folder=args.output_dir, + ) + api = HfApi(token=args.hub_token) + api.upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) + + accelerator.end_training() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/test_examples.py b/examples/test_examples.py index d9a1f86e53aa..a77fa4c7da23 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -221,6 +221,30 @@ def test_dreambooth_checkpointing(self): self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4"))) self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6"))) + 
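As a condensed illustration of the final-inference block in the training script earlier in this patch, the sketch below shows roughly how the saved Custom Diffusion attention processors and the learned modifier-token embedding get loaded back into a pipeline. The base model id, output path, and the `<new1>` token are placeholder assumptions for illustration, not values taken from this patch.

```python
# Illustrative sketch only: base model, paths, and the <new1> token are placeholders.
# It mirrors the loading calls used in the training script's final-inference block.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
).to("cuda")

# Attention processors written by `unet.save_attn_procs(args.output_dir)` during training.
pipe.unet.load_attn_procs("path/to/output_dir", weight_name="pytorch_custom_diffusion_weights.bin")
# Embedding of the new modifier token written by `save_new_embed(...)`.
pipe.load_textual_inversion("path/to/output_dir", weight_name="<new1>.bin")

image = pipe("<new1> cat sitting on a park bench", num_inference_steps=25, eta=1.0).images[0]
image.save("custom_diffusion_sample.png")
```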
def test_custom_diffusion(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/custom_diffusion/train_custom_diffusion.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir docs/source/en/imgs + --instance_prompt + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 1.0e-05 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --modifier_token + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_custom_diffusion_weights.bin"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdir, ".bin"))) + def test_text_to_image(self): with tempfile.TemporaryDirectory() as tmpdir: test_args = f""" diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 3133da117390..82c1ac61ca9e 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -19,7 +19,11 @@ import torch from huggingface_hub import hf_hub_download -from .models.attention_processor import LoRAAttnProcessor +from .models.attention_processor import ( + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + LoRAAttnProcessor, +) from .utils import ( DIFFUSERS_CACHE, HF_HUB_OFFLINE, @@ -48,6 +52,9 @@ TEXT_INVERSION_NAME = "learned_embeds.bin" TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors" +CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin" +CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors" + class AttnProcsLayers(torch.nn.Module): def __init__(self, state_dict: Dict[str, torch.Tensor]): @@ -215,6 +222,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict attn_processors = {} is_lora = all("lora" in k for k in state_dict.keys()) + is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys()) if is_lora: lora_grouped_dict = defaultdict(dict) @@ -231,9 +239,38 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank ) attn_processors[key].load_state_dict(value_dict) - + elif is_custom_diffusion: + custom_diffusion_grouped_dict = defaultdict(dict) + for key, value in state_dict.items(): + if len(value) == 0: + custom_diffusion_grouped_dict[key] = {} + else: + if "to_out" in key: + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + else: + attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:]) + custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in custom_diffusion_grouped_dict.items(): + if len(value_dict) == 0: + attn_processors[key] = CustomDiffusionAttnProcessor( + train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None + ) + else: + cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1] + hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0] + train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False + attn_processors[key] = CustomDiffusionAttnProcessor( + train_kv=True, + train_q_out=train_q_out, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ) + attn_processors[key].load_state_dict(value_dict) else: - raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.") + 
raise ValueError( + f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training." + ) # set correct dtype & device attn_processors = {k: v.to(device=self.device, dtype=self.dtype) for k, v in attn_processors.items()} @@ -287,16 +324,31 @@ def save_function(weights, filename): os.makedirs(save_directory, exist_ok=True) - model_to_save = AttnProcsLayers(self.attn_processors) - - # Save the model - state_dict = model_to_save.state_dict() + is_custom_diffusion = any( + isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) + for (_, x) in self.attn_processors.items() + ) + if is_custom_diffusion: + model_to_save = AttnProcsLayers( + { + y: x + for (y, x) in self.attn_processors.items() + if isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) + } + ) + state_dict = model_to_save.state_dict() + for name, attn in self.attn_processors.items(): + if len(attn.state_dict()) == 0: + state_dict[name] = {} + else: + model_to_save = AttnProcsLayers(self.attn_processors) + state_dict = model_to_save.state_dict() if weight_name is None: if safe_serialization: - weight_name = LORA_WEIGHT_NAME_SAFE + weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else LORA_WEIGHT_NAME_SAFE else: - weight_name = LORA_WEIGHT_NAME + weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else LORA_WEIGHT_NAME # Save the model save_function(state_dict, os.path.join(save_directory, weight_name)) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index f2a5a376bf39..b8787aed91f2 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -149,6 +149,9 @@ def set_use_memory_efficient_attention_xformers( is_lora = hasattr(self, "processor") and isinstance( self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor) ) + is_custom_diffusion = hasattr(self, "processor") and isinstance( + self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor) + ) if use_memory_efficient_attention_xformers: if self.added_kv_proj_dim is not None: @@ -192,6 +195,17 @@ def set_use_memory_efficient_attention_xformers( ) processor.load_state_dict(self.processor.state_dict()) processor.to(self.processor.to_q_lora.up.weight.device) + elif is_custom_diffusion: + processor = CustomDiffusionXFormersAttnProcessor( + train_kv=self.processor.train_kv, + train_q_out=self.processor.train_q_out, + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + attention_op=attention_op, + ) + processor.load_state_dict(self.processor.state_dict()) + if hasattr(self.processor, "to_k_custom_diffusion"): + processor.to(self.processor.to_k_custom_diffusion.weight.device) else: processor = XFormersAttnProcessor(attention_op=attention_op) else: @@ -203,6 +217,16 @@ def set_use_memory_efficient_attention_xformers( ) processor.load_state_dict(self.processor.state_dict()) processor.to(self.processor.to_q_lora.up.weight.device) + elif is_custom_diffusion: + processor = CustomDiffusionAttnProcessor( + train_kv=self.processor.train_kv, + train_q_out=self.processor.train_q_out, + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + ) + processor.load_state_dict(self.processor.state_dict()) + if hasattr(self.processor, "to_k_custom_diffusion"): + processor.to(self.processor.to_k_custom_diffusion.weight.device) else: processor = 
AttnProcessor() @@ -459,6 +483,84 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a return hidden_states +class CustomDiffusionAttnProcessor(nn.Module): + def __init__( + self, + train_kv=True, + train_q_out=True, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + + # `_custom_diffusion` id for easy serialization and loading. + if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states) + value = self.to_v_custom_diffusion(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + class AttnAddedKVProcessor: def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): residual = hidden_states @@ -699,6 +801,91 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a return hidden_states +class CustomDiffusionXFormersAttnProcessor(nn.Module): + def __init__( + self, + train_kv=True, + train_q_out=False, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + attention_op: Optional[Callable] = None, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.attention_op = attention_op + + # `_custom_diffusion` id for easy serialization and loading. 
+ if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states) + value = self.to_v_custom_diffusion(encoder_hidden_states) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + query = attn.head_to_batch_dim(query).contiguous() + key = attn.head_to_batch_dim(key).contiguous() + value = attn.head_to_batch_dim(value).contiguous() + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + return hidden_states + + class SlicedAttnProcessor: def __init__(self, slice_size): self.slice_size = slice_size @@ -834,4 +1021,6 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, AttnAddedKVProcessor2_0, LoRAAttnProcessor, LoRAXFormersAttnProcessor, + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, ] diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 15f77fb8c106..2576297762a8 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -22,7 +22,7 @@ from parameterized import parameterized from diffusers import UNet2DConditionModel -from diffusers.models.attention_processor import LoRAAttnProcessor +from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, LoRAAttnProcessor from diffusers.utils import ( floats_tensor, load_hf_numpy, @@ -68,6 +68,55 @@ def create_lora_layers(model, mock_weights: bool = True): return lora_attn_procs +def create_custom_diffusion_layers(model, mock_weights: bool = True): + train_kv = True + train_q_out = True + custom_diffusion_attn_procs = {} + + st 
= model.state_dict() + for name, _ in model.attn_processors.items(): + cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = model.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(model.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = model.config.block_out_channels[block_id] + layer_name = name.split(".processor")[0] + weights = { + "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"], + "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"], + } + if train_q_out: + weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"] + weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"] + weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"] + if cross_attention_dim is not None: + custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( + train_kv=train_kv, + train_q_out=train_q_out, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ).to(model.device) + custom_diffusion_attn_procs[name].load_state_dict(weights) + if mock_weights: + # add 1 to weights to mock trained weights + with torch.no_grad(): + custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight += 1 + custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight += 1 + else: + custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( + train_kv=False, + train_q_out=False, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ) + del st + return custom_diffusion_attn_procs + + class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase): model_class = UNet2DConditionModel @@ -569,6 +618,96 @@ def test_lora_xformers_on_off(self): assert (sample - on_sample).abs().max() < 1e-4 assert (sample - off_sample).abs().max() < 1e-4 + def test_custom_diffusion_processors(self): + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["attention_head_dim"] = (8, 16) + + model = self.model_class(**init_dict) + model.to(torch_device) + + with torch.no_grad(): + sample1 = model(**inputs_dict).sample + + custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False) + + # make sure we can set a list of attention processors + model.set_attn_processor(custom_diffusion_attn_procs) + model.to(torch_device) + + # test that attn processors can be set to itself + model.set_attn_processor(model.attn_processors) + + with torch.no_grad(): + sample2 = model(**inputs_dict).sample + + assert (sample1 - sample2).abs().max() < 1e-4 + + def test_custom_diffusion_save_load(self): + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["attention_head_dim"] = (8, 16) + + torch.manual_seed(0) + model = self.model_class(**init_dict) + model.to(torch_device) + + with torch.no_grad(): + old_sample = model(**inputs_dict).sample + + custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False) + model.set_attn_processor(custom_diffusion_attn_procs) + + with torch.no_grad(): + sample = model(**inputs_dict).sample + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_attn_procs(tmpdirname) + 
self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_custom_diffusion_weights.bin"))) + torch.manual_seed(0) + new_model = self.model_class(**init_dict) + new_model.to(torch_device) + new_model.load_attn_procs(tmpdirname, weight_name="pytorch_custom_diffusion_weights.bin") + + with torch.no_grad(): + new_sample = new_model(**inputs_dict).sample + + assert (sample - new_sample).abs().max() < 1e-4 + + # custom diffusion and no custom diffusion should be the same + assert (sample - old_sample).abs().max() < 1e-4 + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_custom_diffusion_xformers_on_off(self): + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["attention_head_dim"] = (8, 16) + + torch.manual_seed(0) + model = self.model_class(**init_dict) + model.to(torch_device) + custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False) + model.set_attn_processor(custom_diffusion_attn_procs) + + # default + with torch.no_grad(): + sample = model(**inputs_dict).sample + + model.enable_xformers_memory_efficient_attention() + on_sample = model(**inputs_dict).sample + + model.disable_xformers_memory_efficient_attention() + off_sample = model(**inputs_dict).sample + + assert (sample - on_sample).abs().max() < 1e-4 + assert (sample - off_sample).abs().max() < 1e-4 + @slow class UNet2DConditionModelIntegrationTests(unittest.TestCase): From a121e05feb2e6e4ca02c7bc51fae7019e3005d18 Mon Sep 17 00:00:00 2001 From: Mishig Date: Thu, 20 Apr 2023 11:04:06 +0200 Subject: [PATCH 35/71] Update custom_diffusion.mdx (#3165) Add missing newlines for rendering the links correctly --- docs/source/en/training/custom_diffusion.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx index 1e1958e1c946..245d434adeda 100644 --- a/docs/source/en/training/custom_diffusion.mdx +++ b/docs/source/en/training/custom_diffusion.mdx @@ -279,9 +279,11 @@ You can also perform inference from one of the complete checkpoint saved during TODO. ## Set grads to none + To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument. More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html ## Experimental results + You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. From a5b242d30de029828b8279cf291774a66b6b6298 Mon Sep 17 00:00:00 2001 From: XinyuYe-Intel Date: Thu, 20 Apr 2023 18:55:42 +0800 Subject: [PATCH 36/71] Added distillation for quantization example on textual inversion. (#2760) * Added distillation for quantization example on textual inversion. Signed-off-by: Ye, Xinyu * refined readme and code style. Signed-off-by: Ye, Xinyu * Update text2images.py * refined code of model load and added compatibility check. Signed-off-by: Ye, Xinyu * fixed code style. 
Signed-off-by: Ye, Xinyu * fix C403 [*] Unnecessary `list` comprehension (rewrite as a `set` comprehension) Signed-off-by: Ye, Xinyu --------- Signed-off-by: Ye, Xinyu --- .../textual_inversion_dfq/README.md | 93 ++ .../textual_inversion_dfq/requirements.txt | 7 + .../textual_inversion_dfq/text2images.py | 112 ++ .../textual_inversion.py | 1018 +++++++++++++++++ 4 files changed, 1230 insertions(+) create mode 100644 examples/research_projects/intel_opts/textual_inversion_dfq/README.md create mode 100644 examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt create mode 100644 examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py create mode 100644 examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/README.md b/examples/research_projects/intel_opts/textual_inversion_dfq/README.md new file mode 100644 index 000000000000..4a227cdb4d63 --- /dev/null +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/README.md @@ -0,0 +1,93 @@ +# Distillation for quantization on Textual Inversion models to personalize text2image + +[Textual inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like stable diffusion on your own images._By using just 3-5 images new concepts can be taught to Stable Diffusion and the model personalized on your own images_ +The `textual_inversion.py` script shows how to implement the training procedure and adapt it for stable diffusion. +We have enabled distillation for quantization in `textual_inversion.py` to do quantization aware training as well as distillation on the model generated by Textual Inversion method. + +## Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +```bash +pip install -r requirements.txt +``` + +## Prepare Datasets + +One picture which is from the huggingface datasets [sd-concepts-library/dicoo2](https://huggingface.co/sd-concepts-library/dicoo2) is needed, and save it to the `./dicoo` directory. The picture is shown below: + + + + + +## Get a FP32 Textual Inversion model + +Use the following command to fine-tune the Stable Diffusion model on the above dataset to obtain the FP32 Textual Inversion model. + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export DATA_DIR="./dicoo" + +accelerate launch textual_inversion.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --train_data_dir=$DATA_DIR \ + --learnable_property="object" \ + --placeholder_token="" --initializer_token="toy" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --max_train_steps=3000 \ + --learning_rate=5.0e-04 --scale_lr \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="dicoo_model" +``` + +## Do distillation for quantization + +Distillation for quantization is a method that combines [intermediate layer knowledge distillation](https://github.com/intel/neural-compressor/blob/master/docs/source/distillation.md#intermediate-layer-knowledge-distillation) and [quantization aware training](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization.md#quantization-aware-training) in the same training process to improve the performance of the quantized model. 
Provided a FP32 model, the distillation for quantization approach will take this model itself as the teacher model and transfer the knowledges of the specified layers to the student model, i.e. quantized version of the FP32 model, during the quantization aware training process. + +Once you have the FP32 Textual Inversion model, the following command will take the FP32 Textual Inversion model as input to do distillation for quantization and generate the INT8 Textual Inversion model. + +```bash +export FP32_MODEL_NAME="./dicoo_model" +export DATA_DIR="./dicoo" + +accelerate launch textual_inversion.py \ + --pretrained_model_name_or_path=$FP32_MODEL_NAME \ + --train_data_dir=$DATA_DIR \ + --use_ema --learnable_property="object" \ + --placeholder_token="" --initializer_token="toy" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --max_train_steps=300 \ + --learning_rate=5.0e-04 --max_grad_norm=3 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="int8_model" \ + --do_quantization --do_distillation --verify_loading +``` + +After the distillation for quantization process, the quantized UNet would be 4 times smaller (3279MB -> 827MB). + +## Inference + +Once you have trained a INT8 model with the above command, the inference can be done simply using the `text2images.py` script. Make sure to include the `placeholder_token` in your prompt. + +```bash +export INT8_MODEL_NAME="./int8_model" + +python text2images.py \ + --pretrained_model_name_or_path=$INT8_MODEL_NAME \ + --caption "a lovely in red dress and hat, in the snowly and brightly night, with many brighly buildings." \ + --images_num 4 +``` + +Here is the comparison of images generated by the FP32 model (left) and INT8 model (right) respectively: + +

+ *(comparison images: FP32 model output on the left, INT8 model output on the right)*
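To make the distillation-for-quantization recipe described above a bit more concrete, here is a rough, purely conceptual PyTorch sketch of the combined objective (it is not the `neural_compressor` code path that `textual_inversion.py` actually configures): the frozen FP32 UNet serves as the teacher, its quantized copy is the student, and an intermediate-layer MSE term is added to the usual denoising loss during quantization-aware training.

```python
# Conceptual sketch of "distillation for quantization" (assumed structure, not the library API):
# the task loss uses the quantized student's prediction, and an intermediate-layer
# knowledge-distillation term pulls the student's features toward the FP32 teacher's.
import torch
import torch.nn.functional as F


def distill_for_quant_loss(student_pred, target, student_feats, teacher_feats, distill_weight=1.0):
    # Standard denoising (task) loss on the quantized student.
    task_loss = F.mse_loss(student_pred.float(), target.float())

    # Intermediate-layer distillation: match selected hidden features of the student
    # to the detached features of the frozen FP32 teacher.
    kd_loss = sum(
        F.mse_loss(s.float(), t.detach().float()) for s, t in zip(student_feats, teacher_feats)
    ) / max(len(student_feats), 1)

    return task_loss + distill_weight * kd_loss
```

Minimizing this combined loss while fake-quantization observers are attached to the student is what lets the INT8 UNet track both the denoising target and the teacher's feature space.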

+ diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt b/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt new file mode 100644 index 000000000000..cbd4c957be44 --- /dev/null +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/requirements.txt @@ -0,0 +1,7 @@ +accelerate +torchvision +transformers>=4.25.0 +ftfy +tensorboard +modelcards +neural-compressor \ No newline at end of file diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py b/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py new file mode 100644 index 000000000000..a99d727712eb --- /dev/null +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/text2images.py @@ -0,0 +1,112 @@ +import argparse +import math +import os + +import torch +from neural_compressor.utils.pytorch import load +from PIL import Image +from transformers import CLIPTextModel, CLIPTokenizer + +from diffusers import AutoencoderKL, StableDiffusionPipeline, UNet2DConditionModel + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "-c", + "--caption", + type=str, + default="robotic cat with wings", + help="Text used to generate images.", + ) + parser.add_argument( + "-n", + "--images_num", + type=int, + default=4, + help="How much images to generate.", + ) + parser.add_argument( + "-s", + "--seed", + type=int, + default=42, + help="Seed for random process.", + ) + parser.add_argument( + "-ci", + "--cuda_id", + type=int, + default=0, + help="cuda_id.", + ) + args = parser.parse_args() + return args + + +def image_grid(imgs, rows, cols): + if not len(imgs) == rows * cols: + raise ValueError("The specified number of rows and columns are not correct.") + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + grid_w, grid_h = grid.size + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + +def generate_images( + pipeline, + prompt="robotic cat with wings", + guidance_scale=7.5, + num_inference_steps=50, + num_images_per_prompt=1, + seed=42, +): + generator = torch.Generator(pipeline.device).manual_seed(seed) + images = pipeline( + prompt, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator=generator, + num_images_per_prompt=num_images_per_prompt, + ).images + _rows = int(math.sqrt(num_images_per_prompt)) + grid = image_grid(images, rows=_rows, cols=num_images_per_prompt // _rows) + return grid, images + + +args = parse_args() +# Load models and create wrapper for stable diffusion +tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer") +text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder") +vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") +unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet") + +pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, text_encoder=text_encoder, vae=vae, unet=unet, tokenizer=tokenizer +) +pipeline.safety_checker = lambda images, clip_input: (images, False) +if os.path.exists(os.path.join(args.pretrained_model_name_or_path, "best_model.pt")): + 
unet = load(args.pretrained_model_name_or_path, model=unet) + unet.eval() + setattr(pipeline, "unet", unet) +else: + unet = unet.to(torch.device("cuda", args.cuda_id)) +pipeline = pipeline.to(unet.device) +grid, images = generate_images(pipeline, prompt=args.caption, num_images_per_prompt=args.images_num, seed=args.seed) +grid.save(os.path.join(args.pretrained_model_name_or_path, "{}.png".format("_".join(args.caption.split())))) +dirname = os.path.join(args.pretrained_model_name_or_path, "_".join(args.caption.split())) +os.makedirs(dirname, exist_ok=True) +for idx, image in enumerate(images): + image.save(os.path.join(dirname, "{}.png".format(idx + 1))) diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py b/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py new file mode 100644 index 000000000000..7afb6c67ef8e --- /dev/null +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py @@ -0,0 +1,1018 @@ +import argparse +import itertools +import math +import os +import random +from pathlib import Path +from typing import Iterable, Optional + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from accelerate import Accelerator +from accelerate.utils import set_seed +from huggingface_hub import HfFolder, Repository, whoami +from neural_compressor.utils import logger +from packaging import version +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer + +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from diffusers.optimization import get_scheduler + + +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): + PIL_INTERPOLATION = { + "linear": PIL.Image.Resampling.BILINEAR, + "bilinear": PIL.Image.Resampling.BILINEAR, + "bicubic": PIL.Image.Resampling.BICUBIC, + "lanczos": PIL.Image.Resampling.LANCZOS, + "nearest": PIL.Image.Resampling.NEAREST, + } +else: + PIL_INTERPOLATION = { + "linear": PIL.Image.LINEAR, + "bilinear": PIL.Image.BILINEAR, + "bicubic": PIL.Image.BICUBIC, + "lanczos": PIL.Image.LANCZOS, + "nearest": PIL.Image.NEAREST, + } +# ------------------------------------------------------------------------------ + + +def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path): + logger.info("Saving embeddings") + learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_id] + learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()} + torch.save(learned_embeds_dict, save_path) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Example of distillation for quantization on Textual Inversion.") + parser.add_argument( + "--save_steps", + type=int, + default=500, + help="Save learned_embeds.bin every X updates steps.", + ) + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) 
+ parser.add_argument( + "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data." + ) + parser.add_argument( + "--placeholder_token", + type=str, + default=None, + required=True, + help="A token to use as a placeholder for the concept.", + ) + parser.add_argument( + "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word." + ) + parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'") + parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.") + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument( + "--max_train_steps", + type=int, + default=5000, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default="no", + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose" + "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10." + "and an Nvidia Ampere GPU." + ), + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--do_quantization", action="store_true", help="Whether or not to do quantization.") + parser.add_argument("--do_distillation", action="store_true", help="Whether or not to do distillation.") + parser.add_argument( + "--verify_loading", action="store_true", help="Whether or not to verify the loading of the quantized model." 
+ ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.train_data_dir is None: + raise ValueError("You must specify a train data directory.") + + return args + + +imagenet_templates_small = [ + "a photo of a {}", + "a rendering of a {}", + "a cropped photo of the {}", + "the photo of a {}", + "a photo of a clean {}", + "a photo of a dirty {}", + "a dark photo of the {}", + "a photo of my {}", + "a photo of the cool {}", + "a close-up photo of a {}", + "a bright photo of the {}", + "a cropped photo of a {}", + "a photo of the {}", + "a good photo of the {}", + "a photo of one {}", + "a close-up photo of the {}", + "a rendition of the {}", + "a photo of the clean {}", + "a rendition of a {}", + "a photo of a nice {}", + "a good photo of a {}", + "a photo of the nice {}", + "a photo of the small {}", + "a photo of the weird {}", + "a photo of the large {}", + "a photo of a cool {}", + "a photo of a small {}", +] + +imagenet_style_templates_small = [ + "a painting in the style of {}", + "a rendering in the style of {}", + "a cropped painting in the style of {}", + "the painting in the style of {}", + "a clean painting in the style of {}", + "a dirty painting in the style of {}", + "a dark painting in the style of {}", + "a picture in the style of {}", + "a cool painting in the style of {}", + "a close-up painting in the style of {}", + "a bright painting in the style of {}", + "a cropped painting in the style of {}", + "a good painting in the style of {}", + "a close-up painting in the style of {}", + "a rendition in the style of {}", + "a nice painting in the style of {}", + "a small painting in the style of {}", + "a weird painting in the style of {}", + "a large painting in the style of {}", +] + + +# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14 +class EMAModel: + """ + Exponential Moving Average of models weights + """ + + def __init__(self, parameters: Iterable[torch.nn.Parameter], decay=0.9999): + parameters = list(parameters) + self.shadow_params = [p.clone().detach() for p in parameters] + + self.decay = decay + self.optimization_step = 0 + + def get_decay(self, optimization_step): + """ + Compute the decay factor for the exponential moving average. + """ + value = (1 + optimization_step) / (10 + optimization_step) + return 1 - min(self.decay, value) + + @torch.no_grad() + def step(self, parameters): + parameters = list(parameters) + + self.optimization_step += 1 + self.decay = self.get_decay(self.optimization_step) + + for s_param, param in zip(self.shadow_params, parameters): + if param.requires_grad: + tmp = self.decay * (s_param - param) + s_param.sub_(tmp) + else: + s_param.copy_(param) + + torch.cuda.empty_cache() + + def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None: + """ + Copy current averaged parameters into given collection of parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored moving averages. If `None`, the + parameters with which this `ExponentialMovingAverage` was + initialized will be used. 
+ """ + parameters = list(parameters) + for s_param, param in zip(self.shadow_params, parameters): + param.data.copy_(s_param.data) + + def to(self, device=None, dtype=None) -> None: + r"""Move internal buffers of the ExponentialMovingAverage to `device`. + Args: + device: like `device` argument to `torch.Tensor.to` + """ + # .to() on the tensors handles None correctly + self.shadow_params = [ + p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device) + for p in self.shadow_params + ] + + +class TextualInversionDataset(Dataset): + def __init__( + self, + data_root, + tokenizer, + learnable_property="object", # [object, style] + size=512, + repeats=100, + interpolation="bicubic", + flip_p=0.5, + set="train", + placeholder_token="*", + center_crop=False, + ): + self.data_root = data_root + self.tokenizer = tokenizer + self.learnable_property = learnable_property + self.size = size + self.placeholder_token = placeholder_token + self.center_crop = center_crop + self.flip_p = flip_p + + self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)] + + self.num_images = len(self.image_paths) + self._length = self.num_images + + if set == "train": + self._length = self.num_images * repeats + + self.interpolation = { + "linear": PIL_INTERPOLATION["linear"], + "bilinear": PIL_INTERPOLATION["bilinear"], + "bicubic": PIL_INTERPOLATION["bicubic"], + "lanczos": PIL_INTERPOLATION["lanczos"], + }[interpolation] + + self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small + self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p) + + def __len__(self): + return self._length + + def __getitem__(self, i): + example = {} + image = Image.open(self.image_paths[i % self.num_images]) + + if not image.mode == "RGB": + image = image.convert("RGB") + + placeholder_string = self.placeholder_token + text = random.choice(self.templates).format(placeholder_string) + + example["input_ids"] = self.tokenizer( + text, + padding="max_length", + truncation=True, + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids[0] + + # default to score-sde preprocessing + img = np.array(image).astype(np.uint8) + + if self.center_crop: + crop = min(img.shape[0], img.shape[1]) + ( + h, + w, + ) = ( + img.shape[0], + img.shape[1], + ) + img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2] + + image = Image.fromarray(img) + image = image.resize((self.size, self.size), resample=self.interpolation) + + image = self.flip_transform(image) + image = np.array(image).astype(np.uint8) + image = (image / 127.5 - 1.0).astype(np.float32) + + example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1) + return example + + +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): + if token is None: + token = HfFolder.get_token() + if organization is None: + username = whoami(token)["name"] + return f"{username}/{model_id}" + else: + return f"{organization}/{model_id}" + + +def freeze_params(params): + for param in params: + param.requires_grad = False + + +def image_grid(imgs, rows, cols): + if not len(imgs) == rows * cols: + raise ValueError("The specified number of rows and columns are not correct.") + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + grid_w, grid_h = grid.size + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + 
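+# Note: `image_grid` above requires `rows * cols == len(imgs)`. `generate_images` below derives
+# `rows` from `int(math.sqrt(num_images_per_prompt))`, so it only works for image counts that
+# `int(sqrt(n))` divides evenly (e.g. 1, 2, 4, 8, 9, 16); other counts raise the ValueError above.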
+def generate_images(pipeline, prompt="", guidance_scale=7.5, num_inference_steps=50, num_images_per_prompt=1, seed=42): + generator = torch.Generator(pipeline.device).manual_seed(seed) + images = pipeline( + prompt, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator=generator, + num_images_per_prompt=num_images_per_prompt, + ).images + _rows = int(math.sqrt(num_images_per_prompt)) + grid = image_grid(images, rows=_rows, cols=num_images_per_prompt // _rows) + return grid + + +def main(): + args = parse_args() + logging_dir = os.path.join(args.output_dir, args.logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with="tensorboard", + logging_dir=logging_dir, + ) + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer and add the placeholder token as a additional special token + if args.tokenizer_name: + tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name) + elif args.pretrained_model_name_or_path: + tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer") + + # Load models and create wrapper for stable diffusion + noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = CLIPTextModel.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="text_encoder", + revision=args.revision, + ) + vae = AutoencoderKL.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="vae", + revision=args.revision, + ) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="unet", + revision=args.revision, + ) + + train_unet = False + # Freeze vae and unet + freeze_params(vae.parameters()) + if not args.do_quantization and not args.do_distillation: + # Add the placeholder token in tokenizer + num_added_tokens = tokenizer.add_tokens(args.placeholder_token) + if num_added_tokens == 0: + raise ValueError( + f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different" + " `placeholder_token` that is not already in the tokenizer." 
+ ) + + # Convert the initializer_token, placeholder_token to ids + token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False) + # Check if initializer_token is a single token or a sequence of tokens + if len(token_ids) > 1: + raise ValueError("The initializer token must be a single token.") + + initializer_token_id = token_ids[0] + placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) + # Resize the token embeddings as we are adding new special tokens to the tokenizer + text_encoder.resize_token_embeddings(len(tokenizer)) + + # Initialise the newly added placeholder token with the embeddings of the initializer token + token_embeds = text_encoder.get_input_embeddings().weight.data + token_embeds[placeholder_token_id] = token_embeds[initializer_token_id] + + freeze_params(unet.parameters()) + # Freeze all parameters except for the token embeddings in text encoder + params_to_freeze = itertools.chain( + text_encoder.text_model.encoder.parameters(), + text_encoder.text_model.final_layer_norm.parameters(), + text_encoder.text_model.embeddings.position_embedding.parameters(), + ) + freeze_params(params_to_freeze) + else: + train_unet = True + freeze_params(text_encoder.parameters()) + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Initialize the optimizer + optimizer = torch.optim.AdamW( + # only optimize the unet or embeddings of text_encoder + unet.parameters() if train_unet else text_encoder.get_input_embeddings().parameters(), + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + train_dataset = TextualInversionDataset( + data_root=args.train_data_dir, + tokenizer=tokenizer, + size=args.resolution, + placeholder_token=args.placeholder_token, + repeats=args.repeats, + learnable_property=args.learnable_property, + center_crop=args.center_crop, + set="train", + ) + train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + if not train_unet: + text_encoder = accelerator.prepare(text_encoder) + unet.to(accelerator.device) + unet.eval() + else: + unet = accelerator.prepare(unet) + text_encoder.to(accelerator.device) + text_encoder.eval() + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + + # Move vae to device + vae.to(accelerator.device) + + # Keep vae in eval model as we don't train these + vae.eval() + + compression_manager = None + + def train_func(model): + if train_unet: + unet_ = model + text_encoder_ = text_encoder + else: + unet_ = unet + text_encoder_ = model + # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
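+        # (`accelerator.prepare` shards the dataloader across processes, so its length can differ
+        # from the value computed before preparation.)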
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("textual_inversion", config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + global_step = 0 + + if train_unet and args.use_ema: + ema_unet = EMAModel(unet_.parameters()) + + for epoch in range(args.num_train_epochs): + model.train() + train_loss = 0.0 + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(model): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"]).latent_dist.sample().detach() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn(latents.shape).to(latents.device) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ).long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder_(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet_(noisy_latents, timesteps, encoder_hidden_states).sample + + loss = F.mse_loss(model_pred, noise, reduction="none").mean([1, 2, 3]).mean() + if train_unet and compression_manager: + unet_inputs = { + "sample": noisy_latents, + "timestep": timesteps, + "encoder_hidden_states": encoder_hidden_states, + } + loss = compression_manager.callbacks.on_after_compute_loss(unet_inputs, model_pred, loss) + + # Gather the losses across all processes for logging (if we use distributed training). 
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + train_loss += avg_loss.item() / args.gradient_accumulation_steps + + # Backpropagate + accelerator.backward(loss) + + if train_unet: + if accelerator.sync_gradients: + accelerator.clip_grad_norm_(unet_.parameters(), args.max_grad_norm) + else: + # Zero out the gradients for all token embeddings except the newly added + # embeddings for the concept, as we only want to optimize the concept embeddings + if accelerator.num_processes > 1: + grads = text_encoder_.module.get_input_embeddings().weight.grad + else: + grads = text_encoder_.get_input_embeddings().weight.grad + # Get the index for tokens that we want to zero the grads for + index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id + grads.data[index_grads_to_zero, :] = grads.data[index_grads_to_zero, :].fill_(0) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + if train_unet and args.use_ema: + ema_unet.step(unet_.parameters()) + progress_bar.update(1) + global_step += 1 + accelerator.log({"train_loss": train_loss}, step=global_step) + train_loss = 0.0 + if not train_unet and global_step % args.save_steps == 0: + save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin") + save_progress(text_encoder_, placeholder_token_id, accelerator, args, save_path) + + logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if global_step >= args.max_train_steps: + break + accelerator.wait_for_everyone() + + if train_unet and args.use_ema: + ema_unet.copy_to(unet_.parameters()) + + if not train_unet: + return text_encoder_ + + if not train_unet: + text_encoder = train_func(text_encoder) + else: + import copy + + model = copy.deepcopy(unet) + confs = [] + if args.do_quantization: + from neural_compressor import QuantizationAwareTrainingConfig + + q_conf = QuantizationAwareTrainingConfig() + confs.append(q_conf) + + if args.do_distillation: + teacher_model = copy.deepcopy(model) + + def attention_fetcher(x): + return x.sample + + layer_mappings = [ + [ + [ + "conv_in", + ] + ], + [ + [ + "time_embedding", + ] + ], + [["down_blocks.0.attentions.0", attention_fetcher]], + [["down_blocks.0.attentions.1", attention_fetcher]], + [ + [ + "down_blocks.0.resnets.0", + ] + ], + [ + [ + "down_blocks.0.resnets.1", + ] + ], + [ + [ + "down_blocks.0.downsamplers.0", + ] + ], + [["down_blocks.1.attentions.0", attention_fetcher]], + [["down_blocks.1.attentions.1", attention_fetcher]], + [ + [ + "down_blocks.1.resnets.0", + ] + ], + [ + [ + "down_blocks.1.resnets.1", + ] + ], + [ + [ + "down_blocks.1.downsamplers.0", + ] + ], + [["down_blocks.2.attentions.0", attention_fetcher]], + [["down_blocks.2.attentions.1", attention_fetcher]], + [ + [ + "down_blocks.2.resnets.0", + ] + ], + [ + [ + "down_blocks.2.resnets.1", + ] + ], + [ + [ + "down_blocks.2.downsamplers.0", + ] + ], + [ + [ + "down_blocks.3.resnets.0", + ] + ], + [ + [ + "down_blocks.3.resnets.1", + ] + ], + [ + [ + "up_blocks.0.resnets.0", + ] + ], + [ + [ + "up_blocks.0.resnets.1", + ] + ], + [ + [ + "up_blocks.0.resnets.2", + ] + ], + [ + [ + "up_blocks.0.upsamplers.0", + ] + ], + [["up_blocks.1.attentions.0", attention_fetcher]], + [["up_blocks.1.attentions.1", attention_fetcher]], + [["up_blocks.1.attentions.2", attention_fetcher]], + [ + [ + 
"up_blocks.1.resnets.0", + ] + ], + [ + [ + "up_blocks.1.resnets.1", + ] + ], + [ + [ + "up_blocks.1.resnets.2", + ] + ], + [ + [ + "up_blocks.1.upsamplers.0", + ] + ], + [["up_blocks.2.attentions.0", attention_fetcher]], + [["up_blocks.2.attentions.1", attention_fetcher]], + [["up_blocks.2.attentions.2", attention_fetcher]], + [ + [ + "up_blocks.2.resnets.0", + ] + ], + [ + [ + "up_blocks.2.resnets.1", + ] + ], + [ + [ + "up_blocks.2.resnets.2", + ] + ], + [ + [ + "up_blocks.2.upsamplers.0", + ] + ], + [["up_blocks.3.attentions.0", attention_fetcher]], + [["up_blocks.3.attentions.1", attention_fetcher]], + [["up_blocks.3.attentions.2", attention_fetcher]], + [ + [ + "up_blocks.3.resnets.0", + ] + ], + [ + [ + "up_blocks.3.resnets.1", + ] + ], + [ + [ + "up_blocks.3.resnets.2", + ] + ], + [["mid_block.attentions.0", attention_fetcher]], + [ + [ + "mid_block.resnets.0", + ] + ], + [ + [ + "mid_block.resnets.1", + ] + ], + [ + [ + "conv_out", + ] + ], + ] + layer_names = [layer_mapping[0][0] for layer_mapping in layer_mappings] + if not set(layer_names).issubset([n[0] for n in model.named_modules()]): + raise ValueError( + "Provided model is not compatible with the default layer_mappings, " + 'please use the model fine-tuned from "CompVis/stable-diffusion-v1-4", ' + "or modify the layer_mappings variable to fit your model." + f"\nDefault layer_mappings are as such:\n{layer_mappings}" + ) + from neural_compressor.config import DistillationConfig, IntermediateLayersKnowledgeDistillationLossConfig + + distillation_criterion = IntermediateLayersKnowledgeDistillationLossConfig( + layer_mappings=layer_mappings, + loss_types=["MSE"] * len(layer_mappings), + loss_weights=[1.0 / len(layer_mappings)] * len(layer_mappings), + add_origin_loss=True, + ) + d_conf = DistillationConfig(teacher_model=teacher_model, criterion=distillation_criterion) + confs.append(d_conf) + + from neural_compressor.training import prepare_compression + + compression_manager = prepare_compression(model, confs) + compression_manager.callbacks.on_train_begin() + model = compression_manager.model + train_func(model) + compression_manager.callbacks.on_train_end() + + # Save the resulting model and its corresponding configuration in the given directory + model.save(args.output_dir) + + logger.info(f"Optimized model saved to: {args.output_dir}.") + + # change to framework model for further use + model = model.model + + # Create the pipeline using using the trained modules and save it. 
+ templates = imagenet_style_templates_small if args.learnable_property == "style" else imagenet_templates_small + prompt = templates[0].format(args.placeholder_token) + if accelerator.is_main_process: + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + text_encoder=accelerator.unwrap_model(text_encoder), + vae=vae, + unet=accelerator.unwrap_model(unet), + tokenizer=tokenizer, + ) + pipeline.save_pretrained(args.output_dir) + pipeline = pipeline.to(unet.device) + baseline_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed) + baseline_model_images.save( + os.path.join(args.output_dir, "{}_baseline_model.png".format("_".join(prompt.split()))) + ) + + if not train_unet: + # Also save the newly trained embeddings + save_path = os.path.join(args.output_dir, "learned_embeds.bin") + save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path) + else: + setattr(pipeline, "unet", accelerator.unwrap_model(model)) + if args.do_quantization: + pipeline = pipeline.to(torch.device("cpu")) + + optimized_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed) + optimized_model_images.save( + os.path.join(args.output_dir, "{}_optimized_model.png".format("_".join(prompt.split()))) + ) + + if args.push_to_hub: + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) + + accelerator.end_training() + + if args.do_quantization and args.verify_loading: + # Load the model obtained after Intel Neural Compressor quantization + from neural_compressor.utils.pytorch import load + + loaded_model = load(args.output_dir, model=unet) + loaded_model.eval() + + setattr(pipeline, "unet", loaded_model) + if args.do_quantization: + pipeline = pipeline.to(torch.device("cpu")) + + loaded_model_images = generate_images(pipeline, prompt=prompt, seed=args.seed) + if loaded_model_images != optimized_model_images: + logger.info("The quantized model was not successfully loaded.") + else: + logger.info("The quantized model was successfully loaded.") + + +if __name__ == "__main__": + main() From 17470057d202e2f6841398ccc627f3697b01c51b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 20 Apr 2023 13:09:20 +0200 Subject: [PATCH 37/71] make style --- src/diffusers/loaders.py | 2 +- tests/models/test_lora_layers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 82c1ac61ca9e..b4c443fd303b 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -848,7 +848,7 @@ def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): """ # Loop over the original attention modules. for name, _ in self.text_encoder.named_modules(): - if any([x in name for x in TEXT_ENCODER_TARGET_MODULES]): + if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): # Retrieve the module and its corresponding LoRA processor. module = self.text_encoder.get_submodule(name) # Construct a new function that performs the LoRA merging. 
We will monkey patch diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 6f75902d388f..6f1e85e15558 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -46,7 +46,7 @@ def create_unet_lora_layers(unet: nn.Module): def create_text_encoder_lora_layers(text_encoder: nn.Module): text_lora_attn_procs = {} for name, module in text_encoder.named_modules(): - if any([x in name for x in TEXT_ENCODER_TARGET_MODULES]): + if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): text_lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=module.out_features, cross_attention_dim=None) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) return text_encoder_lora_layers From 7b0ba4820a7546520da4b099fc6c523d5b6d3383 Mon Sep 17 00:00:00 2001 From: clarencechen Date: Thu, 20 Apr 2023 04:13:47 -0700 Subject: [PATCH 38/71] Update Noise Autocorrelation Loss Function for Pix2PixZero Pipeline (#2942) * Update Pix2PixZero Auto-correlation Loss * Add fast inversion tests * Clarify purpose and mark as deprecated Fix inversion prompt broadcasting * Register modules set to `None` in config for `test_save_load_optional_components` * Update new tests to coordinate with #2953 --- .../pipeline_stable_diffusion_pix2pix_zero.py | 64 +++++++------ .../test_stable_diffusion_pix2pix_zero.py | 91 ++++++++++++++++++- 2 files changed, 123 insertions(+), 32 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 0239c8128171..6444ec7c8506 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -36,6 +36,7 @@ from ...utils import ( PIL_INTERPOLATION, BaseOutput, + deprecate, is_accelerate_available, is_accelerate_version, logging, @@ -721,23 +722,31 @@ def prepare_image_latents(self, image, batch_size, dtype, device, generator=None ) if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) + latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)] + latents = torch.cat(latents, dim=0) else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) - - init_latents = self.vae.config.scaling_factor * init_latents - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) + latents = self.vae.encode(image).latent_dist.sample(generator) + + latents = self.vae.config.scaling_factor * latents + + if batch_size != latents.shape[0]: + if batch_size % latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_latents_per_image = batch_size // latents.shape[0] + latents = torch.cat([latents] * additional_latents_per_image, dim=0) + else: + raise ValueError( + f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts." + ) else: - init_latents = torch.cat([init_latents], dim=0) - - latents = init_latents + latents = torch.cat([latents], dim=0) return latents @@ -759,23 +768,18 @@ def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep ) def auto_corr_loss(self, hidden_states, generator=None): - batch_size, channel, height, width = hidden_states.shape - if batch_size > 1: - raise ValueError("Only batch_size 1 is supported for now") - - hidden_states = hidden_states.squeeze(0) - # hidden_states must be shape [C,H,W] now reg_loss = 0.0 for i in range(hidden_states.shape[0]): - noise = hidden_states[i][None, None, :, :] - while True: - roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item() - reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2 - reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2 - - if noise.shape[2] <= 8: - break - noise = F.avg_pool2d(noise, kernel_size=2) + for j in range(hidden_states.shape[1]): + noise = hidden_states[i : i + 1, j : j + 1, :, :] + while True: + roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item() + reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2 + reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2 + + if noise.shape[2] <= 8: + break + noise = F.avg_pool2d(noise, kernel_size=2) return reg_loss def kl_divergence(self, hidden_states): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 0809a91041ce..661926daaa3e 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -14,6 +14,8 @@ # limitations under the License. 
import gc +import random +import tempfile import unittest import numpy as np @@ -30,7 +32,7 @@ StableDiffusionPix2PixZeroPipeline, UNet2DConditionModel, ) -from diffusers.utils import load_numpy, slow, torch_device +from diffusers.utils import floats_tensor, load_numpy, slow, torch_device from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS @@ -69,6 +71,7 @@ def get_dummy_components(self): cross_attention_dim=32, ) scheduler = DDIMScheduler() + inverse_scheduler = DDIMInverseScheduler() torch.manual_seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -101,7 +104,7 @@ def get_dummy_components(self): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, - "inverse_scheduler": None, + "inverse_scheduler": inverse_scheduler, "caption_generator": None, "caption_processor": None, } @@ -122,6 +125,90 @@ def get_dummy_inputs(self, device, seed=0): } return inputs + def get_dummy_inversion_inputs(self, device, seed=0): + dummy_image = floats_tensor((2, 3, 32, 32), rng=random.Random(seed)).to(torch_device) + generator = torch.manual_seed(seed) + + inputs = { + "prompt": [ + "A painting of a squirrel eating a burger", + "A painting of a burger eating a squirrel", + ], + "image": dummy_image.cpu(), + "num_inference_steps": 2, + "guidance_scale": 6.0, + "generator": generator, + "output_type": "numpy", + } + return inputs + + def test_save_load_optional_components(self): + if not hasattr(self.pipeline_class, "_optional_components"): + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + # set all optional components to None and update pipeline config accordingly + for optional_component in pipe._optional_components: + setattr(pipe, optional_component, None) + pipe.register_modules(**{optional_component: None for optional_component in pipe._optional_components}) + + inputs = self.get_dummy_inputs(torch_device) + output = pipe(**inputs)[0] + + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) + pipe_loaded.to(torch_device) + pipe_loaded.set_progress_bar_config(disable=None) + + for optional_component in pipe._optional_components: + self.assertTrue( + getattr(pipe_loaded, optional_component) is None, + f"`{optional_component}` did not stay set to None after loading.", + ) + + inputs = self.get_dummy_inputs(torch_device) + output_loaded = pipe_loaded(**inputs)[0] + + max_diff = np.abs(output - output_loaded).max() + self.assertLess(max_diff, 1e-4) + + def test_stable_diffusion_pix2pix_zero_inversion(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inversion_inputs(device) + inputs["image"] = inputs["image"][:1] + inputs["prompt"] = inputs["prompt"][:1] + image = sd_pipe.invert(**inputs).images + image_slice = image[0, -3:, -3:, -1] + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array([0.4833, 0.4696, 0.5574, 0.5194, 0.5248, 0.5638, 0.5040, 0.5423, 0.5072]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def 
test_stable_diffusion_pix2pix_zero_inversion_batch(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inversion_inputs(device) + image = sd_pipe.invert(**inputs).images + image_slice = image[1, -3:, -3:, -1] + assert image.shape == (2, 32, 32, 3) + expected_slice = np.array([0.6672, 0.5203, 0.4908, 0.4376, 0.4517, 0.5544, 0.4605, 0.4826, 0.5007]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + def test_stable_diffusion_pix2pix_zero_default_case(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() From 3045fb276352681f6b9075956e599dd8ef571872 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 20 Apr 2023 17:25:17 +0530 Subject: [PATCH 39/71] [DreamBooth] add text encoder LoRA support in the DreamBooth training script (#3130) * add: LoRA text encoder support for DreamBooth example. * fix initialization. * fix: modification call. * add: entry in the readme. * use dog dataset from hub. * fix: params to clip. * add entry to the LoRA doc. * add: tests for lora. * remove unnecessary list comprehension./ --- docs/source/en/training/dreambooth.mdx | 13 ++- docs/source/en/training/lora.mdx | 9 +- examples/dreambooth/README.md | 43 +++++--- examples/dreambooth/train_dreambooth_lora.py | 101 +++++++++++++++---- examples/test_examples.py | 63 ++++++++++++ 5 files changed, 197 insertions(+), 32 deletions(-) diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx index 908355e496dc..88ded0e009dc 100644 --- a/docs/source/en/training/dreambooth.mdx +++ b/docs/source/en/training/dreambooth.mdx @@ -60,7 +60,18 @@ DreamBooth finetuning is very sensitive to hyperparameters and easy to overfit. -Let's try DreamBooth with a [few images of a dog](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ); download and save them to a directory and then set the `INSTANCE_DIR` environment variable to that path: +Let's try DreamBooth with a +[few images of a dog](https://huggingface.co/datasets/diffusers/dog-example); +download and save them to a directory and then set the `INSTANCE_DIR` environment variable to that path: + +```python +local_dir = "./path_to_training_images" +snapshot_download( + "diffusers/dog-example", + local_dir=local_dir, repo_type="dataset", + ignore_patterns=".gitattributes", +) +``` ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx index 1c72fbbc8d58..ac2311df9f1e 100644 --- a/docs/source/en/training/lora.mdx +++ b/docs/source/en/training/lora.mdx @@ -16,7 +16,9 @@ specific language governing permissions and limitations under the License. -Currently, LoRA is only supported for the attention layers of the [`UNet2DConditionalModel`]. +Currently, LoRA is only supported for the attention layers of the [`UNet2DConditionalModel`]. We also +support LoRA fine-tuning of the text encoder for DreamBooth in a limited capacity. For more details on how we support +LoRA fine-tuning of the text encoder, refer to the discussion on [this PR](https://github.com/huggingface/diffusers/pull/2918). 
@@ -175,6 +177,11 @@ accelerate launch train_dreambooth_lora.py \ --push_to_hub ``` +It's also possible to additionally fine-tune the text encoder with LoRA. This, in most cases, leads +to better results with a slight increase in the compute. To allow fine-tuning the text encoder with LoRA, +specify the `--train_text_encoder` while launching the `train_dreambooth_lora.py` script. + + ### Inference[[dreambooth-inference]] Now you can use the model for inference by loading the base model in the [`StableDiffusionPipeline`]: diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md index d53f17114404..8447c7560720 100644 --- a/examples/dreambooth/README.md +++ b/examples/dreambooth/README.md @@ -45,15 +45,28 @@ write_basic_config() ### Dog toy example -Now let's get our dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. This will be our training data. +Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example. -And launch the training using +Let's first download it locally: + +```python +from huggingface_hub import snapshot_download + +local_dir = "./dog" +snapshot_download( + "diffusers/dog-example", + local_dir=local_dir, repo_type="dataset", + ignore_patterns=".gitattributes", +) +``` + +And launch the training using: **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export OUTPUT_DIR="path-to-save-model" accelerate launch train_dreambooth.py \ @@ -77,7 +90,7 @@ According to the paper, it's recommended to generate `num_epochs * num_samples` ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -108,7 +121,7 @@ To install `bitandbytes` please refer to this [readme](https://github.com/TimDet ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -141,7 +154,7 @@ It is possible to run dreambooth on a 12GB GPU by using the following optimizati ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -185,7 +198,7 @@ does not seem to be compatible with DeepSpeed at the moment. ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -217,7 +230,7 @@ ___Note: Training text encoder requires more memory, with this option the traini ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -300,7 +313,7 @@ Now, you can launch the training. 
Here we will use [Stable Diffusion 1-5](https: ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export OUTPUT_DIR="path-to-save-model" ``` @@ -342,6 +355,12 @@ The final LoRA embedding weights have been uploaded to [patrickvonplaten/lora_dr The training results are summarized [here](https://api.wandb.ai/report/patrickvonplaten/xm6cd5q5). You can use the `Step` slider to see how the model learned the features of our subject while the model trained. +Optionally, we can also train additional LoRA layers for the text encoder. Specify the `train_text_encoder` argument above for that. If you're interested to know more about how we +enable this support, check out this [PR](https://github.com/huggingface/diffusers/pull/2918). + +With the default hyperparameters from the above, the training seems to go in a positive direction. Check out [this panel](https://wandb.ai/sayakpaul/dreambooth-lora/reports/test-23-04-17-17-00-13---Vmlldzo0MDkwNjMy). The trained LoRA layers are available [here](https://huggingface.co/sayakpaul/dreambooth). + + ### Inference After training, LoRA weights can be loaded very easily into the original pipeline. First, you need to @@ -386,7 +405,7 @@ pip install -U -r requirements_flax.txt ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export OUTPUT_DIR="path-to-save-model" python train_dreambooth_flax.py \ @@ -405,7 +424,7 @@ python train_dreambooth_flax.py \ ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -429,7 +448,7 @@ python train_dreambooth_flax.py \ ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index d360939c8c0c..1b75402c3550 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -15,6 +15,7 @@ import argparse import hashlib +import itertools import logging import math import os @@ -43,12 +44,13 @@ DDPMScheduler, DiffusionPipeline, DPMSolverMultistepScheduler, + StableDiffusionPipeline, UNet2DConditionModel, ) -from diffusers.loaders import AttnProcsLayers +from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin from diffusers.models.attention_processor import LoRAAttnProcessor from diffusers.optimization import get_scheduler -from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available @@ -58,7 +60,7 @@ logger = get_logger(__name__) -def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_folder=None): +def save_model_card(repo_id: str, images=None, base_model=str, train_text_encoder=False, prompt=str, repo_folder=None): img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) @@ -83,6 +85,8 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_ These are LoRA adaption weights for {base_model}. 
The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). You can find some example images in the following. \n {img_str} + +LoRA for the text encoder was enabled: {train_text_encoder}. """ with open(os.path.join(repo_folder, "README.md"), "w") as f: f.write(yaml + model_card) @@ -219,6 +223,11 @@ def parse_args(input_args=None): " cropped. The images will be resized to the resolution first before cropping." ), ) + parser.add_argument( + "--train_text_encoder", + action="store_true", + help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", + ) parser.add_argument( "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." ) @@ -547,7 +556,13 @@ def main(args): # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. - # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + # TODO (sayakpaul): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -691,7 +706,7 @@ def main(args): # => 32 layers # Set correct lora layers - lora_attn_procs = {} + unet_lora_attn_procs = {} for name in unet.attn_processors.keys(): cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): @@ -703,12 +718,33 @@ def main(args): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - - unet.set_attn_processor(lora_attn_procs) - lora_layers = AttnProcsLayers(unet.attn_processors) + unet_lora_attn_procs[name] = LoRAAttnProcessor( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim + ) - accelerator.register_for_checkpointing(lora_layers) + unet.set_attn_processor(unet_lora_attn_procs) + unet_lora_layers = AttnProcsLayers(unet.attn_processors) + accelerator.register_for_checkpointing(unet_lora_layers) + + # The text encoder comes from 🤗 transformers, so we cannot directly modify it. + # So, instead, we monkey-patch the forward calls of its attention-blocks. For this, + # we first load a dummy pipeline with the text encoder and then do the monkey-patching. 
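+    # `text_encoder_lora_layers` stays `None` unless `--train_text_encoder` is passed; the
+    # `LoraLoaderMixin.save_lora_weights` calls below accept `None` for the text-encoder layers.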
+ text_encoder_lora_layers = None + if args.train_text_encoder: + text_lora_attn_procs = {} + for name, module in text_encoder.named_modules(): + if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): + text_lora_attn_procs[name] = LoRAAttnProcessor( + hidden_size=module.out_features, cross_attention_dim=None + ) + text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) + temp_pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, text_encoder=text_encoder + ) + temp_pipeline._modify_text_encoder(text_lora_attn_procs) + text_encoder = temp_pipeline.text_encoder + accelerator.register_for_checkpointing(unet_lora_layers) + del temp_pipeline if args.scale_lr: args.learning_rate = ( @@ -739,8 +775,13 @@ def main(args): optimizer_class = torch.optim.AdamW # Optimizer creation + params_to_optimize = ( + itertools.chain(unet_lora_layers.parameters(), text_encoder_lora_layers.parameters()) + if args.train_text_encoder + else unet_lora_layers.parameters() + ) optimizer = optimizer_class( - lora_layers.parameters(), + params_to_optimize, lr=args.learning_rate, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, @@ -784,9 +825,14 @@ def main(args): ) # Prepare everything with our `accelerator`. - lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - lora_layers, optimizer, train_dataloader, lr_scheduler - ) + if args.train_text_encoder: + unet_lora_layers, text_encoder_lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet_lora_layers, text_encoder_lora_layers, optimizer, train_dataloader, lr_scheduler + ) + else: + unet_lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet_lora_layers, optimizer, train_dataloader, lr_scheduler + ) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) @@ -845,6 +891,8 @@ def main(args): for epoch in range(first_epoch, args.num_train_epochs): unet.train() + if args.train_text_encoder: + text_encoder.train() for step, batch in enumerate(train_dataloader): # Skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: @@ -900,7 +948,11 @@ def main(args): accelerator.backward(loss) if accelerator.sync_gradients: - params_to_clip = lora_layers.parameters() + params_to_clip = ( + itertools.chain(unet_lora_layers.parameters(), text_encoder_lora_layers.parameters()) + if args.train_text_encoder + else unet_lora_layers.parameters() + ) accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() lr_scheduler.step() @@ -914,7 +966,14 @@ def main(args): if global_step % args.checkpointing_steps == 0: if accelerator.is_main_process: save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - accelerator.save_state(save_path) + # We combine the text encoder and UNet LoRA parameters with a simple + # custom logic. `accelerator.save_state()` won't know that. So, + # use `LoraLoaderMixin.save_lora_weights()`. 
+ LoraLoaderMixin.save_lora_weights( + save_directory=save_path, + unet_lora_layers=unet_lora_layers, + text_encoder_lora_layers=text_encoder_lora_layers, + ) logger.info(f"Saved state to {save_path}") logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} @@ -970,7 +1029,12 @@ def main(args): accelerator.wait_for_everyone() if accelerator.is_main_process: unet = unet.to(torch.float32) - unet.save_attn_procs(args.output_dir) + text_encoder = text_encoder.to(torch.float32) + LoraLoaderMixin.save_lora_weights( + save_directory=args.output_dir, + unet_lora_layers=unet_lora_layers, + text_encoder_lora_layers=text_encoder_lora_layers, + ) # Final inference # Load previous pipeline @@ -981,7 +1045,7 @@ def main(args): pipeline = pipeline.to(accelerator.device) # load attention processors - pipeline.unet.load_attn_procs(args.output_dir) + pipeline.load_attn_procs(args.output_dir) # run inference if args.validation_prompt and args.num_validation_images > 0: @@ -1010,6 +1074,7 @@ def main(args): repo_id, images=images, base_model=args.pretrained_model_name_or_path, + train_text_encoder=args.train_text_encoder, prompt=args.instance_prompt, repo_folder=args.output_dir, ) diff --git a/examples/test_examples.py b/examples/test_examples.py index a77fa4c7da23..238dc49d729f 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -23,6 +23,7 @@ import unittest from typing import List +import torch from accelerate.utils import write_basic_config from diffusers import DiffusionPipeline, UNet2DConditionModel @@ -221,6 +222,68 @@ def test_dreambooth_checkpointing(self): self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4"))) self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6"))) + def test_dreambooth_lora(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth_lora.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir docs/source/en/imgs + --instance_prompt photo + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.bin"))) + + # make sure the state_dict has the correct naming in the parameters. + lora_state_dict = torch.load(os.path.join(tmpdir, "pytorch_lora_weights.bin")) + is_lora = all("lora" in k for k in lora_state_dict.keys()) + self.assertTrue(is_lora) + + # when not training the text encoder, all the parameters in the state dict should start + # with `"unet"` in their names. 
+ starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys()) + self.assertTrue(starts_with_unet) + + def test_dreambooth_lora_with_text_encoder(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth_lora.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir docs/source/en/imgs + --instance_prompt photo + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --train_text_encoder + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.bin"))) + + # the names of the keys of the state dict should either start with `unet` + # or `text_encoder`. + lora_state_dict = torch.load(os.path.join(tmpdir, "pytorch_lora_weights.bin")) + keys = lora_state_dict.keys() + is_correct_naming = all(k.startswith("unet") or k.startswith("text_encoder") for k in keys) + self.assertTrue(is_correct_naming) + def test_custom_diffusion(self): with tempfile.TemporaryDirectory() as tmpdir: test_args = f""" From 9bce375f77d8d4de88535c651c64aff057c33545 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 21 Apr 2023 18:24:43 +0200 Subject: [PATCH 40/71] Update Habana Gaudi documentation (#3169) * Update Habana Gaudi doc * Fix tables --- docs/source/en/optimization/habana.mdx | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/docs/source/en/optimization/habana.mdx b/docs/source/en/optimization/habana.mdx index a5f476b0cef2..7092c89352db 100644 --- a/docs/source/en/optimization/habana.mdx +++ b/docs/source/en/optimization/habana.mdx @@ -16,8 +16,8 @@ specific language governing permissions and limitations under the License. ## Requirements -- Optimum Habana 1.4 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it. -- SynapseAI 1.8. +- Optimum Habana 1.5 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it. +- SynapseAI 1.9. 
## Inference Pipeline @@ -64,7 +64,16 @@ For more information, check out Optimum Habana's [documentation](https://hugging Here are the latencies for Habana first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) Gaudi configuration (mixed precision bf16/fp32): +- [Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5) (512x512 resolution): + | | Latency (batch size = 1) | Throughput (batch size = 8) | | ---------------------- |:------------------------:|:---------------------------:| -| first-generation Gaudi | 4.29s | 0.283 images/s | -| Gaudi2 | 1.54s | 0.904 images/s | +| first-generation Gaudi | 4.22s | 0.29 images/s | +| Gaudi2 | 1.70s | 0.925 images/s | + +- [Stable Diffusion v2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (768x768 resolution): + +| | Latency (batch size = 1) | Throughput | +| ---------------------- |:------------------------:|:-------------------------------:| +| first-generation Gaudi | 23.3s | 0.045 images/s (batch size = 2) | +| Gaudi2 | 7.75s | 0.14 images/s (batch size = 5) | From 9c856118c72dca9cae194648492b4284a254386c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 21 Apr 2023 18:47:33 +0200 Subject: [PATCH 41/71] Add model offload to x4 upscaler (#3187) * Add model offload to x4 upscaler * fix --- .../pipeline_stable_diffusion_upscale.py | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index c0086b32d6fd..693208b18cdd 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -23,7 +23,7 @@ from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers -from ...utils import deprecate, is_accelerate_available, logging, randn_tensor +from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -129,10 +129,36 @@ def enable_sequential_cpu_offload(self, gpu_id=0): device = torch.device(f"cuda:{gpu_id}") - for cpu_offloaded_model in [self.unet, self.text_encoder]: + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: if cpu_offloaded_model is not None: cpu_offload(cpu_offloaded_model, device) + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
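+
+        A minimal usage sketch (the checkpoint name below is only illustrative; `accelerate>=0.17.0` must be
+        installed for this method to be available):
+
+        ```py
+        import torch
+        from diffusers import StableDiffusionUpscalePipeline
+
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(
+            "stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16
+        )
+        pipe.enable_model_cpu_offload()  # sub-models are moved to the GPU one at a time when their forward is called
+        ```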
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + if cpu_offloaded_model is not None: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + @property # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device def _execution_device(self): @@ -647,6 +673,10 @@ def __call__( self.vae.to(dtype=torch.float32) image = self.decode_latents(latents.float()) + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + # 11. Convert to PIL if output_type == "pil": image = self.numpy_to_pil(image) From 2f6351b0015a4cd610a054f973b4f75d65c83531 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 21 Apr 2023 10:38:34 -0700 Subject: [PATCH 42/71] [docs] Deterministic algorithms (#3172) deterministic algos --- .../en/using-diffusers/reproducibility.mdx | 47 +++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/docs/source/en/using-diffusers/reproducibility.mdx b/docs/source/en/using-diffusers/reproducibility.mdx index 35191c139289..5bef10bfe190 100644 --- a/docs/source/en/using-diffusers/reproducibility.mdx +++ b/docs/source/en/using-diffusers/reproducibility.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. Reproducibility is important for testing, replicating results, and can even be used to [improve image quality](reusing_seeds). However, the randomness in diffusion models is a desired property because it allows the pipeline to generate different images every time it is run. While you can't expect to get the exact same results across platforms, you can expect results to be reproducible across releases and platforms within a certain tolerance range. Even then, tolerance varies depending on the diffusion pipeline and checkpoint. -This is why it's important to understand how to control sources of randomness in diffusion models. +This is why it's important to understand how to control sources of randomness in diffusion models or use deterministic algorithms. @@ -24,7 +24,7 @@ This is why it's important to understand how to control sources of randomness in -## Inference +## Control randomness During inference, pipelines rely heavily on random sampling operations which include creating the Gaussian noise tensors to denoise and adding noise to the scheduling step. @@ -147,5 +147,46 @@ susceptible to precision error propagation. Don't expect similar results across different GPU hardware or PyTorch versions. In this case, you'll need to run exactly the same hardware and PyTorch version for full reproducibility. -## randn_tensor +### randn_tensor [[autodoc]] diffusers.utils.randn_tensor + +## Deterministic algorithms + +You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. 
However, you should be aware that deterministic algorithms may be slower than nondeterministic ones and you may observe a decrease in performance. But if reproducibility is important to you, then this is the way to go!
+
+Nondeterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [`CUBLAS_WORKSPACE_CONFIG`](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime.
+
+PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Lastly, pass `True` to [`torch.use_deterministic_algorithms`](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html) to enable deterministic algorithms.
+
+```py
+import os
+import torch
+
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+
+torch.backends.cudnn.benchmark = False
+torch.use_deterministic_algorithms(True)
+```
+
+Now when you run the same pipeline twice, you'll get identical results.
+
+```py
+import torch
+from diffusers import DDIMScheduler, StableDiffusionPipeline
+import numpy as np
+
+model_id = "runwayml/stable-diffusion-v1-5"
+pipe = StableDiffusionPipeline.from_pretrained(model_id).to("cuda")
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+g = torch.Generator(device="cuda")
+
+prompt = "A bear is playing a guitar on Times Square"
+
+g.manual_seed(0)
+result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
+
+g.manual_seed(0)
+result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
+
+print("L_inf dist = ", abs(result1 - result2).max())
+"L_inf dist = tensor(0., device='cuda:0')"
+```
\ No newline at end of file

From e573ae06e2bf5aa632d9d78ce3c4c1374741287d Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Fri, 21 Apr 2023 23:14:08 +0530
Subject: [PATCH 43/71] Update custom_diffusion.mdx to credit the author (#3163)

* Update custom_diffusion.mdx

* fix: unnecessary list comprehension.
---
 docs/source/en/training/custom_diffusion.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx
index 245d434adeda..08604f101ea2 100644
--- a/docs/source/en/training/custom_diffusion.mdx
+++ b/docs/source/en/training/custom_diffusion.mdx
@@ -15,6 +15,8 @@ specific language governing permissions and limitations under the License.
 
 [Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject.
 The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
 
+This training example was contributed by [Nupur Kumari](https://nupurkmr9.github.io/) (one of the authors of Custom Diffusion).
+ ## Running locally with PyTorch ### Installing the dependencies From 05d9baeacd531dc66680d974ec234940e0088d58 Mon Sep 17 00:00:00 2001 From: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Date: Fri, 21 Apr 2023 10:53:10 -0700 Subject: [PATCH 44/71] Fix TensorRT community pipeline device set function (#3157) pass silence_dtype_warnings as kwarg Signed-off-by: Asfiya Baig Co-authored-by: Patrick von Platen --- examples/community/stable_diffusion_tensorrt_txt2img.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/community/stable_diffusion_tensorrt_txt2img.py b/examples/community/stable_diffusion_tensorrt_txt2img.py index 7aef2bec743f..aa7b5c12313b 100644 --- a/examples/community/stable_diffusion_tensorrt_txt2img.py +++ b/examples/community/stable_diffusion_tensorrt_txt2img.py @@ -703,7 +703,7 @@ def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os ) def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False): - super().to(torch_device, silence_dtype_warnings) + super().to(torch_device, silence_dtype_warnings=silence_dtype_warnings) self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir) self.engine_dir = os.path.join(self.cached_folder, self.engine_dir) From bc0392a0cbac301474ef82eed5818d2030a4fc4c Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 21 Apr 2023 08:01:36 -1000 Subject: [PATCH 45/71] make `from_flax` work for controlnet (#3161) fix from_flax Co-authored-by: Patrick von Platen --- src/diffusers/models/modeling_pytorch_flax_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/diffusers/models/modeling_pytorch_flax_utils.py b/src/diffusers/models/modeling_pytorch_flax_utils.py index b368a74ca299..17b521b00145 100644 --- a/src/diffusers/models/modeling_pytorch_flax_utils.py +++ b/src/diffusers/models/modeling_pytorch_flax_utils.py @@ -110,6 +110,12 @@ def load_flax_weights_in_pytorch_model(pt_model, flax_state): .replace("_1", ".1") .replace("_2", ".2") .replace("_3", ".3") + .replace("_4", ".4") + .replace("_5", ".5") + .replace("_6", ".6") + .replace("_7", ".7") + .replace("_8", ".8") + .replace("_9", ".9") ) flax_key = ".".join(flax_key_tuple_array) From 391cfcd7d7e3df50ba30b3771c4347848ff0b2e1 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 21 Apr 2023 11:03:44 -0700 Subject: [PATCH 46/71] [docs] Clarify training args (#3146) * clarify training arg * apply feedback --- docs/source/en/training/controlnet.mdx | 1 + docs/source/en/training/dreambooth.mdx | 47 ++++++++++++--------- docs/source/en/training/instructpix2pix.mdx | 3 +- docs/source/en/training/lora.mdx | 8 +++- docs/source/en/training/text2image.mdx | 6 ++- docs/source/en/training/text_inversion.mdx | 23 +++++++--- 6 files changed, 58 insertions(+), 30 deletions(-) diff --git a/docs/source/en/training/controlnet.mdx b/docs/source/en/training/controlnet.mdx index 7a5454107b83..94e3d969b80a 100644 --- a/docs/source/en/training/controlnet.mdx +++ b/docs/source/en/training/controlnet.mdx @@ -74,6 +74,7 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png ``` +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the 
[`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. ```bash export MODEL_DIR="runwayml/stable-diffusion-v1-5" diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx index 88ded0e009dc..c5a5a047d114 100644 --- a/docs/source/en/training/dreambooth.mdx +++ b/docs/source/en/training/dreambooth.mdx @@ -50,6 +50,20 @@ from accelerate.utils import write_basic_config write_basic_config() ``` +Finally, download a [few images of a dog](https://huggingface.co/datasets/diffusers/dog-example) to DreamBooth with: + +```py +from huggingface_hub import snapshot_download + +local_dir = "./dog" +snapshot_download( + "diffusers/dog-example", + local_dir=local_dir, + repo_type="dataset", + ignore_patterns=".gitattributes", +) +``` + ## Finetuning @@ -60,22 +74,13 @@ DreamBooth finetuning is very sensitive to hyperparameters and easy to overfit. -Let's try DreamBooth with a -[few images of a dog](https://huggingface.co/datasets/diffusers/dog-example); -download and save them to a directory and then set the `INSTANCE_DIR` environment variable to that path: +Set the `INSTANCE_DIR` environment variable to the path of the directory containing the dog images. -```python -local_dir = "./path_to_training_images" -snapshot_download( - "diffusers/dog-example", - local_dir=local_dir, repo_type="dataset", - ignore_patterns=".gitattributes", -) -``` +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export OUTPUT_DIR="path_to_saved_model" ``` @@ -105,11 +110,13 @@ Before running the script, make sure you have the requirements installed: pip install -U -r requirements.txt ``` +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. 
+ Now you can launch the training script with the following command: ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="./dog" export OUTPUT_DIR="path-to-save-model" python train_dreambooth_flax.py \ @@ -135,7 +142,7 @@ The authors recommend generating `num_epochs * num_samples` images for prior pre ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path_to_class_images" export OUTPUT_DIR="path_to_saved_model" @@ -160,7 +167,7 @@ accelerate launch train_dreambooth.py \ ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -197,7 +204,7 @@ Pass the `--train_text_encoder` argument to the training script to enable finetu ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path_to_class_images" export OUTPUT_DIR="path_to_saved_model" @@ -224,7 +231,7 @@ accelerate launch train_dreambooth.py \ ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -360,7 +367,7 @@ Then pass the `--use_8bit_adam` option to the training script: ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path_to_class_images" export OUTPUT_DIR="path_to_saved_model" @@ -389,7 +396,7 @@ To run DreamBooth on a 12GB GPU, you'll need to enable gradient checkpointing, t ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path-to-instance-images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="path-to-save-model" @@ -436,7 +443,7 @@ Launch training with the following command: ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export INSTANCE_DIR="path_to_training_images" +export INSTANCE_DIR="./dog" export CLASS_DIR="path_to_class_images" export OUTPUT_DIR="path_to_saved_model" diff --git a/docs/source/en/training/instructpix2pix.mdx b/docs/source/en/training/instructpix2pix.mdx index c485db6d6b20..ff34ec335656 100644 --- a/docs/source/en/training/instructpix2pix.mdx +++ b/docs/source/en/training/instructpix2pix.mdx @@ -74,8 +74,7 @@ write_basic_config() As mentioned before, we'll use a [small toy dataset](https://huggingface.co/datasets/fusing/instructpix2pix-1000-samples) for training. The dataset is a smaller version of the [original dataset](https://huggingface.co/datasets/timbrooks/instructpix2pix-clip-filtered) used in the InstructPix2Pix paper. -Configure environment variables such as the dataset identifier and the Stable Diffusion -checkpoint: +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. 
You'll also need to specify the dataset name in `DATASET_ID`: ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx index ac2311df9f1e..7e3c3c0b2b68 100644 --- a/docs/source/en/training/lora.mdx +++ b/docs/source/en/training/lora.mdx @@ -52,7 +52,9 @@ Finetuning a model like Stable Diffusion, which has billions of parameters, can Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. -To start, make sure you have the `MODEL_NAME` and `DATASET_NAME` environment variables set. The `OUTPUT_DIR` and `HUB_MODEL_ID` variables are optional and specify where to save the model to on the Hub: +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. You'll also need to set the `DATASET_NAME` environment variable to the name of the dataset you want to train on. + +The `OUTPUT_DIR` and `HUB_MODEL_ID` variables are optional and specify where to save the model to on the Hub: ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" @@ -140,7 +142,9 @@ Load the LoRA weights from your finetuned model *on top of the base model weight Let's finetune [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) with DreamBooth and LoRA with some 🐶 [dog images](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ). Download and save these images to a directory. -To start, make sure you have the `MODEL_NAME` and `INSTANCE_DIR` (path to directory containing images) environment variables set. The `OUTPUT_DIR` variables is optional and specifies where to save the model to on the Hub: +To start, specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. You'll also need to set `INSTANCE_DIR` to the path of the directory containing the images. + +The `OUTPUT_DIR` variables is optional and specifies where to save the model to on the Hub: ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" diff --git a/docs/source/en/training/text2image.mdx b/docs/source/en/training/text2image.mdx index 70f8c003a787..dabb68397f78 100644 --- a/docs/source/en/training/text2image.mdx +++ b/docs/source/en/training/text2image.mdx @@ -72,7 +72,9 @@ To load a checkpoint to resume training, pass the argument `--resume_from_checkp -Launch the [PyTorch training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) for a fine-tuning run on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset like this: +Launch the [PyTorch training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) for a fine-tuning run on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset like this. 
+ +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. {"path": "../../../../examples/text_to_image/README.md", @@ -141,6 +143,8 @@ Before running the script, make sure you have the requirements installed: pip install -U -r requirements_flax.txt ``` +Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`~diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path`] argument. + Now you can launch the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_flax.py) like this: ```bash diff --git a/docs/source/en/training/text_inversion.mdx b/docs/source/en/training/text_inversion.mdx index 6e6971d7f119..e47a0519c704 100644 --- a/docs/source/en/training/text_inversion.mdx +++ b/docs/source/en/training/text_inversion.mdx @@ -1,4 +1,4 @@ - + +# IF + +## Overview + +DeepFloyd IF is a novel state-of-the-art open-source text-to-image model with a high degree of photorealism and language understanding. +The model is a modular composed of a frozen text encoder and three cascaded pixel diffusion modules: +- Stage 1: a base model that generates 64x64 px image based on text prompt, +- Stage 2: a 64x64 px => 256x256 px super-resolution model, and a +- Stage 3: a 256x256 px => 1024x1024 px super-resolution model +Stage 1 and Stage 2 utilize a frozen text encoder based on the T5 transformer to extract text embeddings, +which are then fed into a UNet architecture enhanced with cross-attention and attention pooling. +Stage 3 is [Stability's x4 Upscaling model](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler). +The result is a highly efficient model that outperforms current state-of-the-art models, achieving a zero-shot FID score of 6.66 on the COCO dataset. +Our work underscores the potential of larger UNet architectures in the first stage of cascaded diffusion models and depicts a promising future for text-to-image synthesis. + +## Usage + +Before you can use IF, you need to accept its usage conditions. To do so: +1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be loggin in +2. Accept the license on the model card of [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0) and [DeepFloyd/IF-II-L-v1.0](https://huggingface.co/DeepFloyd/IF-II-L-v1.0) +3. Make sure to login locally. Install `huggingface_hub` +```sh +pip install huggingface_hub --upgrade +``` + +run the login function in a Python shell + +```py +from huggingface_hub import login + +login() +``` + +and enter your [Hugging Face Hub access token](https://huggingface.co/docs/hub/security-tokens#what-are-user-access-tokens). + +Next we install `diffusers` and dependencies: + +```sh +pip install diffusers accelerate transformers safetensors +``` + +The following sections give more in-detail examples of how to use IF. 
Specifically:
+
+- [Text-to-Image Generation](#text-to-image-generation)
+- [Image-to-Image Generation](#text-guided-image-to-image-generation)
+- [Inpainting](#text-guided-inpainting-generation)
+- [Reusing model weights](#converting-between-different-pipelines)
+- [Speed optimization](#optimizing-for-speed)
+- [Memory optimization](#optimizing-for-memory)
+
+**Available checkpoints**
+- *Stage-1*
+  - [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0)
+  - [DeepFloyd/IF-I-L-v1.0](https://huggingface.co/DeepFloyd/IF-I-L-v1.0)
+  - [DeepFloyd/IF-I-M-v1.0](https://huggingface.co/DeepFloyd/IF-I-M-v1.0)
+
+- *Stage-2*
+  - [DeepFloyd/IF-II-L-v1.0](https://huggingface.co/DeepFloyd/IF-II-L-v1.0)
+  - [DeepFloyd/IF-II-M-v1.0](https://huggingface.co/DeepFloyd/IF-II-M-v1.0)
+
+- *Stage-3*
+  - [stabilityai/stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler)
+
+**Demo**
+[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/DeepFloyd/IF)
+
+**Google Colab**
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb)
+
+### Text-to-Image Generation
+
+By default diffusers makes use of [model cpu offloading](https://huggingface.co/docs/diffusers/optimization/fp16#model-offloading-for-fast-inference-and-memory-savings)
+to run the whole IF pipeline with as little as 14 GB of VRAM.
+
+```python
+from diffusers import DiffusionPipeline
+from diffusers.utils import pt_to_pil
+import torch
+
+# stage 1
+stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)
+stage_1.enable_model_cpu_offload()
+
+# stage 2
+stage_2 = DiffusionPipeline.from_pretrained(
+    "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
+)
+stage_2.enable_model_cpu_offload()
+
+# stage 3
+safety_modules = {
+    "feature_extractor": stage_1.feature_extractor,
+    "safety_checker": stage_1.safety_checker,
+    "watermarker": stage_1.watermarker,
+}
+stage_3 = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16
+)
+stage_3.enable_model_cpu_offload()
+
+prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
+generator = torch.manual_seed(1)
+
+# text embeds
+prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt)
+
+# stage 1
+image = stage_1(
+    prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, generator=generator, output_type="pt"
+).images
+pt_to_pil(image)[0].save("./if_stage_I.png")
+
+# stage 2
+image = stage_2(
+    image=image,
+    prompt_embeds=prompt_embeds,
+    negative_prompt_embeds=negative_embeds,
+    generator=generator,
+    output_type="pt",
+).images
+pt_to_pil(image)[0].save("./if_stage_II.png")
+
+# stage 3
+image = stage_3(prompt=prompt, image=image, noise_level=100, generator=generator).images
+image[0].save("./if_stage_III.png")
+```
+
+### Text Guided Image-to-Image Generation
+
+The same IF model weights can be used for text-guided image-to-image translation or image variation.
+In this case just make sure to load the weights using the [`IFImg2ImgPipeline`] and [`IFImg2ImgSuperResolutionPipeline`] pipelines.
+ +**Note**: You can also directly move the weights of the text-to-image pipelines to the image-to-image pipelines +without loading them twice by making use of the [`~DiffusionPipeline.components()`] function as explained [here](#converting-between-different-pipelines). + +```python +from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline +from diffusers.utils import pt_to_pil + +import torch + +from PIL import Image +import requests +from io import BytesIO + +# download image +url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" +response = requests.get(url) +original_image = Image.open(BytesIO(response.content)).convert("RGB") +original_image = original_image.resize((768, 512)) + +# stage 1 +stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1.enable_model_cpu_offload() + +# stage 2 +stage_2 = IFImg2ImgSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 +) +stage_2.enable_model_cpu_offload() + +# stage 3 +safety_modules = { + "feature_extractor": stage_1.feature_extractor, + "safety_checker": stage_1.safety_checker, + "watermarker": stage_1.watermarker, +} +stage_3 = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16 +) +stage_3.enable_model_cpu_offload() + +prompt = "A fantasy landscape in style minecraft" +generator = torch.manual_seed(1) + +# text embeds +prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt) + +# stage 1 +image = stage_1( + image=original_image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + generator=generator, + output_type="pt", +).images +pt_to_pil(image)[0].save("./if_stage_I.png") + +# stage 2 +image = stage_2( + image=image, + original_image=original_image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + generator=generator, + output_type="pt", +).images +pt_to_pil(image)[0].save("./if_stage_II.png") + +# stage 3 +image = stage_3(prompt=prompt, image=image, generator=generator, noise_level=100).images +image[0].save("./if_stage_III.png") +``` + +### Text Guided Inpainting Generation + +The same IF model weights can be used for text-guided image-to-image translation or image variation. +In this case just make sure to load the weights using the [`IFInpaintingPipeline`] and [`IFInpaintingSuperResolutionPipeline`] pipelines. + +**Note**: You can also directly move the weights of the text-to-image pipelines to the image-to-image pipelines +without loading them twice by making use of the [`~DiffusionPipeline.components()`] function as explained [here](#converting-between-different-pipelines). 
+ +```python +from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline +from diffusers.utils import pt_to_pil +import torch + +from PIL import Image +import requests +from io import BytesIO + +# download image +url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" +response = requests.get(url) +original_image = Image.open(BytesIO(response.content)).convert("RGB") +original_image = original_image + +# download mask +url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" +response = requests.get(url) +mask_image = Image.open(BytesIO(response.content)) +mask_image = mask_image + +# stage 1 +stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1.enable_model_cpu_offload() + +# stage 2 +stage_2 = IFInpaintingSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 +) +stage_2.enable_model_cpu_offload() + +# stage 3 +safety_modules = { + "feature_extractor": stage_1.feature_extractor, + "safety_checker": stage_1.safety_checker, + "watermarker": stage_1.watermarker, +} +stage_3 = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16 +) +stage_3.enable_model_cpu_offload() + +prompt = "blue sunglasses" +generator = torch.manual_seed(1) + +# text embeds +prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt) + +# stage 1 +image = stage_1( + image=original_image, + mask_image=mask_image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + generator=generator, + output_type="pt", +).images +pt_to_pil(image)[0].save("./if_stage_I.png") + +# stage 2 +image = stage_2( + image=image, + original_image=original_image, + mask_image=mask_image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + generator=generator, + output_type="pt", +).images +pt_to_pil(image)[0].save("./if_stage_II.png") + +# stage 3 +image = stage_3(prompt=prompt, image=image, generator=generator, noise_level=100).images +image[0].save("./if_stage_III.png") +``` + +### Converting between different pipelines + +In addition to being loaded with `from_pretrained`, Pipelines can also be loaded directly from each other. + +```python +from diffusers import IFPipeline, IFSuperResolutionPipeline + +pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0") +pipe_2 = IFSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0") + + +from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline + +pipe_1 = IFImg2ImgPipeline(**pipe_1.components) +pipe_2 = IFImg2ImgSuperResolutionPipeline(**pipe_2.components) + + +from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline + +pipe_1 = IFInpaintingPipeline(**pipe_1.components) +pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components) +``` + +### Optimizing for speed + +The simplest optimization to run IF faster is to move all model components to the GPU. + +```py +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.to("cuda") +``` + +You can also run the diffusion process for a shorter number of timesteps. 
+ +This can either be done with the `num_inference_steps` argument + +```py +pipe("", num_inference_steps=30) +``` + +Or with the `timesteps` argument + +```py +from diffusers.pipelines.deepfloyd_if import fast27_timesteps + +pipe("", timesteps=fast27_timesteps) +``` + +When doing image variation or inpainting, you can also decrease the number of timesteps +with the strength argument. The strength argument is the amount of noise to add to +the input image which also determines how many steps to run in the denoising process. +A smaller number will vary the image less but run faster. + +```py +pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.to("cuda") + +image = pipe(image=image, prompt="", strength=0.3).images +``` + +You can also use [`torch.compile`](../../optimization/torch2.0). Note that we have not exhaustively tested `torch.compile` +with IF and it might not give expected results. + +```py +import torch + +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.to("cuda") + +pipe.text_encoder = torch.compile(pipe.text_encoder) +pipe.unet = torch.compile(pipe.unet) +``` + +### Optimizing for memory + +When optimizing for GPU memory, we can use the standard diffusers cpu offloading APIs. + +Either the model based CPU offloading, + +```py +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.enable_model_cpu_offload() +``` + +or the more aggressive layer based CPU offloading. + +```py +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe.enable_sequential_cpu_offload() +``` + +Additionally, T5 can be loaded in 8bit precision + +```py +from transformers import T5EncoderModel + +text_encoder = T5EncoderModel.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" +) + +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", + text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder + unet=None, + device_map="auto", +) + +prompt_embeds, negative_embeds = pipe.encode_prompt("") +``` + +For CPU RAM constrained machines like google colab free tier where we can't load all +model components to the CPU at once, we can manually only load the pipeline with +the text encoder or unet when the respective model components are needed. 
+ +```py +from diffusers import IFPipeline, IFSuperResolutionPipeline +import torch +import gc +from transformers import T5EncoderModel +from diffusers.utils import pt_to_pil + +text_encoder = T5EncoderModel.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" +) + +# text to image + +pipe = DiffusionPipeline.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", + text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder + unet=None, + device_map="auto", +) + +prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' +prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + +# Remove the pipeline so we can re-load the pipeline with the unet +del text_encoder +del pipe +gc.collect() +torch.cuda.empty_cache() + +pipe = IFPipeline.from_pretrained( + "DeepFloyd/IF-I-IF-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto" +) + +generator = torch.Generator().manual_seed(0) +image = pipe( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + output_type="pt", + generator=generator, +).images + +pt_to_pil(image)[0].save("./if_stage_I.png") + +# Remove the pipeline so we can load the super-resolution pipeline +del pipe +gc.collect() +torch.cuda.empty_cache() + +# First super resolution + +pipe = IFSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto" +) + +generator = torch.Generator().manual_seed(0) +image = pipe( + image=image, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_embeds, + output_type="pt", + generator=generator, +).images + +pt_to_pil(image)[0].save("./if_stage_II.png") +``` + + +## Available Pipelines: + +| Pipeline | Tasks | Colab +|---|---|:---:| +| [pipeline_if.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py) | *Text-to-Image Generation* | - | +| [pipeline_if_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py) | *Text-to-Image Generation* | - | +| [pipeline_if_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py) | *Image-to-Image Generation* | - | +| [pipeline_if_img2img_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py) | *Image-to-Image Generation* | - | +| [pipeline_if_inpainting.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py) | *Image-to-Image Generation* | - | +| [pipeline_if_inpainting_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py) | *Image-to-Image Generation* | - | + +## IFPipeline +[[autodoc]] IFPipeline + - all + - __call__ + +## IFSuperResolutionPipeline +[[autodoc]] IFSuperResolutionPipeline + - all + - __call__ + +## IFImg2ImgPipeline +[[autodoc]] IFImg2ImgPipeline + - all + - __call__ + +## IFImg2ImgSuperResolutionPipeline +[[autodoc]] IFImg2ImgSuperResolutionPipeline + - all + - __call__ + +## IFInpaintingPipeline +[[autodoc]] IFInpaintingPipeline + - all + - __call__ + +## IFInpaintingSuperResolutionPipeline 
+[[autodoc]] IFInpaintingSuperResolutionPipeline + - all + - __call__ diff --git a/docs/source/en/api/pipelines/overview.mdx b/docs/source/en/api/pipelines/overview.mdx index 3c5331955513..91716784f8fe 100644 --- a/docs/source/en/api/pipelines/overview.mdx +++ b/docs/source/en/api/pipelines/overview.mdx @@ -51,6 +51,9 @@ available a colab notebook to directly try them out. | [dance_diffusion](./dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | | [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | +| [if](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) +| [if_img2img](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) +| [if_inpainting](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) | [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | | [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | | [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 10a237f29278..46a985ac2f8d 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -58,6 +58,9 @@ The library has three main components: | [dance_diffusion](./api/pipelines/dance_diffusion) | [Dance Diffusion](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | | [ddpm](./api/pipelines/ddpm) | [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | | [ddim](./api/pipelines/ddim) | [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | +| [if](./if) | [**IF**](./api/pipelines/if) | Image Generation | +| [if_img2img](./if) | [**IF**](./api/pipelines/if) | Image-to-Image Generation | +| [if_inpainting](./if) | [**IF**](./api/pipelines/if) | Image-to-Image Generation | | [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | | [latent_diffusion](./api/pipelines/latent_diffusion) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | | 
[latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | diff --git a/scripts/convert_if.py b/scripts/convert_if.py new file mode 100644 index 000000000000..66d7f694c8e1 --- /dev/null +++ b/scripts/convert_if.py @@ -0,0 +1,1257 @@ +import argparse +import inspect +import os + +import numpy as np +import torch +from torch.nn import functional as F +from transformers import CLIPConfig, CLIPImageProcessor, CLIPVisionModelWithProjection, T5EncoderModel, T5Tokenizer + +from diffusers import DDPMScheduler, IFPipeline, IFSuperResolutionPipeline, UNet2DConditionModel +from diffusers.pipelines.deepfloyd_if.safety_checker import IFSafetyChecker + + +try: + from omegaconf import OmegaConf +except ImportError: + raise ImportError( + "OmegaConf is required to convert the IF checkpoints. Please install it with `pip install" " OmegaConf`." + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--dump_path", required=False, default=None, type=str) + + parser.add_argument("--dump_path_stage_2", required=False, default=None, type=str) + + parser.add_argument("--dump_path_stage_3", required=False, default=None, type=str) + + parser.add_argument("--unet_config", required=False, default=None, type=str, help="Path to unet config file") + + parser.add_argument( + "--unet_checkpoint_path", required=False, default=None, type=str, help="Path to unet checkpoint file" + ) + + parser.add_argument( + "--unet_checkpoint_path_stage_2", + required=False, + default=None, + type=str, + help="Path to stage 2 unet checkpoint file", + ) + + parser.add_argument( + "--unet_checkpoint_path_stage_3", + required=False, + default=None, + type=str, + help="Path to stage 3 unet checkpoint file", + ) + + parser.add_argument("--p_head_path", type=str, required=True) + + parser.add_argument("--w_head_path", type=str, required=True) + + args = parser.parse_args() + + return args + + +def main(args): + tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl") + text_encoder = T5EncoderModel.from_pretrained("google/t5-v1_1-xxl") + + feature_extractor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14") + safety_checker = convert_safety_checker(p_head_path=args.p_head_path, w_head_path=args.w_head_path) + + if args.unet_config is not None and args.unet_checkpoint_path is not None and args.dump_path is not None: + convert_stage_1_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args) + + if args.unet_checkpoint_path_stage_2 is not None and args.dump_path_stage_2 is not None: + convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage=2) + + if args.unet_checkpoint_path_stage_3 is not None and args.dump_path_stage_3 is not None: + convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage=3) + + +def convert_stage_1_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args): + unet = get_stage_1_unet(args.unet_config, args.unet_checkpoint_path) + + scheduler = DDPMScheduler( + variance_type="learned_range", + beta_schedule="squaredcos_cap_v2", + prediction_type="epsilon", + thresholding=True, + dynamic_thresholding_ratio=0.95, + sample_max_value=1.5, + ) + + pipe = IFPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, 
+ requires_safety_checker=True, + ) + + pipe.save_pretrained(args.dump_path) + + +def convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safety_checker, args, stage): + if stage == 2: + unet_checkpoint_path = args.unet_checkpoint_path_stage_2 + sample_size = None + dump_path = args.dump_path_stage_2 + elif stage == 3: + unet_checkpoint_path = args.unet_checkpoint_path_stage_3 + sample_size = 1024 + dump_path = args.dump_path_stage_3 + else: + assert False + + unet = get_super_res_unet(unet_checkpoint_path, verify_param_count=False, sample_size=sample_size) + + image_noising_scheduler = DDPMScheduler( + beta_schedule="squaredcos_cap_v2", + ) + + scheduler = DDPMScheduler( + variance_type="learned_range", + beta_schedule="squaredcos_cap_v2", + prediction_type="epsilon", + thresholding=True, + dynamic_thresholding_ratio=0.95, + sample_max_value=1.0, + ) + + pipe = IFSuperResolutionPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + requires_safety_checker=True, + ) + + pipe.save_pretrained(dump_path) + + +def get_stage_1_unet(unet_config, unet_checkpoint_path): + original_unet_config = OmegaConf.load(unet_config) + original_unet_config = original_unet_config.params + + unet_diffusers_config = create_unet_diffusers_config(original_unet_config) + + unet = UNet2DConditionModel(**unet_diffusers_config) + + device = "cuda" if torch.cuda.is_available() else "cpu" + unet_checkpoint = torch.load(unet_checkpoint_path, map_location=device) + + converted_unet_checkpoint = convert_ldm_unet_checkpoint( + unet_checkpoint, unet_diffusers_config, path=unet_checkpoint_path + ) + + unet.load_state_dict(converted_unet_checkpoint) + + return unet + + +def convert_safety_checker(p_head_path, w_head_path): + state_dict = {} + + # p head + + p_head = np.load(p_head_path) + + p_head_weights = p_head["weights"] + p_head_weights = torch.from_numpy(p_head_weights) + p_head_weights = p_head_weights.unsqueeze(0) + + p_head_biases = p_head["biases"] + p_head_biases = torch.from_numpy(p_head_biases) + p_head_biases = p_head_biases.unsqueeze(0) + + state_dict["p_head.weight"] = p_head_weights + state_dict["p_head.bias"] = p_head_biases + + # w head + + w_head = np.load(w_head_path) + + w_head_weights = w_head["weights"] + w_head_weights = torch.from_numpy(w_head_weights) + w_head_weights = w_head_weights.unsqueeze(0) + + w_head_biases = w_head["biases"] + w_head_biases = torch.from_numpy(w_head_biases) + w_head_biases = w_head_biases.unsqueeze(0) + + state_dict["w_head.weight"] = w_head_weights + state_dict["w_head.bias"] = w_head_biases + + # vision model + + vision_model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") + vision_model_state_dict = vision_model.state_dict() + + for key, value in vision_model_state_dict.items(): + key = f"vision_model.{key}" + state_dict[key] = value + + # full model + + config = CLIPConfig.from_pretrained("openai/clip-vit-large-patch14") + safety_checker = IFSafetyChecker(config) + + safety_checker.load_state_dict(state_dict) + + return safety_checker + + +def create_unet_diffusers_config(original_unet_config, class_embed_type=None): + attention_resolutions = parse_list(original_unet_config.attention_resolutions) + attention_resolutions = [original_unet_config.image_size // int(res) for res in attention_resolutions] + + channel_mult = 
parse_list(original_unet_config.channel_mult) + block_out_channels = [original_unet_config.model_channels * mult for mult in channel_mult] + + down_block_types = [] + resolution = 1 + + for i in range(len(block_out_channels)): + if resolution in attention_resolutions: + block_type = "SimpleCrossAttnDownBlock2D" + elif original_unet_config.resblock_updown: + block_type = "ResnetDownsampleBlock2D" + else: + block_type = "DownBlock2D" + + down_block_types.append(block_type) + + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + if resolution in attention_resolutions: + block_type = "SimpleCrossAttnUpBlock2D" + elif original_unet_config.resblock_updown: + block_type = "ResnetUpsampleBlock2D" + else: + block_type = "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + head_dim = original_unet_config.num_head_channels + + use_linear_projection = ( + original_unet_config.use_linear_in_transformer + if "use_linear_in_transformer" in original_unet_config + else False + ) + if use_linear_projection: + # stable diffusion 2-base-512 and 2-768 + if head_dim is None: + head_dim = [5, 10, 20, 20] + + projection_class_embeddings_input_dim = None + + if class_embed_type is None: + if "num_classes" in original_unet_config: + if original_unet_config.num_classes == "sequential": + class_embed_type = "projection" + assert "adm_in_channels" in original_unet_config + projection_class_embeddings_input_dim = original_unet_config.adm_in_channels + else: + raise NotImplementedError( + f"Unknown conditional unet num_classes config: {original_unet_config.num_classes}" + ) + + config = { + "sample_size": original_unet_config.image_size, + "in_channels": original_unet_config.in_channels, + "down_block_types": tuple(down_block_types), + "block_out_channels": tuple(block_out_channels), + "layers_per_block": original_unet_config.num_res_blocks, + "cross_attention_dim": original_unet_config.encoder_channels, + "attention_head_dim": head_dim, + "use_linear_projection": use_linear_projection, + "class_embed_type": class_embed_type, + "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, + "out_channels": original_unet_config.out_channels, + "up_block_types": tuple(up_block_types), + "upcast_attention": False, # TODO: guessing + "cross_attention_norm": "group_norm", + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "addition_embed_type": "text", + "act_fn": "gelu", + } + + if original_unet_config.use_scale_shift_norm: + config["resnet_time_scale_shift"] = "scale_shift" + + if "encoder_dim" in original_unet_config: + config["encoder_hid_dim"] = original_unet_config.encoder_dim + + return config + + +def convert_ldm_unet_checkpoint(unet_state_dict, config, path=None): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + if config["class_embed_type"] in [None, "identity"]: + # No parameters to port + ... 
+ elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + else: + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." 
in key] + for layer_id in range(num_output_blocks) + } + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + paths = renew_resnet_paths(resnets) + + # TODO need better check than i in [4, 8, 12, 16] + block_type = config["down_block_types"][block_id] + if (block_type == "ResnetDownsampleBlock2D" or block_type == "SimpleCrossAttnDownBlock2D") and i in [ + 4, + 8, + 12, + 16, + ]: + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.downsamplers.0"} + else: + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(attentions): + old_path = f"input_blocks.{i}.1" + new_path = f"down_blocks.{block_id}.attentions.{layer_in_block_id}" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + paths = renew_attention_paths(attentions) + meta_path = {"old": old_path, "new": new_path} + assign_to_checkpoint( + paths, + new_checkpoint, + unet_state_dict, + additional_replacements=[meta_path], + config=config, + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + old_path = "middle_block.1" + new_path = "mid_block.attentions.0" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + for i in range(num_output_blocks): + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + # len(output_block_list) == 1 -> resnet + # len(output_block_list) == 2 -> resnet, attention + # len(output_block_list) == 3 -> resnet, attention, upscale resnet + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in 
output_blocks[i] if f"output_blocks.{i}.1" in key] + + paths = renew_resnet_paths(resnets) + + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + if ["conv.bias", "conv.weight"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] + + # Clear attentions as they have been attributed above. + if len(attentions) == 2: + attentions = [] + + if len(attentions): + old_path = f"output_blocks.{i}.1" + new_path = f"up_blocks.{block_id}.attentions.{layer_in_block_id}" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + paths = renew_attention_paths(attentions) + meta_path = { + "old": old_path, + "new": new_path, + } + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(output_block_list) == 3: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.2" in key] + paths = renew_resnet_paths(resnets) + meta_path = {"old": f"output_blocks.{i}.2", "new": f"up_blocks.{block_id}.upsamplers.0"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + if "encoder_proj.weight" in unet_state_dict: + new_checkpoint["encoder_hid_proj.weight"] = unet_state_dict.pop("encoder_proj.weight") + new_checkpoint["encoder_hid_proj.bias"] = unet_state_dict.pop("encoder_proj.bias") + + if "encoder_pooling.0.weight" in unet_state_dict: + new_checkpoint["add_embedding.norm1.weight"] = unet_state_dict.pop("encoder_pooling.0.weight") + new_checkpoint["add_embedding.norm1.bias"] = unet_state_dict.pop("encoder_pooling.0.bias") + + new_checkpoint["add_embedding.pool.positional_embedding"] = unet_state_dict.pop( + "encoder_pooling.1.positional_embedding" + ) + new_checkpoint["add_embedding.pool.k_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.k_proj.weight") + new_checkpoint["add_embedding.pool.k_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.k_proj.bias") + new_checkpoint["add_embedding.pool.q_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.q_proj.weight") + new_checkpoint["add_embedding.pool.q_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.q_proj.bias") + new_checkpoint["add_embedding.pool.v_proj.weight"] = unet_state_dict.pop("encoder_pooling.1.v_proj.weight") + new_checkpoint["add_embedding.pool.v_proj.bias"] = unet_state_dict.pop("encoder_pooling.1.v_proj.bias") + + new_checkpoint["add_embedding.proj.weight"] = unet_state_dict.pop("encoder_pooling.2.weight") + new_checkpoint["add_embedding.proj.bias"] = 
unet_state_dict.pop("encoder_pooling.2.bias") + + new_checkpoint["add_embedding.norm2.weight"] = unet_state_dict.pop("encoder_pooling.3.weight") + new_checkpoint["add_embedding.norm2.bias"] = unet_state_dict.pop("encoder_pooling.3.bias") + + return new_checkpoint + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. + """ + if n_shave_prefix_segments >= 0: + return ".".join(path.split(".")[n_shave_prefix_segments:]) + else: + return ".".join(path.split(".")[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace("in_layers.0", "norm1") + new_item = new_item.replace("in_layers.2", "conv1") + + new_item = new_item.replace("out_layers.0", "norm2") + new_item = new_item.replace("out_layers.3", "conv2") + + new_item = new_item.replace("emb_layers.1", "time_emb_proj") + new_item = new_item.replace("skip_connection", "conv_shortcut") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + if "qkv" in new_item: + continue + + if "encoder_kv" in new_item: + continue + + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") + + new_item = new_item.replace("proj_out.weight", "to_out.0.weight") + new_item = new_item.replace("proj_out.bias", "to_out.0.bias") + + new_item = new_item.replace("norm_encoder.weight", "norm_cross.weight") + new_item = new_item.replace("norm_encoder.bias", "norm_cross.bias") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def assign_attention_to_checkpoint(new_checkpoint, unet_state_dict, old_path, new_path, config): + qkv_weight = unet_state_dict.pop(f"{old_path}.qkv.weight") + qkv_weight = qkv_weight[:, :, 0] + + qkv_bias = unet_state_dict.pop(f"{old_path}.qkv.bias") + + is_cross_attn_only = "only_cross_attention" in config and config["only_cross_attention"] + + split = 1 if is_cross_attn_only else 3 + + weights, bias = split_attentions( + weight=qkv_weight, + bias=qkv_bias, + split=split, + chunk_size=config["attention_head_dim"], + ) + + if is_cross_attn_only: + query_weight, q_bias = weights, bias + new_checkpoint[f"{new_path}.to_q.weight"] = query_weight[0] + new_checkpoint[f"{new_path}.to_q.bias"] = q_bias[0] + else: + [query_weight, key_weight, value_weight], [q_bias, k_bias, v_bias] = weights, bias + new_checkpoint[f"{new_path}.to_q.weight"] = query_weight + new_checkpoint[f"{new_path}.to_q.bias"] = q_bias + new_checkpoint[f"{new_path}.to_k.weight"] = key_weight + new_checkpoint[f"{new_path}.to_k.bias"] = k_bias + new_checkpoint[f"{new_path}.to_v.weight"] = value_weight + new_checkpoint[f"{new_path}.to_v.bias"] = v_bias + + encoder_kv_weight = unet_state_dict.pop(f"{old_path}.encoder_kv.weight") + encoder_kv_weight = encoder_kv_weight[:, :, 0] + + encoder_kv_bias = unet_state_dict.pop(f"{old_path}.encoder_kv.bias") + + [encoder_k_weight, encoder_v_weight], 
[encoder_k_bias, encoder_v_bias] = split_attentions( + weight=encoder_kv_weight, + bias=encoder_kv_bias, + split=2, + chunk_size=config["attention_head_dim"], + ) + + new_checkpoint[f"{new_path}.add_k_proj.weight"] = encoder_k_weight + new_checkpoint[f"{new_path}.add_k_proj.bias"] = encoder_k_bias + new_checkpoint[f"{new_path}.add_v_proj.weight"] = encoder_v_weight + new_checkpoint[f"{new_path}.add_v_proj.bias"] = encoder_v_bias + + +def assign_to_checkpoint(paths, checkpoint, old_checkpoint, additional_replacements=None, config=None): + """ + This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits + attention layers, and takes into account additional replacements that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + for path in paths: + new_path = path["new"] + + # Global renaming happens here + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement["old"], replacement["new"]) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path or "to_out.0.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path["old"]] + + +# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) +def split_attentions(*, weight, bias, split, chunk_size): + weights = [None] * split + biases = [None] * split + + weights_biases_idx = 0 + + for starting_row_index in range(0, weight.shape[0], chunk_size): + row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size) + + weight_rows = weight[row_indices, :] + bias_rows = bias[row_indices] + + if weights[weights_biases_idx] is None: + weights[weights_biases_idx] = weight_rows + biases[weights_biases_idx] = bias_rows + else: + assert weights[weights_biases_idx] is not None + weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows]) + biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows]) + + weights_biases_idx = (weights_biases_idx + 1) % split + + return weights, biases + + +def parse_list(value): + if isinstance(value, str): + value = value.split(",") + value = [int(v) for v in value] + elif isinstance(value, list): + pass + else: + raise ValueError(f"Can't parse list for type: {type(value)}") + + return value + + +# below is copy and pasted from original convert_if_stage_2.py script + + +def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_size=None): + orig_path = unet_checkpoint_path + + original_unet_config = OmegaConf.load(os.path.join(orig_path, "config.yml")) + original_unet_config = original_unet_config.params + + unet_diffusers_config = superres_create_unet_diffusers_config(original_unet_config) + unet_diffusers_config["time_embedding_dim"] = original_unet_config.model_channels * int( + original_unet_config.channel_mult.split(",")[-1] + ) + if original_unet_config.encoder_dim != original_unet_config.encoder_channels: + unet_diffusers_config["encoder_hid_dim"] = original_unet_config.encoder_dim + 
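A minimal standalone sketch of the row-dealing that `split_attentions` performs on a fused `qkv` projection: rows are consumed in `chunk_size`-sized blocks (one attention head at a time) and handed out round-robin to the q, k and v targets. The sizes below, and the assumption that the fused rows alternate q/k/v per head, are illustrative only.

```py
import torch

head_dim, num_heads = 4, 2
channels = head_dim * num_heads

# Hypothetical fused projection, rows laid out head-wise: q_h0, k_h0, v_h0, q_h1, ...
qkv_weight = torch.arange(3 * channels * channels, dtype=torch.float32).reshape(3 * channels, channels)
qkv_bias = torch.arange(3 * channels, dtype=torch.float32)

def split_round_robin(weight, bias, split, chunk_size):
    # Same row-dealing pattern as `split_attentions` above, written with lists for brevity.
    weights, biases = [[] for _ in range(split)], [[] for _ in range(split)]
    for idx, start in enumerate(range(0, weight.shape[0], chunk_size)):
        weights[idx % split].append(weight[start : start + chunk_size])
        biases[idx % split].append(bias[start : start + chunk_size])
    return [torch.cat(w) for w in weights], [torch.cat(b) for b in biases]

(q_w, k_w, v_w), (q_b, k_b, v_b) = split_round_robin(qkv_weight, qkv_bias, split=3, chunk_size=head_dim)
print(q_w.shape, k_w.shape, v_w.shape)  # each torch.Size([8, 8]): one full projection per target
```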
unet_diffusers_config["class_embed_type"] = "timestep" + unet_diffusers_config["addition_embed_type"] = "text" + + unet_diffusers_config["time_embedding_act_fn"] = "gelu" + unet_diffusers_config["resnet_skip_time_act"] = True + unet_diffusers_config["resnet_out_scale_factor"] = 1 / 0.7071 + unet_diffusers_config["mid_block_scale_factor"] = 1 / 0.7071 + unet_diffusers_config["only_cross_attention"] = ( + bool(original_unet_config.disable_self_attentions) + if ( + "disable_self_attentions" in original_unet_config + and isinstance(original_unet_config.disable_self_attentions, int) + ) + else True + ) + + if sample_size is None: + unet_diffusers_config["sample_size"] = original_unet_config.image_size + else: + # The second upscaler unet's sample size is incorrectly specified + # in the config and is instead hardcoded in source + unet_diffusers_config["sample_size"] = sample_size + + unet_checkpoint = torch.load(os.path.join(unet_checkpoint_path, "pytorch_model.bin"), map_location="cpu") + + if verify_param_count: + # check that architecture matches - is a bit slow + verify_param_count(orig_path, unet_diffusers_config) + + converted_unet_checkpoint = superres_convert_ldm_unet_checkpoint( + unet_checkpoint, unet_diffusers_config, path=unet_checkpoint_path + ) + converted_keys = converted_unet_checkpoint.keys() + + model = UNet2DConditionModel(**unet_diffusers_config) + expected_weights = model.state_dict().keys() + + diff_c_e = set(converted_keys) - set(expected_weights) + diff_e_c = set(expected_weights) - set(converted_keys) + + assert len(diff_e_c) == 0, f"Expected, but not converted: {diff_e_c}" + assert len(diff_c_e) == 0, f"Converted, but not expected: {diff_c_e}" + + model.load_state_dict(converted_unet_checkpoint) + + return model + + +def superres_create_unet_diffusers_config(original_unet_config): + attention_resolutions = parse_list(original_unet_config.attention_resolutions) + attention_resolutions = [original_unet_config.image_size // int(res) for res in attention_resolutions] + + channel_mult = parse_list(original_unet_config.channel_mult) + block_out_channels = [original_unet_config.model_channels * mult for mult in channel_mult] + + down_block_types = [] + resolution = 1 + + for i in range(len(block_out_channels)): + if resolution in attention_resolutions: + block_type = "SimpleCrossAttnDownBlock2D" + elif original_unet_config.resblock_updown: + block_type = "ResnetDownsampleBlock2D" + else: + block_type = "DownBlock2D" + + down_block_types.append(block_type) + + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + if resolution in attention_resolutions: + block_type = "SimpleCrossAttnUpBlock2D" + elif original_unet_config.resblock_updown: + block_type = "ResnetUpsampleBlock2D" + else: + block_type = "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + head_dim = original_unet_config.num_head_channels + use_linear_projection = ( + original_unet_config.use_linear_in_transformer + if "use_linear_in_transformer" in original_unet_config + else False + ) + if use_linear_projection: + # stable diffusion 2-base-512 and 2-768 + if head_dim is None: + head_dim = [5, 10, 20, 20] + + class_embed_type = None + projection_class_embeddings_input_dim = None + + if "num_classes" in original_unet_config: + if original_unet_config.num_classes == "sequential": + class_embed_type = "projection" + assert "adm_in_channels" in original_unet_config + projection_class_embeddings_input_dim = 
original_unet_config.adm_in_channels + else: + raise NotImplementedError( + f"Unknown conditional unet num_classes config: {original_unet_config.num_classes}" + ) + + config = { + "in_channels": original_unet_config.in_channels, + "down_block_types": tuple(down_block_types), + "block_out_channels": tuple(block_out_channels), + "layers_per_block": tuple(original_unet_config.num_res_blocks), + "cross_attention_dim": original_unet_config.encoder_channels, + "attention_head_dim": head_dim, + "use_linear_projection": use_linear_projection, + "class_embed_type": class_embed_type, + "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, + "out_channels": original_unet_config.out_channels, + "up_block_types": tuple(up_block_types), + "upcast_attention": False, # TODO: guessing + "cross_attention_norm": "group_norm", + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "act_fn": "gelu", + } + + if original_unet_config.use_scale_shift_norm: + config["resnet_time_scale_shift"] = "scale_shift" + + return config + + +def superres_convert_ldm_unet_checkpoint(unet_state_dict, config, path=None, extract_ema=False): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + if config["class_embed_type"] is None: + # No parameters to port + ... + elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["aug_proj.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["aug_proj.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["aug_proj.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["aug_proj.2.bias"] + else: + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") + + if "encoder_proj.weight" in unet_state_dict: + new_checkpoint["encoder_hid_proj.weight"] = unet_state_dict["encoder_proj.weight"] + new_checkpoint["encoder_hid_proj.bias"] = unet_state_dict["encoder_proj.bias"] + + if "encoder_pooling.0.weight" in unet_state_dict: + mapping = { + "encoder_pooling.0": "add_embedding.norm1", + "encoder_pooling.1": "add_embedding.pool", + "encoder_pooling.2": "add_embedding.proj", + "encoder_pooling.3": "add_embedding.norm2", + } + for key in unet_state_dict.keys(): + if key.startswith("encoder_pooling"): + prefix = key[: len("encoder_pooling.0")] + new_key = key.replace(prefix, mapping[prefix]) + new_checkpoint[new_key] = unet_state_dict[key] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = 
{ + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key] + for layer_id in range(num_output_blocks) + } + if not isinstance(config["layers_per_block"], int): + layers_per_block_list = [e + 1 for e in config["layers_per_block"]] + layers_per_block_cumsum = list(np.cumsum(layers_per_block_list)) + downsampler_ids = layers_per_block_cumsum + else: + # TODO need better check than i in [4, 8, 12, 16] + downsampler_ids = [4, 8, 12, 16] + + for i in range(1, num_input_blocks): + if isinstance(config["layers_per_block"], int): + layers_per_block = config["layers_per_block"] + block_id = (i - 1) // (layers_per_block + 1) + layer_in_block_id = (i - 1) % (layers_per_block + 1) + else: + block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if (i - 1) < n) + passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0 + layer_in_block_id = (i - 1) - passed_blocks + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + paths = renew_resnet_paths(resnets) + + block_type = config["down_block_types"][block_id] + if ( + block_type == "ResnetDownsampleBlock2D" or block_type == "SimpleCrossAttnDownBlock2D" + ) and i in downsampler_ids: + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.downsamplers.0"} + else: + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(attentions): + old_path = f"input_blocks.{i}.1" + new_path = f"down_blocks.{block_id}.attentions.{layer_in_block_id}" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + paths = renew_attention_paths(attentions) + meta_path = {"old": old_path, "new": new_path} + assign_to_checkpoint( + paths, + new_checkpoint, + unet_state_dict, + additional_replacements=[meta_path], + config=config, + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + old_path = "middle_block.1" + 
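A standalone sketch of the cumulative-sum bookkeeping used in the input-block loop above when `layers_per_block` is a per-block list; the value `[2, 2, 3]` is hypothetical, and the point is only how a flat `input_blocks.{i}` index maps to `(block_id, layer_in_block_id)`.

```py
import numpy as np

layers_per_block = [2, 2, 3]                                      # hypothetical per-block resnet counts
layers_per_block_list = [e + 1 for e in layers_per_block]         # one extra slot per block (downsampler position)
layers_per_block_cumsum = list(np.cumsum(layers_per_block_list))  # [3, 6, 10]

# input_blocks.0 is conv_in, so the mapping starts at i = 1
for i in range(1, layers_per_block_cumsum[-1] + 1):
    block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if (i - 1) < n)
    passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0
    layer_in_block_id = (i - 1) - passed_blocks
    tag = "downsampler slot" if i in layers_per_block_cumsum else f"resnet {layer_in_block_id}"
    print(f"input_blocks.{i} -> down_blocks.{block_id} ({tag})")
```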
new_path = "mid_block.attentions.0" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + if not isinstance(config["layers_per_block"], int): + layers_per_block_list = list(reversed([e + 1 for e in config["layers_per_block"]])) + layers_per_block_cumsum = list(np.cumsum(layers_per_block_list)) + + for i in range(num_output_blocks): + if isinstance(config["layers_per_block"], int): + layers_per_block = config["layers_per_block"] + block_id = i // (layers_per_block + 1) + layer_in_block_id = i % (layers_per_block + 1) + else: + block_id = next(k for k, n in enumerate(layers_per_block_cumsum) if i < n) + passed_blocks = layers_per_block_cumsum[block_id - 1] if block_id > 0 else 0 + layer_in_block_id = i - passed_blocks + + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + # len(output_block_list) == 1 -> resnet + # len(output_block_list) == 2 -> resnet, attention or resnet, upscale resnet + # len(output_block_list) == 3 -> resnet, attention, upscale resnet + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + + has_attention = True + if len(output_block_list) == 2 and any("in_layers" in k for k in output_block_list["1"]): + has_attention = False + + maybe_attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] + + paths = renew_resnet_paths(resnets) + + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + if ["conv.bias", "conv.weight"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] + + # this layer was no attention + has_attention = False + maybe_attentions = [] + + if has_attention: + old_path = f"output_blocks.{i}.1" + new_path = f"up_blocks.{block_id}.attentions.{layer_in_block_id}" + + assign_attention_to_checkpoint( + new_checkpoint=new_checkpoint, + unet_state_dict=unet_state_dict, + old_path=old_path, + new_path=new_path, + config=config, + ) + + paths = renew_attention_paths(maybe_attentions) + meta_path = { + "old": old_path, + "new": new_path, + } + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(output_block_list) == 3 or (not has_attention and len(maybe_attentions) > 0): + layer_id = len(output_block_list) - 1 + resnets = [key for key in output_blocks[i] if 
f"output_blocks.{i}.{layer_id}" in key] + paths = renew_resnet_paths(resnets) + meta_path = {"old": f"output_blocks.{i}.{layer_id}", "new": f"up_blocks.{block_id}.upsamplers.0"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + return new_checkpoint + + +def verify_param_count(orig_path, unet_diffusers_config): + if "-II-" in orig_path: + from deepfloyd_if.modules import IFStageII + + if_II = IFStageII(device="cpu", dir_or_name=orig_path) + elif "-III-" in orig_path: + from deepfloyd_if.modules import IFStageIII + + if_II = IFStageIII(device="cpu", dir_or_name=orig_path) + else: + assert f"Weird name. Should have -II- or -III- in path: {orig_path}" + + unet = UNet2DConditionModel(**unet_diffusers_config) + + # in params + assert_param_count(unet.time_embedding, if_II.model.time_embed) + assert_param_count(unet.conv_in, if_II.model.input_blocks[:1]) + + # downblocks + assert_param_count(unet.down_blocks[0], if_II.model.input_blocks[1:4]) + assert_param_count(unet.down_blocks[1], if_II.model.input_blocks[4:7]) + assert_param_count(unet.down_blocks[2], if_II.model.input_blocks[7:11]) + + if "-II-" in orig_path: + assert_param_count(unet.down_blocks[3], if_II.model.input_blocks[11:17]) + assert_param_count(unet.down_blocks[4], if_II.model.input_blocks[17:]) + if "-III-" in orig_path: + assert_param_count(unet.down_blocks[3], if_II.model.input_blocks[11:15]) + assert_param_count(unet.down_blocks[4], if_II.model.input_blocks[15:20]) + assert_param_count(unet.down_blocks[5], if_II.model.input_blocks[20:]) + + # mid block + assert_param_count(unet.mid_block, if_II.model.middle_block) + + # up block + if "-II-" in orig_path: + assert_param_count(unet.up_blocks[0], if_II.model.output_blocks[:6]) + assert_param_count(unet.up_blocks[1], if_II.model.output_blocks[6:12]) + assert_param_count(unet.up_blocks[2], if_II.model.output_blocks[12:16]) + assert_param_count(unet.up_blocks[3], if_II.model.output_blocks[16:19]) + assert_param_count(unet.up_blocks[4], if_II.model.output_blocks[19:]) + if "-III-" in orig_path: + assert_param_count(unet.up_blocks[0], if_II.model.output_blocks[:5]) + assert_param_count(unet.up_blocks[1], if_II.model.output_blocks[5:10]) + assert_param_count(unet.up_blocks[2], if_II.model.output_blocks[10:14]) + assert_param_count(unet.up_blocks[3], if_II.model.output_blocks[14:18]) + assert_param_count(unet.up_blocks[4], if_II.model.output_blocks[18:21]) + assert_param_count(unet.up_blocks[5], if_II.model.output_blocks[21:24]) + + # out params + assert_param_count(unet.conv_norm_out, if_II.model.out[0]) + assert_param_count(unet.conv_out, if_II.model.out[2]) + + # make sure all model architecture has same param count + assert_param_count(unet, if_II.model) + + +def assert_param_count(model_1, model_2): + count_1 = sum(p.numel() for p in model_1.parameters()) + count_2 = sum(p.numel() for p in model_2.parameters()) + assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}" + + +def superres_check_against_original(dump_path, unet_checkpoint_path): + model_path = dump_path + model = UNet2DConditionModel.from_pretrained(model_path) + 
model.to("cuda") + orig_path = unet_checkpoint_path + + if "-II-" in orig_path: + from deepfloyd_if.modules import IFStageII + + if_II_model = IFStageII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model + elif "-III-" in orig_path: + from deepfloyd_if.modules import IFStageIII + + if_II_model = IFStageIII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model + + batch_size = 1 + channels = model.in_channels // 2 + height = model.sample_size + width = model.sample_size + height = 1024 + width = 1024 + + torch.manual_seed(0) + + latents = torch.randn((batch_size, channels, height, width), device=model.device) + image_small = torch.randn((batch_size, channels, height // 4, width // 4), device=model.device) + + interpolate_antialias = {} + if "antialias" in inspect.signature(F.interpolate).parameters: + interpolate_antialias["antialias"] = True + image_upscaled = F.interpolate( + image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias + ) + + latent_model_input = torch.cat([latents, image_upscaled], dim=1).to(model.dtype) + t = torch.tensor([5], device=model.device).to(model.dtype) + + seq_len = 64 + encoder_hidden_states = torch.randn((batch_size, seq_len, model.config.encoder_hid_dim), device=model.device).to( + model.dtype + ) + + fake_class_labels = torch.tensor([t], device=model.device).to(model.dtype) + + with torch.no_grad(): + out = if_II_model(latent_model_input, t, aug_steps=fake_class_labels, text_emb=encoder_hidden_states) + + if_II_model.to("cpu") + del if_II_model + import gc + + torch.cuda.empty_cache() + gc.collect() + print(50 * "=") + + with torch.no_grad(): + noise_pred = model( + sample=latent_model_input, + encoder_hidden_states=encoder_hidden_states, + class_labels=fake_class_labels, + timestep=t, + ).sample + + print("Out shape", noise_pred.shape) + print("Diff", (out - noise_pred).abs().sum()) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 40029fcecfd1..e9d12bdb7cca 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -114,6 +114,12 @@ AltDiffusionPipeline, AudioLDMPipeline, CycleDiffusionPipeline, + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 772e119fbe97..af639de306ee 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -109,6 +109,7 @@ def register_to_config(self, **kwargs): # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument, # or solve in a more general way. 
kwargs.pop("kwargs", None) + if not hasattr(self, "_internal_dict"): internal_dict = kwargs else: @@ -550,6 +551,9 @@ def to_json_saveable(value): return value config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()} + # Don't save "_ignore_files" + config_dict.pop("_ignore_files", None) + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" def to_json_file(self, json_file_path: Union[str, os.PathLike]): diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index d12e75344ba1..fa88bce305e6 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -377,3 +377,69 @@ def forward(self, timestep, class_labels, hidden_dtype=None): conditioning = timesteps_emb + class_labels # (N, D) return conditioning + + +class TextTimeEmbedding(nn.Module): + def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64): + super().__init__() + self.norm1 = nn.LayerNorm(encoder_dim) + self.pool = AttentionPooling(num_heads, encoder_dim) + self.proj = nn.Linear(encoder_dim, time_embed_dim) + self.norm2 = nn.LayerNorm(time_embed_dim) + + def forward(self, hidden_states): + hidden_states = self.norm1(hidden_states) + hidden_states = self.pool(hidden_states) + hidden_states = self.proj(hidden_states) + hidden_states = self.norm2(hidden_states) + return hidden_states + + +class AttentionPooling(nn.Module): + # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 + + def __init__(self, num_heads, embed_dim, dtype=None): + super().__init__() + self.dtype = dtype + self.positional_embedding = nn.Parameter(torch.randn(1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.q_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.v_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.num_heads = num_heads + self.dim_per_head = embed_dim // self.num_heads + + def forward(self, x): + bs, length, width = x.size() + + def shape(x): + # (bs, length, width) --> (bs, length, n_heads, dim_per_head) + x = x.view(bs, -1, self.num_heads, self.dim_per_head) + # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) + x = x.transpose(1, 2) + # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) + x = x.reshape(bs * self.num_heads, -1, self.dim_per_head) + # (bs*n_heads, length, dim_per_head) --> (bs*n_heads, dim_per_head, length) + x = x.transpose(1, 2) + return x + + class_token = x.mean(dim=1, keepdim=True) + self.positional_embedding.to(x.dtype) + x = torch.cat([class_token, x], dim=1) # (bs, length+1, width) + + # (bs*n_heads, class_token_length, dim_per_head) + q = shape(self.q_proj(class_token)) + # (bs*n_heads, length+class_token_length, dim_per_head) + k = shape(self.k_proj(x)) + v = shape(self.v_proj(x)) + + # (bs*n_heads, class_token_length, length+class_token_length): + scale = 1 / math.sqrt(math.sqrt(self.dim_per_head)) + weight = torch.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) + + # (bs*n_heads, dim_per_head, class_token_length) + a = torch.einsum("bts,bcs->bct", weight, v) + + # (bs, length+1, width) + a = a.reshape(bs, -1, 1).transpose(1, 2) + + return a[:, 0, :] # cls_token diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 5363e6330623..521e99fdd69c 100644 --- 
a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -15,6 +15,7 @@ # limitations under the License. import inspect +import itertools import os from functools import partial from typing import Any, Callable, List, Optional, Tuple, Union @@ -60,7 +61,8 @@ def get_parameter_device(parameter: torch.nn.Module): try: - return next(parameter.parameters()).device + parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers()) + return next(parameters_and_buffers).device except StopIteration: # For torch.nn.DataParallel compatibility in PyTorch 1.5 @@ -75,7 +77,8 @@ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]: def get_parameter_dtype(parameter: torch.nn.Module): try: - return next(parameter.parameters()).dtype + parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers()) + return next(parameters_and_buffers).dtype except StopIteration: # For torch.nn.DataParallel compatibility in PyTorch 1.5 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index b4997a257643..38e0fa3b5b2e 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -23,7 +23,7 @@ from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from .embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .unet_2d_blocks import ( CrossAttnDownBlock2D, @@ -97,11 +97,16 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to None): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. num_class_embeds (`int`, *optional*, defaults to None): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. time_embedding_type (`str`, *optional*, default to `positional`): The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. + time_embedding_dim (`int`, *optional*, default to `None`): + An optional override for the dimension of the projected time embedding. time_embedding_act_fn (`str`, *optional*, default to `None`): Optional activation function to use on the time embeddings only one time before they as passed to the rest of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`. 
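A minimal standalone sketch of why `get_parameter_device` and `get_parameter_dtype` now chain in `buffers()`: a module that only registers buffers has no parameters, so `next(module.parameters())` raises `StopIteration`, while the chained iterator still yields a tensor whose device and dtype can be reported. The `BufferOnly` module below is a made-up minimal case.

```py
import itertools
import torch
import torch.nn as nn

class BufferOnly(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("table", torch.zeros(4, 4))

module = BufferOnly()
print(list(module.parameters()))  # [] -> next(module.parameters()) would raise StopIteration

parameters_and_buffers = itertools.chain(module.parameters(), module.buffers())
first = next(parameters_and_buffers)  # the registered buffer
print(first.device, first.dtype)      # cpu torch.float32
```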
@@ -155,12 +160,14 @@ def __init__( dual_cross_attention: bool = False, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", resnet_skip_time_act: bool = False, resnet_out_scale_factor: int = 1.0, time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, time_embedding_act_fn: Optional[str] = None, timestep_post_act: Optional[str] = None, time_cond_proj_dim: Optional[int] = None, @@ -170,6 +177,7 @@ def __init__( class_embeddings_concat: bool = False, mid_block_only_cross_attention: Optional[bool] = None, cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, ): super().__init__() @@ -214,7 +222,7 @@ def __init__( # time if time_embedding_type == "fourier": - time_embed_dim = block_out_channels[0] * 2 + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 if time_embed_dim % 2 != 0: raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") self.time_proj = GaussianFourierProjection( @@ -222,7 +230,7 @@ def __init__( ) timestep_input_dim = time_embed_dim elif time_embedding_type == "positional": - time_embed_dim = block_out_channels[0] * 4 + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] @@ -273,6 +281,18 @@ def __init__( else: self.class_embedding = None + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") + if time_embedding_act_fn is None: self.time_embed_act = None elif time_embedding_act_fn == "swish": @@ -684,6 +704,10 @@ def forward( else: emb = emb + class_emb + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + emb = emb + aug_emb + if self.time_embed_act is not None: emb = self.time_embed_act(emb) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 602cf028e2e9..10da653a1377 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -44,6 +44,14 @@ else: from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline from .audioldm import AudioLDMPipeline + from .deepfloyd_if import ( + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ) from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/deepfloyd_if/__init__.py b/src/diffusers/pipelines/deepfloyd_if/__init__.py new file mode 100644 index 000000000000..93414f20e733 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/__init__.py @@ -0,0 +1,54 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL + +from ...utils import BaseOutput, OptionalDependencyNotAvailable, 
is_torch_available, is_transformers_available +from .timesteps import ( + fast27_timesteps, + smart27_timesteps, + smart50_timesteps, + smart100_timesteps, + smart185_timesteps, + super27_timesteps, + super40_timesteps, + super100_timesteps, +) + + +@dataclass +class IFPipelineOutput(BaseOutput): + """ + Args: + Output class for Stable Diffusion pipelines. + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content or a watermark. `None` if safety checking could not be performed. + watermark_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety + checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_detected: Optional[List[bool]] + watermark_detected: Optional[List[bool]] + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 +else: + from .pipeline_if import IFPipeline + from .pipeline_if_img2img import IFImg2ImgPipeline + from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline + from .pipeline_if_inpainting import IFInpaintingPipeline + from .pipeline_if_inpainting_superresolution import IFInpaintingSuperResolutionPipeline + from .pipeline_if_superresolution import IFSuperResolutionPipeline + from .safety_checker import IFSafetyChecker + from .watermark import IFWatermarker diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py new file mode 100644 index 000000000000..a76e51a3ffe9 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -0,0 +1,854 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . 
import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt" + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> safety_modules = { + ... "feature_extractor": pipe.feature_extractor, + ... "safety_checker": pipe.safety_checker, + ... "watermarker": pipe.watermarker, + ... } + >>> super_res_2_pipe = DiffusionPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16 + ... ) + >>> super_res_2_pipe.enable_model_cpu_offload() + + >>> image = super_res_2_pipe( + ... prompt=prompt, + ... image=image, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. 
Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. 
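A usage sketch for the model offloading described above; the checkpoint name and dtype follow the example docstring earlier in this file, and the prompt is arbitrary.

```py
import torch
from diffusers import IFPipeline

pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16)

# Keeps only one sub-model (T5 text encoder, UNet, or safety checker) on the GPU at a time;
# the pipeline stores the text encoder's offload hook so it can be flushed before the UNet runs.
pipe.enable_model_cpu_offload()

prompt_embeds, negative_embeds = pipe.encode_prompt("a photo of a red panda reading a newspaper")
image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images
```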
+ self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. 
Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, device, generator): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + intermediate_images = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + intermediate_images = intermediate_images * self.scheduler.init_noise_sigma + return intermediate_images + + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = 
re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
+ + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + height: Optional[int] = None, + width: Optional[int] = None, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*, defaults to self.unet.config.sample_size): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size): + The width in pixels of the generated image. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 2. Define call parameters + height = height or self.unet.config.sample_size + width = width or self.unet.config.sample_size + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare intermediate images + intermediate_images = self.prepare_intermediate_images( + batch_size * num_images_per_prompt, + self.unet.config.in_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + + # 11. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. 
Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py new file mode 100644 index 000000000000..a31748450d4b --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -0,0 +1,979 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image.resize((768, 512)) + + >>> pipe = IFImg2ImgPipeline.from_pretrained( + ... "DeepFloyd/IF-I-IF-v1.0", + ... variant="fp16", + ... torch_dtype=torch.float16, + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A fantasy landscape in style minecraft" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", + ... text_encoder=None, + ... variant="fp16", + ... torch_dtype=torch.float16, + ... 
) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFImg2ImgPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 
3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
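Editorial note on the `encode_prompt` method reproduced above: the `repeat(1, num_images_per_prompt, 1)` followed by `view(...)` pattern can look opaque. The sketch below uses made-up toy shapes and is an illustration only; it checks that this "mps friendly" duplication is equivalent to a plain `repeat_interleave` along the batch dimension, which is exactly what duplicating embeddings per generated image requires.

```py
# Editorial sketch with toy shapes -- not part of the pipeline code.
import torch

prompt_embeds = torch.randn(2, 77, 64)  # (batch, seq_len, dim), illustrative sizes only
num_images_per_prompt = 3

bs_embed, seq_len, _ = prompt_embeds.shape
duplicated = prompt_embeds.repeat(1, num_images_per_prompt, 1)               # (2, 231, 64)
duplicated = duplicated.view(bs_embed * num_images_per_prompt, seq_len, -1)  # (6, 77, 64)

# Same result as duplicating each prompt's embedding back-to-back along the batch axis.
assert torch.equal(duplicated, prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0))
```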
+ + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None + ): + _, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + image = self.scheduler.add_noise(image, noise, timestep) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.7, + num_inference_steps: int = 80, + timesteps: List[int] = None, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. 
Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, image, batch_size, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. Prepare intermediate images + image = self.preprocess_image(image) + image = image.to(device=device, dtype=dtype) + + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, generator + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + + # 11. 
Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py new file mode 100644 index 000000000000..21e280654cf5 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -0,0 +1,1097 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image.resize((768, 512)) + + >>> pipe = IFImg2ImgPipeline.from_pretrained( + ... "DeepFloyd/IF-I-IF-v1.0", + ... variant="fp16", + ... torch_dtype=torch.float16, + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A fantasy landscape in style minecraft" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... 
image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", + ... text_encoder=None, + ... variant="fp16", + ... torch_dtype=torch.float16, + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warn( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
+ ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
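+ # Each hook in the chain moves its model back to CPU once the next model in the chain runs;
+ # the hook kept here is released explicitly via `self.final_offload_hook.offload()` at the
+ # end of `__call__`, so the last model is also returned to CPU.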
+ self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = 
re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: 
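+ # no negative prompt was given, fall back to unconditional (empty string) embeddings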
+ uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
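+ # (the `DDPMScheduler` used by this pipeline takes no `eta`, so the signature check below drops it)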
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + original_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # original_image + + if isinstance(original_image, list): + check_image_type = original_image[0] + else: + check_image_type = original_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(original_image, list): + image_batch_size = len(original_image) + elif isinstance(original_image, torch.Tensor): + image_batch_size = original_image.shape[0] + elif isinstance(original_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(original_image, np.ndarray): + image_batch_size = original_image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError( + f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image + def preprocess_original_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device) -> torch.Tensor: + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 255.0 for i in image] + + image = np.stack(image, axis=0) # to np + torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.prepare_intermediate_images + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None + ): + 
_, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + image = self.scheduler.add_noise(image, noise, timestep) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + original_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + original_image (`torch.FloatTensor` or `PIL.Image.Image`): + The original image that `image` was varied from. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + noise_level (`int`, *optional*, defaults to 250): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + original_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + device = self._execution_device + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. prepare original image + original_image = self.preprocess_original_image(original_image) + original_image = original_image.to(device=device, dtype=dtype) + + # 6. Prepare intermediate images + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + original_image, + noise_timestep, + batch_size, + num_images_per_prompt, + dtype, + device, + generator, + ) + + # 7. Prepare upscaled image and noise level + _, _, height, width = original_image.shape + + image = self.preprocess_image(image, num_images_per_prompt, device) + + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 12. Convert to PIL + image = self.numpy_to_pil(image) + + # 13. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py new file mode 100644 index 000000000000..95eba1cc7d24 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -0,0 +1,1098 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" + >>> response = requests.get(url) + >>> mask_image = Image.open(BytesIO(response.content)) + >>> mask_image = mask_image + + >>> pipe = IFInpaintingPipeline.from_pretrained( + ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "blue sunglasses" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... image=original_image, + ... mask_image=mask_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( + ... 
"DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... mask_image=mask_image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFInpaintingPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # mask_image + + if isinstance(mask_image, list): + check_image_type = mask_image[0] + else: + check_image_type = mask_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(mask_image, list): + image_batch_size = len(mask_image) + elif isinstance(mask_image, torch.Tensor): + image_batch_size = mask_image.shape[0] + elif isinstance(mask_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(mask_image, np.ndarray): + image_batch_size = mask_image.shape[0] + else: + assert False + + if image_batch_size != 1 and batch_size != image_batch_size: + raise ValueError( + f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = 
re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + def preprocess_mask_image(self, mask_image) -> torch.Tensor: + if not isinstance(mask_image, list): + mask_image = [mask_image] + + if isinstance(mask_image[0], torch.Tensor): + mask_image = torch.cat(mask_image, axis=0) if mask_image[0].ndim == 4 else torch.stack(mask_image, axis=0) + + if mask_image.ndim == 2: + # Batch and add channel dim for single mask + mask_image = mask_image.unsqueeze(0).unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] == 1: + # Single mask, the 0'th dimension is considered to be + 
# the existing batch size of 1 + mask_image = mask_image.unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] != 1: + # Batch of mask, the 0'th dimension is considered to be + # the batching dimension + mask_image = mask_image.unsqueeze(1) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + + elif isinstance(mask_image[0], PIL.Image.Image): + new_mask_image = [] + + for mask_image_ in mask_image: + mask_image_ = mask_image_.convert("L") + mask_image_ = resize(mask_image_, self.unet.sample_size) + mask_image_ = np.array(mask_image_) + mask_image_ = mask_image_[None, None, :] + new_mask_image.append(mask_image_) + + mask_image = new_mask_image + + mask_image = np.concatenate(mask_image, axis=0) + mask_image = mask_image.astype(np.float32) / 255.0 + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + elif isinstance(mask_image[0], np.ndarray): + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + return mask_image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None + ): + image_batch_size, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + noised_image = self.scheduler.add_noise(image, noise, timestep) + + image = (1 - mask_image) * image + mask_image * noised_image + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + mask_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 1.0, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + mask_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. Prepare intermediate images + image = self.preprocess_image(image) + image = image.to(device=device, dtype=dtype) + + mask_image = self.preprocess_mask_image(mask_image) + mask_image = mask_image.to(device=device, dtype=dtype) + + if mask_image.shape[0] == 1: + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0) + else: + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + prev_intermediate_images = intermediate_images + + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + + # 11. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py new file mode 100644 index 000000000000..4eb0bf300fa5 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -0,0 +1,1208 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" + >>> response = requests.get(url) + >>> mask_image = Image.open(BytesIO(response.content)) + >>> mask_image = mask_image + + >>> pipe = IFInpaintingPipeline.from_pretrained( + ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "blue sunglasses" + + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + >>> image = pipe( + ... image=original_image, + ... mask_image=mask_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... 
).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... mask_image=mask_image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` + """ + + +class IFInpaintingSuperResolutionPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warn( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
+ ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
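+        # At this point `hook` refers to whichever module was registered last with
+        # `cpu_offload_with_hook` (the safety checker when present, otherwise the unet),
+        # so flushing `final_offload_hook` after generation moves that last module back to the CPU.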
+ self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = 
re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: 
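+                # No negative prompt was given, so the unconditional branch is encoded from empty
+                # strings, one per prompt in the batch.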
+ uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
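+        # Note: these pipelines use a `DDPMScheduler`, whose `step()` does not accept `eta`,
+        # so the signature checks below will typically drop it.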
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + original_image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # original_image + + if isinstance(original_image, list): + check_image_type = original_image[0] + else: + check_image_type = original_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(original_image, list): + image_batch_size = len(original_image) + elif isinstance(original_image, torch.Tensor): + image_batch_size = original_image.shape[0] + elif isinstance(original_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(original_image, np.ndarray): + image_batch_size = original_image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError( + f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" + ) + + # mask_image + + if isinstance(mask_image, list): + check_image_type = mask_image[0] + else: + check_image_type = mask_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(mask_image, list): + image_batch_size = len(mask_image) + elif isinstance(mask_image, torch.Tensor): + image_batch_size = mask_image.shape[0] + elif isinstance(mask_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(mask_image, np.ndarray): + image_batch_size = mask_image.shape[0] + else: + assert False + + if image_batch_size != 1 and batch_size != image_batch_size: + raise ValueError( + f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image + def preprocess_original_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device) -> torch.Tensor: + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 255.0 for i in image] + + image = np.stack(image, axis=0) # to np + torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = 
torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.preprocess_mask_image + def preprocess_mask_image(self, mask_image) -> torch.Tensor: + if not isinstance(mask_image, list): + mask_image = [mask_image] + + if isinstance(mask_image[0], torch.Tensor): + mask_image = torch.cat(mask_image, axis=0) if mask_image[0].ndim == 4 else torch.stack(mask_image, axis=0) + + if mask_image.ndim == 2: + # Batch and add channel dim for single mask + mask_image = mask_image.unsqueeze(0).unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] == 1: + # Single mask, the 0'th dimension is considered to be + # the existing batch size of 1 + mask_image = mask_image.unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] != 1: + # Batch of mask, the 0'th dimension is considered to be + # the batching dimension + mask_image = mask_image.unsqueeze(1) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + + elif isinstance(mask_image[0], PIL.Image.Image): + new_mask_image = [] + + for mask_image_ in mask_image: + mask_image_ = mask_image_.convert("L") + mask_image_ = resize(mask_image_, self.unet.sample_size) + mask_image_ = np.array(mask_image_) + mask_image_ = mask_image_[None, None, :] + new_mask_image.append(mask_image_) + + mask_image = new_mask_image + + mask_image = np.concatenate(mask_image, axis=0) + mask_image = mask_image.astype(np.float32) / 255.0 + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + elif isinstance(mask_image[0], np.ndarray): + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + return mask_image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.prepare_intermediate_images + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None + ): + image_batch_size, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + noised_image = self.scheduler.add_noise(image, noise, timestep) + + image = (1 - mask_image) * image + mask_image * noised_image + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + original_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + mask_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + original_image (`torch.FloatTensor` or `PIL.Image.Image`): + The original image that `image` was varied from. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. 
of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + noise_level (`int`, *optional*, defaults to 0): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + original_image, + mask_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + device = self._execution_device + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. prepare original image + original_image = self.preprocess_original_image(original_image) + original_image = original_image.to(device=device, dtype=dtype) + + # 6. prepare mask image + mask_image = self.preprocess_mask_image(mask_image) + mask_image = mask_image.to(device=device, dtype=dtype) + + if mask_image.shape[0] == 1: + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0) + else: + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + + # 6. Prepare intermediate images + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + original_image, + noise_timestep, + batch_size, + num_images_per_prompt, + dtype, + device, + mask_image, + generator, + ) + + # 7. Prepare upscaled image and noise level + _, _, height, width = original_image.shape + + image = self.preprocess_image(image, num_images_per_prompt, device) + + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + prev_intermediate_images = intermediate_images + + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 12. Convert to PIL + image = self.numpy_to_pil(image) + + # 13. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py new file mode 100644 index 000000000000..bb1d4ee4ba66 --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -0,0 +1,947 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds + ... 
).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFSuperResolutionPipeline(DiffusionPipeline): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warn( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.text_encoder, + self.unet, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + + if self.text_encoder is not None: + _, hook = cpu_offload_with_hook(self.text_encoder, device, prev_module_hook=hook) + + # Accelerate will move the next model to the device _before_ calling the offload hook of the + # previous model. This will cause both models to be present on the device at the same time. + # IF uses T5 for its text encoder which is really large. We can manually call the offload + # hook for the text encoder to ensure it's moved to the cpu before the unet is moved to + # the GPU. + self.text_encoder_offload_hook = hook + + _, hook = cpu_offload_with_hook(self.unet, device, prev_module_hook=hook) + + # if the safety checker isn't called, `unet_offload_hook` will have to be called to manually offload the unet + self.unet_offload_hook = hook + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. 
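+        # (`final_offload_hook` ends up pointing at the hook created last in the chain:
+        # the safety checker's hook when one is loaded, otherwise the unet's. `__call__`
+        # calls its `offload()` at the end of generation so the last model is moved back
+        # to the CPU as well.)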
+ self.final_offload_hook = hook + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warn("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = 
re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: 
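+                # no negative prompt was given, so the unconditional branch of classifier-free
+                # guidance is conditioned on the empty string for every prompt in the batch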
+ uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + batch_size, + noise_level, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: + raise ValueError( + f"`noise_level`: {noise_level} must be a valid timestep in `self.noising_scheduler`, [0, {self.image_noising_scheduler.config.num_train_timesteps})" + ) + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_intermediate_images + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, device, generator): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + intermediate_images = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + intermediate_images = intermediate_images * self.scheduler.init_noise_sigma + return intermediate_images + + def preprocess_image(self, image, num_images_per_prompt, device): + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 255.0 for i in image] + + image = np.stack(image, axis=0) # to np + torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`PIL.Image.Image`, `np.ndarray`, `torch.FloatTensor`): + The image to be upscaled. 
+ num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + noise_level (`int`, *optional*, defaults to 250): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. 
Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. Raise error if not correct + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + batch_size, + noise_level, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + height = self.unet.config.sample_size + width = self.unet.config.sample_size + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare intermediate images + num_channels = self.unet.config.in_channels // 2 + intermediate_images = self.prepare_intermediate_images( + batch_size * num_images_per_prompt, + num_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare upscaled image and noise level + image = self.preprocess_image(image, num_images_per_prompt, device) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 9. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 10. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 11. Convert to PIL + image = self.numpy_to_pil(image) + + # 12. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 9. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 10. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/src/diffusers/pipelines/deepfloyd_if/safety_checker.py b/src/diffusers/pipelines/deepfloyd_if/safety_checker.py new file mode 100644 index 000000000000..8ffeed580bbe --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/safety_checker.py @@ -0,0 +1,59 @@ +import numpy as np +import torch +import torch.nn as nn +from transformers import CLIPConfig, CLIPVisionModelWithProjection, PreTrainedModel + +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class IFSafetyChecker(PreTrainedModel): + config_class = CLIPConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPConfig): + super().__init__(config) + + self.vision_model = CLIPVisionModelWithProjection(config.vision_config) + + self.p_head = nn.Linear(config.vision_config.projection_dim, 1) + self.w_head = nn.Linear(config.vision_config.projection_dim, 1) + + @torch.no_grad() + def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5): + image_embeds = self.vision_model(clip_input)[0] + + nsfw_detected = self.p_head(image_embeds) + nsfw_detected = nsfw_detected.flatten() + nsfw_detected = nsfw_detected > p_threshold + nsfw_detected = nsfw_detected.tolist() + + if any(nsfw_detected): + logger.warning( + "Potential NSFW content was detected in one or more images. A black image will be returned instead." + " Try again with a different prompt and/or seed." + ) + + for idx, nsfw_detected_ in enumerate(nsfw_detected): + if nsfw_detected_: + images[idx] = np.zeros(images[idx].shape) + + watermark_detected = self.w_head(image_embeds) + watermark_detected = watermark_detected.flatten() + watermark_detected = watermark_detected > w_threshold + watermark_detected = watermark_detected.tolist() + + if any(watermark_detected): + logger.warning( + "Potential watermarked content was detected in one or more images. A black image will be returned instead." + " Try again with a different prompt and/or seed." 
+ ) + + for idx, watermark_detected_ in enumerate(watermark_detected): + if watermark_detected_: + images[idx] = np.zeros(images[idx].shape) + + return images, nsfw_detected, watermark_detected diff --git a/src/diffusers/pipelines/deepfloyd_if/timesteps.py b/src/diffusers/pipelines/deepfloyd_if/timesteps.py new file mode 100644 index 000000000000..d44285c017bb --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/timesteps.py @@ -0,0 +1,579 @@ +fast27_timesteps = [ + 999, + 800, + 799, + 600, + 599, + 500, + 400, + 399, + 377, + 355, + 333, + 311, + 288, + 266, + 244, + 222, + 200, + 199, + 177, + 155, + 133, + 111, + 88, + 66, + 44, + 22, + 0, +] + +smart27_timesteps = [ + 999, + 976, + 952, + 928, + 905, + 882, + 858, + 857, + 810, + 762, + 715, + 714, + 572, + 429, + 428, + 286, + 285, + 238, + 190, + 143, + 142, + 118, + 95, + 71, + 47, + 24, + 0, +] + +smart50_timesteps = [ + 999, + 988, + 977, + 966, + 955, + 944, + 933, + 922, + 911, + 900, + 899, + 879, + 859, + 840, + 820, + 800, + 799, + 766, + 733, + 700, + 699, + 650, + 600, + 599, + 500, + 499, + 400, + 399, + 350, + 300, + 299, + 266, + 233, + 200, + 199, + 179, + 159, + 140, + 120, + 100, + 99, + 88, + 77, + 66, + 55, + 44, + 33, + 22, + 11, + 0, +] + +smart100_timesteps = [ + 999, + 995, + 992, + 989, + 985, + 981, + 978, + 975, + 971, + 967, + 964, + 961, + 957, + 956, + 951, + 947, + 942, + 937, + 933, + 928, + 923, + 919, + 914, + 913, + 908, + 903, + 897, + 892, + 887, + 881, + 876, + 871, + 870, + 864, + 858, + 852, + 846, + 840, + 834, + 828, + 827, + 820, + 813, + 806, + 799, + 792, + 785, + 784, + 777, + 770, + 763, + 756, + 749, + 742, + 741, + 733, + 724, + 716, + 707, + 699, + 698, + 688, + 677, + 666, + 656, + 655, + 645, + 634, + 623, + 613, + 612, + 598, + 584, + 570, + 569, + 555, + 541, + 527, + 526, + 505, + 484, + 483, + 462, + 440, + 439, + 396, + 395, + 352, + 351, + 308, + 307, + 264, + 263, + 220, + 219, + 176, + 132, + 88, + 44, + 0, +] + +smart185_timesteps = [ + 999, + 997, + 995, + 992, + 990, + 988, + 986, + 984, + 981, + 979, + 977, + 975, + 972, + 970, + 968, + 966, + 964, + 961, + 959, + 957, + 956, + 954, + 951, + 949, + 946, + 944, + 941, + 939, + 936, + 934, + 931, + 929, + 926, + 924, + 921, + 919, + 916, + 914, + 913, + 910, + 907, + 905, + 902, + 899, + 896, + 893, + 891, + 888, + 885, + 882, + 879, + 877, + 874, + 871, + 870, + 867, + 864, + 861, + 858, + 855, + 852, + 849, + 846, + 843, + 840, + 837, + 834, + 831, + 828, + 827, + 824, + 821, + 817, + 814, + 811, + 808, + 804, + 801, + 798, + 795, + 791, + 788, + 785, + 784, + 780, + 777, + 774, + 770, + 766, + 763, + 760, + 756, + 752, + 749, + 746, + 742, + 741, + 737, + 733, + 730, + 726, + 722, + 718, + 714, + 710, + 707, + 703, + 699, + 698, + 694, + 690, + 685, + 681, + 677, + 673, + 669, + 664, + 660, + 656, + 655, + 650, + 646, + 641, + 636, + 632, + 627, + 622, + 618, + 613, + 612, + 607, + 602, + 596, + 591, + 586, + 580, + 575, + 570, + 569, + 563, + 557, + 551, + 545, + 539, + 533, + 527, + 526, + 519, + 512, + 505, + 498, + 491, + 484, + 483, + 474, + 466, + 457, + 449, + 440, + 439, + 428, + 418, + 407, + 396, + 395, + 381, + 366, + 352, + 351, + 330, + 308, + 307, + 286, + 264, + 263, + 242, + 220, + 219, + 176, + 175, + 132, + 131, + 88, + 44, + 0, +] + +super27_timesteps = [ + 999, + 991, + 982, + 974, + 966, + 958, + 950, + 941, + 933, + 925, + 916, + 908, + 900, + 899, + 874, + 850, + 825, + 800, + 799, + 700, + 600, + 500, + 400, + 300, + 200, + 100, + 0, +] + +super40_timesteps = [ + 999, + 992, + 985, + 978, + 
971, + 964, + 957, + 949, + 942, + 935, + 928, + 921, + 914, + 907, + 900, + 899, + 879, + 859, + 840, + 820, + 800, + 799, + 766, + 733, + 700, + 699, + 650, + 600, + 599, + 500, + 499, + 400, + 399, + 300, + 299, + 200, + 199, + 100, + 99, + 0, +] + +super100_timesteps = [ + 999, + 996, + 992, + 989, + 985, + 982, + 979, + 975, + 972, + 968, + 965, + 961, + 958, + 955, + 951, + 948, + 944, + 941, + 938, + 934, + 931, + 927, + 924, + 920, + 917, + 914, + 910, + 907, + 903, + 900, + 899, + 891, + 884, + 876, + 869, + 861, + 853, + 846, + 838, + 830, + 823, + 815, + 808, + 800, + 799, + 788, + 777, + 766, + 755, + 744, + 733, + 722, + 711, + 700, + 699, + 688, + 677, + 666, + 655, + 644, + 633, + 622, + 611, + 600, + 599, + 585, + 571, + 557, + 542, + 528, + 514, + 500, + 499, + 485, + 471, + 457, + 442, + 428, + 414, + 400, + 399, + 379, + 359, + 340, + 320, + 300, + 299, + 279, + 259, + 240, + 220, + 200, + 199, + 166, + 133, + 100, + 99, + 66, + 33, + 0, +] diff --git a/src/diffusers/pipelines/deepfloyd_if/watermark.py b/src/diffusers/pipelines/deepfloyd_if/watermark.py new file mode 100644 index 000000000000..db33dec0ef9a --- /dev/null +++ b/src/diffusers/pipelines/deepfloyd_if/watermark.py @@ -0,0 +1,46 @@ +from typing import List + +import PIL +import torch +from PIL import Image + +from ...configuration_utils import ConfigMixin +from ...models.modeling_utils import ModelMixin +from ...utils import PIL_INTERPOLATION + + +class IFWatermarker(ModelMixin, ConfigMixin): + def __init__(self): + super().__init__() + + self.register_buffer("watermark_image", torch.zeros((62, 62, 4))) + self.watermark_image_as_pil = None + + def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): + # copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287 + + h = images[0].height + w = images[0].width + + sample_size = sample_size or h + + coef = min(h / sample_size, w / sample_size) + img_h, img_w = (int(h / coef), int(w / coef)) if coef < 1 else (h, w) + + S1, S2 = 1024**2, img_w * img_h + K = (S2 / S1) ** 0.5 + wm_size, wm_x, wm_y = int(K * 62), img_w - int(14 * K), img_h - int(14 * K) + + if self.watermark_image_as_pil is None: + watermark_image = self.watermark_image.to(torch.uint8).cpu().numpy() + watermark_image = Image.fromarray(watermark_image, mode="RGBA") + self.watermark_image_as_pil = watermark_image + + wm_img = self.watermark_image_as_pil.resize( + (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None + ) + + for pil_img in images: + pil_img.paste(wm_img, box=(wm_x - wm_size, wm_y - wm_size, wm_x, wm_y), mask=wm_img.split()[-1]) + + return images diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 2d61f1a3700f..8c028b64a8c8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -30,7 +30,6 @@ import torch from huggingface_hub import hf_hub_download, model_info, snapshot_download from packaging import version -from PIL import Image from tqdm.auto import tqdm import diffusers @@ -56,6 +55,7 @@ is_torch_version, is_transformers_available, logging, + numpy_to_pil, ) @@ -623,7 +623,9 @@ def module_is_sequentially_offloaded(module): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): return False - return hasattr(module, "_hf_hook") and not isinstance(module._hf_hook, accelerate.hooks.CpuOffload) + return hasattr(module, "_hf_hook") and not isinstance( + module._hf_hook, 
(accelerate.hooks.CpuOffload, accelerate.hooks.AlignDevicesHook) + ) def module_is_offloaded(module): if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"): @@ -653,7 +655,20 @@ def module_is_offloaded(module): is_offloaded = pipeline_is_offloaded or pipeline_is_sequentially_offloaded for module in modules: - module.to(torch_device, torch_dtype) + is_loaded_in_8bit = hasattr(module, "is_loaded_in_8bit") and module.is_loaded_in_8bit + + if is_loaded_in_8bit and torch_dtype is not None: + logger.warning( + f"The module '{module.__class__.__name__}' has been loaded in 8bit and conversion to {torch_dtype} is not yet supported. Module is still in 8bit precision." + ) + + if is_loaded_in_8bit and torch_device is not None: + logger.warning( + f"The module '{module.__class__.__name__}' has been loaded in 8bit and moving it to {torch_dtype} via `.to()` is not yet supported. Module is still on {module.device}." + ) + else: + module.to(torch_device, torch_dtype) + if ( module.dtype == torch.float16 and str(torch_device) in ["cpu"] @@ -887,6 +902,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P config_dict = cls.load_config(cached_folder) + # pop out "_ignore_files" as it is only needed for download + config_dict.pop("_ignore_files", None) + # 2. Define which model components should load variants # We retrieve the information by matching whether variant # model checkpoints exist in the subfolders @@ -1204,12 +1222,19 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: ) config_dict = cls._dict_from_json_file(config_file) + + ignore_filenames = config_dict.pop("_ignore_files", []) + # retrieve all folder_names that contain relevant files folder_names = [k for k, v in config_dict.items() if isinstance(v, list)] filenames = {sibling.rfilename for sibling in info.siblings} model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant) + # remove ignored filenames + model_filenames = set(model_filenames) - set(ignore_filenames) + variant_filenames = set(variant_filenames) - set(ignore_filenames) + # if the whole pipeline is cached we don't have to ping the Hub if revision in DEPRECATED_REVISION_ARGS and version.parse( version.parse(__version__).base_version @@ -1370,16 +1395,7 @@ def numpy_to_pil(images): """ Convert a numpy image or a batch of images to a PIL image. """ - if images.ndim == 3: - images = images[None, ...] 
- images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images + return numpy_to_pil(images) def progress_bar(self, iterable=None, total=None): if not hasattr(self, "_progress_bar_config"): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py index 8db19c2b9109..56681391aeeb 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py @@ -56,7 +56,18 @@ def __init__( scheduler: Any, max_noise_level: int = 350, ): - super().__init__(vae, text_encoder, tokenizer, unet, low_res_scheduler, scheduler, max_noise_level) + super().__init__( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + low_res_scheduler=low_res_scheduler, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + watermarker=None, + max_noise_level=max_noise_level, + ) def __call__( self, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 693208b18cdd..45b26de284af 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -13,18 +13,19 @@ # limitations under the License. import inspect -from typing import Callable, List, Optional, Union +from typing import Any, Callable, List, Optional, Union import numpy as np import PIL import torch -from transformers import CLIPTextModel, CLIPTokenizer +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor -from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -76,6 +77,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
""" + _optional_components = ["watermarker", "safety_checker", "feature_extractor"] def __init__( self, @@ -85,12 +87,16 @@ def __init__( unet: UNet2DConditionModel, low_res_scheduler: DDPMScheduler, scheduler: KarrasDiffusionSchedulers, + safety_checker: Optional[Any] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, + watermarker: Optional[Any] = None, max_noise_level: int = 350, ): super().__init__() - if hasattr(vae, "config"): - # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate + if hasattr( + vae, "config" + ): # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate is_vae_scaling_factor_set_to_0_08333 = ( hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333 ) @@ -113,6 +119,9 @@ def __init__( unet=unet, low_res_scheduler=low_res_scheduler, scheduler=scheduler, + safety_checker=safety_checker, + watermarker=watermarker, + feature_extractor=feature_extractor, ) self.register_to_config(max_noise_level=max_noise_level) @@ -178,6 +187,23 @@ def _execution_device(self): return torch.device(module._hf_hook.execution_device) return self.device + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( self, @@ -678,10 +704,19 @@ def __call__( self.final_offload_hook.offload() # 11. Convert to PIL + # has_nsfw_concept = False if output_type == "pil": + image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) + image = self.numpy_to_pil(image) + # 11. 
Apply watermark + if self.watermarker is not None: + image = self.watermarker.apply_watermark(image) + else: + has_nsfw_concept = None + if not return_dict: - return (image,) + return (image, has_nsfw_concept) - return ImagePipelineOutput(images=image) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 2a7b80d01da7..57e1abc7315b 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -15,7 +15,7 @@ AttnProcessor, ) from ...models.dual_transformer_2d import DualTransformer2DModel -from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from ...models.embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps from ...models.transformer_2d import Transformer2DModel from ...models.unet_2d_condition import UNet2DConditionOutput from ...utils import logging @@ -183,11 +183,16 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to None): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. num_class_embeds (`int`, *optional*, defaults to None): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. time_embedding_type (`str`, *optional*, defaults to `positional`): The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. + time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. time_embedding_act_fn (`str`, *optional*, defaults to `None`): Optional activation function to use on the time embeddings only one time before they are passed to the rest of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`.
@@ -246,12 +251,14 @@ def __init__( dual_cross_attention: bool = False, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", resnet_skip_time_act: bool = False, resnet_out_scale_factor: int = 1.0, time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, time_embedding_act_fn: Optional[str] = None, timestep_post_act: Optional[str] = None, time_cond_proj_dim: Optional[int] = None, @@ -261,6 +268,7 @@ def __init__( class_embeddings_concat: bool = False, mid_block_only_cross_attention: Optional[bool] = None, cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, ): super().__init__() @@ -311,7 +319,7 @@ def __init__( # time if time_embedding_type == "fourier": - time_embed_dim = block_out_channels[0] * 2 + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 if time_embed_dim % 2 != 0: raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") self.time_proj = GaussianFourierProjection( @@ -319,7 +327,7 @@ def __init__( ) timestep_input_dim = time_embed_dim elif time_embedding_type == "positional": - time_embed_dim = block_out_channels[0] * 4 + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] @@ -370,6 +378,18 @@ def __init__( else: self.class_embedding = None + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") + if time_embedding_act_fn is None: self.time_embed_act = None elif time_embedding_act_fn == "swish": @@ -781,6 +801,10 @@ def forward( else: emb = emb + class_emb + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + emb = emb + aug_emb + if self.time_embed_act is not None: emb = self.time_embed_act(emb) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index c717d722f84c..1b8eca050c9e 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -44,6 +44,7 @@ http_user_agent, ) from .import_utils import ( + BACKENDS_MAPPING, ENV_VARS_TRUE_AND_AUTO_VALUES, ENV_VARS_TRUE_VALUES, USE_JAX, @@ -53,7 +54,9 @@ OptionalDependencyNotAvailable, is_accelerate_available, is_accelerate_version, + is_bs4_available, is_flax_available, + is_ftfy_available, is_inflect_available, is_k_diffusion_available, is_k_diffusion_version, @@ -76,7 +79,7 @@ ) from .logging import get_logger from .outputs import BaseOutput -from .pil_utils import PIL_INTERPOLATION +from .pil_utils import PIL_INTERPOLATION, numpy_to_pil, pt_to_pil from .torch_utils import is_compiled_module, randn_tensor diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index bda56d2ae8ae..bf4fe8d87ff9 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -62,6 +62,96 @@ def 
from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class IFImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFImg2ImgSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFInpaintingPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFInpaintingSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index fd7538b1b5e9..2d90cb9747a7 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -271,6 +271,23 @@ _compel_available = False + +_ftfy_available = importlib.util.find_spec("ftfy") is not None +try: + _ftfy_version = importlib_metadata.version("ftfy") + logger.debug(f"Successfully imported ftfy version {_ftfy_version}") +except importlib_metadata.PackageNotFoundError: + _ftfy_available = False + + +_bs4_available = importlib.util.find_spec("bs4") is not None +try: + # importlib metadata under different name + _bs4_version = importlib_metadata.version("beautifulsoup4") + logger.debug(f"Successfully imported beautifulsoup4 version {_bs4_version}") +except importlib_metadata.PackageNotFoundError: + _bs4_available = False + + + def
is_torch_available(): return _torch_available @@ -347,6 +364,14 @@ def is_compel_available(): return _compel_available +def is_ftfy_available(): + return _ftfy_available + + +def is_bs4_available(): + return _bs4_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -437,8 +462,23 @@ def is_compel_available(): {0} requires the compel library but it was not found in your environment. You can install it with pip: `pip install compel` """ +# docstyle-ignore +BS4_IMPORT_ERROR = """ +{0} requires the Beautiful Soup library but it was not found in your environment. You can install it with pip: +`pip install beautifulsoup4`. Please note that you may need to restart your runtime after installation. +""" + +# docstyle-ignore +FTFY_IMPORT_ERROR = """ +{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the +installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones +that match your environment. Please note that you may need to restart your runtime after installation. +""" + + BACKENDS_MAPPING = OrderedDict( [ + ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), ("flax", (is_flax_available, FLAX_IMPORT_ERROR)), ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), ("onnx", (is_onnx_available, ONNX_IMPORT_ERROR)), @@ -454,6 +494,7 @@ def is_compel_available(): ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), ("tensorboard", (_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), ("compel", (_compel_available, COMPEL_IMPORT_ERROR)), + ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), ] ) diff --git a/src/diffusers/utils/pil_utils.py b/src/diffusers/utils/pil_utils.py index 39d0a15a4e2f..ad76a32230fb 100644 --- a/src/diffusers/utils/pil_utils.py +++ b/src/diffusers/utils/pil_utils.py @@ -1,6 +1,7 @@ import PIL.Image import PIL.ImageOps from packaging import version +from PIL import Image if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): @@ -19,3 +20,26 @@ "lanczos": PIL.Image.LANCZOS, "nearest": PIL.Image.NEAREST, } + + +def pt_to_pil(images): + images = (images / 2 + 0.5).clamp(0, 1) + images = images.cpu().permute(0, 2, 3, 1).float().numpy() + images = numpy_to_pil(images) + return images + + +def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
+ images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images diff --git a/tests/pipelines/deepfloyd_if/__init__.py b/tests/pipelines/deepfloyd_if/__init__.py new file mode 100644 index 000000000000..094254a61875 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/__init__.py @@ -0,0 +1,272 @@ +import tempfile + +import numpy as np +import torch +from transformers import AutoTokenizer, T5EncoderModel + +from diffusers import DDPMScheduler, UNet2DConditionModel +from diffusers.models.attention_processor import AttnAddedKVProcessor +from diffusers.pipelines.deepfloyd_if import IFWatermarker +from diffusers.utils.testing_utils import torch_device + +from ..test_pipelines_common import to_np + + +# WARN: the hf-internal-testing/tiny-random-t5 text encoder has some non-determinism in the `save_load` tests. + + +class IFPipelineTesterMixin: + def _get_dummy_components(self): + torch.manual_seed(0) + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + unet = UNet2DConditionModel( + sample_size=32, + layers_per_block=1, + block_out_channels=[32, 64], + down_block_types=[ + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + ], + mid_block_type="UNetMidBlock2DSimpleCrossAttn", + up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], + in_channels=3, + out_channels=6, + cross_attention_dim=32, + encoder_hid_dim=32, + attention_head_dim=8, + addition_embed_type="text", + addition_embed_type_num_heads=2, + cross_attention_norm="group_norm", + resnet_time_scale_shift="scale_shift", + act_fn="gelu", + ) + unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests + + torch.manual_seed(0) + scheduler = DDPMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + beta_start=0.0001, + beta_end=0.02, + thresholding=True, + dynamic_thresholding_ratio=0.95, + sample_max_value=1.0, + prediction_type="epsilon", + variance_type="learned_range", + ) + + torch.manual_seed(0) + watermarker = IFWatermarker() + + return { + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "watermarker": watermarker, + "safety_checker": None, + "feature_extractor": None, + } + + def _get_superresolution_dummy_components(self): + torch.manual_seed(0) + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + unet = UNet2DConditionModel( + sample_size=32, + layers_per_block=[1, 2], + block_out_channels=[32, 64], + down_block_types=[ + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + ], + mid_block_type="UNetMidBlock2DSimpleCrossAttn", + up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], + in_channels=6, + out_channels=6, + cross_attention_dim=32, + encoder_hid_dim=32, + attention_head_dim=8, + addition_embed_type="text", + addition_embed_type_num_heads=2, + cross_attention_norm="group_norm", + resnet_time_scale_shift="scale_shift", + act_fn="gelu", + class_embed_type="timestep", + mid_block_scale_factor=1.414, + time_embedding_act_fn="gelu", 
+ time_embedding_dim=32, + ) + unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests + + torch.manual_seed(0) + scheduler = DDPMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + beta_start=0.0001, + beta_end=0.02, + thresholding=True, + dynamic_thresholding_ratio=0.95, + sample_max_value=1.0, + prediction_type="epsilon", + variance_type="learned_range", + ) + + torch.manual_seed(0) + image_noising_scheduler = DDPMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + beta_start=0.0001, + beta_end=0.02, + ) + + torch.manual_seed(0) + watermarker = IFWatermarker() + + return { + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "image_noising_scheduler": image_noising_scheduler, + "watermarker": watermarker, + "safety_checker": None, + "feature_extractor": None, + } + + # this test is modified from the base class because if pipelines set the text encoder + # as optional with the intention that the user is allowed to encode the prompt once + # and then pass the embeddings directly to the pipeline. The base class test uses + # the unmodified arguments from `self.get_dummy_inputs` which will pass the unencoded + # prompt to the pipeline when the text encoder is set to None, throwing an error. + # So we make the test reflect the intended usage of setting the text encoder to None. + def _test_save_load_optional_components(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + + prompt = inputs["prompt"] + generator = inputs["generator"] + num_inference_steps = inputs["num_inference_steps"] + output_type = inputs["output_type"] + + if "image" in inputs: + image = inputs["image"] + else: + image = None + + if "mask_image" in inputs: + mask_image = inputs["mask_image"] + else: + mask_image = None + + if "original_image" in inputs: + original_image = inputs["original_image"] + else: + original_image = None + + prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(prompt) + + # inputs with prompt converted to embeddings + inputs = { + "prompt_embeds": prompt_embeds, + "negative_prompt_embeds": negative_prompt_embeds, + "generator": generator, + "num_inference_steps": num_inference_steps, + "output_type": output_type, + } + + if image is not None: + inputs["image"] = image + + if mask_image is not None: + inputs["mask_image"] = mask_image + + if original_image is not None: + inputs["original_image"] = original_image + + # set all optional components to None + for optional_component in pipe._optional_components: + setattr(pipe, optional_component, None) + + output = pipe(**inputs)[0] + + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) + pipe_loaded.to(torch_device) + pipe_loaded.set_progress_bar_config(disable=None) + + pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests + + for optional_component in pipe._optional_components: + self.assertTrue( + getattr(pipe_loaded, optional_component) is None, + f"`{optional_component}` did not stay set to None after loading.", + ) + + inputs = self.get_dummy_inputs(torch_device) + + generator = inputs["generator"] + num_inference_steps = inputs["num_inference_steps"] + output_type = inputs["output_type"] + + # inputs with prompt converted to embeddings + inputs = 
{ + "prompt_embeds": prompt_embeds, + "negative_prompt_embeds": negative_prompt_embeds, + "generator": generator, + "num_inference_steps": num_inference_steps, + "output_type": output_type, + } + + if image is not None: + inputs["image"] = image + + if mask_image is not None: + inputs["mask_image"] = mask_image + + if original_image is not None: + inputs["original_image"] = original_image + + output_loaded = pipe_loaded(**inputs)[0] + + max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() + self.assertLess(max_diff, 1e-4) + + # Modified from `PipelineTesterMixin` to set the attn processor as it's not serialized. + # This should be handled in the base test and then this method can be removed. + def _test_save_load_local(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + output = pipe(**inputs)[0] + + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) + pipe_loaded.to(torch_device) + pipe_loaded.set_progress_bar_config(disable=None) + + pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests + + inputs = self.get_dummy_inputs(torch_device) + output_loaded = pipe_loaded(**inputs)[0] + + max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() + self.assertLess(max_diff, 1e-4) diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py new file mode 100644 index 000000000000..e2204cb601a6 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -0,0 +1,340 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import torch + +from diffusers import ( + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, +) +from diffusers.models.attention_processor import AttnAddedKVProcessor +from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFPipeline + params = TEXT_TO_IMAGE_PARAMS - {"width", "height", "latents"} + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) + + +@slow +@require_torch_gpu +class IFPipelineSlowTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_all(self): + # if + + pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + + pipe_2 = IFSuperResolutionPipeline.from_pretrained( + "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16, text_encoder=None, tokenizer=None + ) + + # pre compute text embeddings and remove T5 to save memory + + pipe_1.text_encoder.to("cuda") + + prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle", device="cuda") + + del pipe_1.tokenizer + del pipe_1.text_encoder + gc.collect() + + pipe_1.tokenizer = None + pipe_1.text_encoder = None + + pipe_1.enable_model_cpu_offload() + pipe_2.enable_model_cpu_offload() + + pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) + + self._test_if(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) + + pipe_1.remove_all_hooks() + pipe_2.remove_all_hooks() + + # img2img + + pipe_1 = IFImg2ImgPipeline(**pipe_1.components) + pipe_2 = IFImg2ImgSuperResolutionPipeline(**pipe_2.components) + + pipe_1.enable_model_cpu_offload() + pipe_2.enable_model_cpu_offload() + + pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) + + self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) + + pipe_1.remove_all_hooks() + pipe_2.remove_all_hooks() + + # inpainting + + pipe_1 = IFInpaintingPipeline(**pipe_1.components) + pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components) + + pipe_1.enable_model_cpu_offload() + pipe_2.enable_model_cpu_offload() + + pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) + pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) + + self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) + + def _test_if(self, 
pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): + # pipeline 1 + + _start_torch_memory_measurement() + + generator = torch.Generator(device="cpu").manual_seed(0) + output = pipe_1( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + num_inference_steps=2, + generator=generator, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (64, 64, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 13 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + # pipeline 2 + + _start_torch_memory_measurement() + + generator = torch.Generator(device="cpu").manual_seed(0) + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + + output = pipe_2( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 4 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): + # pipeline 1 + + _start_torch_memory_measurement() + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + + generator = torch.Generator(device="cpu").manual_seed(0) + + output = pipe_1( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=image, + num_inference_steps=2, + generator=generator, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (64, 64, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 10 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + # pipeline 2 + + _start_torch_memory_measurement() + + generator = torch.Generator(device="cpu").manual_seed(0) + + original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device) + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + + output = pipe_2( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=image, + original_image=original_image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 4 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_img2img_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): + # pipeline 1 + + _start_torch_memory_measurement() + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device) + + generator = torch.Generator(device="cpu").manual_seed(0) + output = pipe_1( + prompt_embeds=prompt_embeds, 
+ negative_prompt_embeds=negative_prompt_embeds, + image=image, + mask_image=mask_image, + num_inference_steps=2, + generator=generator, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (64, 64, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 10 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + # pipeline 2 + + _start_torch_memory_measurement() + + generator = torch.Generator(device="cpu").manual_seed(0) + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device) + original_image = floats_tensor((1, 3, 256, 256), rng=random.Random(0)).to(torch_device) + mask_image = floats_tensor((1, 3, 256, 256), rng=random.Random(1)).to(torch_device) + + output = pipe_2( + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image=image, + mask_image=mask_image, + original_image=original_image, + generator=generator, + num_inference_steps=2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (256, 256, 3) + + mem_bytes = torch.cuda.max_memory_allocated() + assert mem_bytes < 4 * 10**9 + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/if/test_if_inpainting_superresolution_stage_II.npy" + ) + assert_mean_pixel_difference(image, expected_image) + + +def _start_torch_memory_measurement(): + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py new file mode 100644 index 000000000000..b4c99a8ab93a --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFImg2ImgPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFImg2ImgPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_float16_inference(self): + self._test_float16_inference(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py new file mode 100644 index 000000000000..626ab321f895 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFImg2ImgSuperResolutionPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFImg2ImgSuperResolutionPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"original_image"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_superresolution_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + original_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = floats_tensor((1, 3, 16, 16), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "original_image": original_image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py new file mode 100644 index 000000000000..37d818c7a910 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -0,0 +1,82 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFInpaintingPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFInpaintingPipeline + params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + mask_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "mask_image": mask_image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py new file mode 100644 index 000000000000..30062cb2f8d0 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFInpaintingSuperResolutionPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFInpaintingSuperResolutionPipeline + params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union({"original_image"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_superresolution_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 16, 16), rng=random.Random(seed)).to(device) + original_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + mask_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "original_image": original_image, + "mask_image": mask_image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py new file mode 100644 index 000000000000..14acfa5415c2 --- /dev/null +++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import torch + +from diffusers import IFSuperResolutionPipeline +from diffusers.utils import floats_tensor +from diffusers.utils.testing_utils import skip_mps, torch_device + +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin +from . 
import IFPipelineTesterMixin + + +@skip_mps +class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): + pipeline_class = IFSuperResolutionPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + + test_xformers_attention = False + + def get_dummy_components(self): + return self._get_superresolution_dummy_components() + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "output_type": "numpy", + } + + return inputs + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") + def test_save_load_float16(self): + # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder + self._test_save_load_float16(expected_max_diff=1e-1) + + def test_attention_slicing_forward_pass(self): + self._test_attention_slicing_forward_pass(expected_max_diff=1e-2) + + def test_save_load_local(self): + self._test_save_load_local() + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 8fb79f0c4057..168ff8106c52 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -575,6 +575,19 @@ def test_text_inversion_download(self): out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) + def test_download_ignore_files(self): + # Check https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files/blob/72f58636e5508a218c6b3f60550dc96445547817/model_index.json#L4 + with tempfile.TemporaryDirectory() as tmpdirname: + # pipeline has Flax weights + tmpdirname = DiffusionPipeline.download("hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files") + all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] + files = [item for sublist in all_root_files for item in sublist] + + # None of the downloaded files should be a pytorch file even if we have some here: + # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack + assert not any(f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"] for f in files) + assert len(files) == 14 + class CustomPipelineTests(unittest.TestCase): def test_load_custom_pipeline(self): diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index d0712bdec8f6..0278092282ba 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -339,6 +339,9 @@ def test_components_function(self): @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") def test_float16_inference(self): + self._test_float16_inference() + + def _test_float16_inference(self, expected_max_diff=1e-2): components = self.get_dummy_components() pipe = self.pipeline_class(**components) 
pipe.to(torch_device) @@ -352,10 +355,13 @@ def test_float16_inference(self): output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0] max_diff = np.abs(to_np(output) - to_np(output_fp16)).max() - self.assertLess(max_diff, 1e-2, "The outputs of the fp16 and fp32 pipelines are too different.") + self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.") @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") def test_save_load_float16(self): + self._test_save_load_float16() + + def _test_save_load_float16(self, expected_max_diff=1e-2): components = self.get_dummy_components() for name, module in components.items(): if hasattr(module, "half"): @@ -384,7 +390,9 @@ def test_save_load_float16(self): output_loaded = pipe_loaded(**inputs)[0] max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-2, "The output of the fp16 pipeline changed after saving and loading.") + self.assertLess( + max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading." + ) def test_save_load_optional_components(self): if not hasattr(self.pipeline_class, "_optional_components"): From da2ce1a6b92f48cabe9e9d3944c4ee8b007b2871 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 26 Apr 2023 10:34:34 +0200 Subject: [PATCH 62/71] Allow return pt x4 (#3236) * Add all files * update --- .../pipeline_stable_diffusion_upscale.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 45b26de284af..14e5c4ab7cd1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -697,15 +697,11 @@ def __call__( # 10. Post-processing # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) - image = self.decode_latents(latents.float()) - - # Offload last model to CPU - if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: - self.final_offload_hook.offload() # 11. Convert to PIL # has_nsfw_concept = False if output_type == "pil": + image = self.decode_latents(latents.float()) image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) image = self.numpy_to_pil(image) @@ -713,9 +709,18 @@ def __call__( # 11. 
Apply watermark if self.watermarker is not None: image = self.watermarker.apply_watermark(image) + elif output_type == "pt": + latents = 1 / self.vae.config.scaling_factor * latents.float() + image = self.vae.decode(latents).sample + has_nsfw_concept = None else: + image = self.decode_latents(latents.float()) has_nsfw_concept = None + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + if not return_dict: return (image, has_nsfw_concept) From abbf3c1adf6bbf424ae82e640647ad4078e1a4b9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 26 Apr 2023 12:16:06 +0200 Subject: [PATCH 63/71] Allow fp16 attn for x4 upscaler (#3239) * Add all files * update * Make sure vae is memory efficient for PT 1 * make style --- src/diffusers/models/vae.py | 3 +++ .../pipeline_stable_diffusion_upscale.py | 21 +++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index b4484823ac3d..400c3030af90 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -212,6 +212,7 @@ def forward(self, z): sample = z sample = self.conv_in(sample) + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype if self.training and self.gradient_checkpointing: def create_custom_forward(module): @@ -222,6 +223,7 @@ def custom_forward(*inputs): # middle sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) + sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: @@ -229,6 +231,7 @@ def custom_forward(*inputs): else: # middle sample = self.mid_block(sample) + sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 14e5c4ab7cd1..87014f52dfc2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -18,6 +18,7 @@ import numpy as np import PIL import torch +import torch.nn.functional as F from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin @@ -698,10 +699,22 @@ def __call__( # make sure the VAE is in float32 mode, as it overflows in float16 self.vae.to(dtype=torch.float32) + # TODO(Patrick, William) - clean up when attention is refactored + use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") + use_xformers = self.vae.decoder.mid_block.attentions[0]._use_memory_efficient_attention_xformers + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if not use_torch_2_0_attn and not use_xformers: + self.vae.post_quant_conv.to(latents.dtype) + self.vae.decoder.conv_in.to(latents.dtype) + self.vae.decoder.mid_block.to(latents.dtype) + else: + latents = latents.float() + # 11. 
Convert to PIL - # has_nsfw_concept = False if output_type == "pil": - image = self.decode_latents(latents.float()) + image = self.decode_latents(latents) + image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) image = self.numpy_to_pil(image) @@ -710,11 +723,11 @@ def __call__( if self.watermarker is not None: image = self.watermarker.apply_watermark(image) elif output_type == "pt": - latents = 1 / self.vae.config.scaling_factor * latents.float() + latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample has_nsfw_concept = None else: - image = self.decode_latents(latents.float()) + image = self.decode_latents(latents) has_nsfw_concept = None # Offload last model to CPU From 744663f8dc110c03e10157e92175b8187cf64d59 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 26 Apr 2023 12:44:19 +0200 Subject: [PATCH 64/71] fix fast test (#3241) --- tests/pipelines/unclip/test_unclip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index d2c699ea501d..5c9181c08e3f 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -358,7 +358,7 @@ class DummyScheduler: def test_attention_slicing_forward_pass(self): test_max_difference = torch_device == "cpu" - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) + self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01) # Overriding PipelineTesterMixin::test_inference_batch_single_identical # because UnCLIP undeterminism requires a looser check. From 977162c02b753d088433ec1634e448df8741fb7c Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 26 Apr 2023 16:25:48 +0530 Subject: [PATCH 65/71] Adds a document on token merging (#3208) * add document on token merging. * fix headline. * fix: headline. * add some samples for comparison. --- docs/source/en/_toctree.yml | 2 + docs/source/en/optimization/tome.mdx | 116 +++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 docs/source/en/optimization/tome.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index cc880f3e0b81..ccaaff7ca680 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -105,6 +105,8 @@ title: MPS - local: optimization/habana title: Habana Gaudi + - local: optimization/tome + title: Token Merging title: Optimization/Special Hardware - sections: - local: conceptual/philosophy diff --git a/docs/source/en/optimization/tome.mdx b/docs/source/en/optimization/tome.mdx new file mode 100644 index 000000000000..c2158f539a65 --- /dev/null +++ b/docs/source/en/optimization/tome.mdx @@ -0,0 +1,116 @@ + + +# Token Merging + +Token Merging (introduced in [Token Merging: Your ViT But Faster](https://arxiv.org/abs/2210.09461)) works by merging the redundant tokens / patches progressively in the forward pass of a Transformer-based network. It can speed up the inference latency of the underlying network. + +After Token Merging (ToMe) was released, the authors released [Token Merging for Fast Stable Diffusion](https://arxiv.org/abs/2303.17604), which introduced a version of ToMe which is more compatible with Stable Diffusion. We can use ToMe to gracefully speed up the inference latency of a [`DiffusionPipeline`]. 
This doc discusses how to apply ToMe to the [`StableDiffusionPipeline`], the expected speedups, and the qualitative aspects of using ToMe on the [`StableDiffusionPipeline`]. + +## Using ToMe + +The authors of ToMe released a convenient Python library called [`tomesd`](https://github.com/dbolya/tomesd) that lets us apply ToMe to a [`DiffusionPipeline`] like so: + +```diff +from diffusers import StableDiffusionPipeline +import tomesd + +pipeline = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 +).to("cuda") ++ tomesd.apply_patch(pipeline, ratio=0.5) + +image = pipeline("a photo of an astronaut riding a horse on mars").images[0] +``` + +And that’s it! + +`tomesd.apply_patch()` exposes [a number of arguments](https://github.com/dbolya/tomesd#usage) to let us strike a balance between the pipeline inference speed and the quality of the generated tokens. Amongst those arguments, the most important one is `ratio`. `ratio` controls the number of tokens that will be merged during the forward pass. For more details on `tomesd`, please refer to the original repository https://github.com/dbolya/tomesd and [the paper](https://arxiv.org/abs/2303.17604). + +## Benchmarking `tomesd` with `StableDiffusionPipeline` + +We benchmarked the impact of using `tomesd` on [`StableDiffusionPipeline`] along with [xformers](https://huggingface.co/docs/diffusers/optimization/xformers) across different image resolutions. We used A100 and V100 as our test GPU devices with the following development environment (with Python 3.8.5): + +```bash +- `diffusers` version: 0.15.1 +- Python version: 3.8.16 +- PyTorch version (GPU?): 1.13.1+cu116 (True) +- Huggingface_hub version: 0.13.2 +- Transformers version: 4.27.2 +- Accelerate version: 0.18.0 +- xFormers version: 0.0.16 +- tomesd version: 0.1.2 +``` + +We used this script for benchmarking: [https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335](https://gist.github.com/sayakpaul/27aec6bca7eb7b0e0aa4112205850335). Following are our findings: + +### A100 + +| Resolution | Batch size | Vanilla | ToMe | ToMe + xFormers | ToMe speedup (%) | ToMe + xFormers speedup (%) | +| --- | --- | --- | --- | --- | --- | --- | +| 512 | 10 | 6.88 | 5.26 | 4.69 | 23.54651163 | 31.83139535 | +| | | | | | | | +| 768 | 10 | OOM | 14.71 | 11 | | | +| | 8 | OOM | 11.56 | 8.84 | | | +| | 4 | OOM | 5.98 | 4.66 | | | +| | 2 | 4.99 | 3.24 | 3.1 | 35.07014028 | 37.8757515 | +| | 1 | 3.29 | 2.24 | 2.03 | 31.91489362 | 38.29787234 | +| | | | | | | | +| 1024 | 10 | OOM | OOM | OOM | | | +| | 8 | OOM | OOM | OOM | | | +| | 4 | OOM | 12.51 | 9.09 | | | +| | 2 | OOM | 6.52 | 4.96 | | | +| | 1 | 6.4 | 3.61 | 2.81 | 43.59375 | 56.09375 | + +***The timings reported here are in seconds. 
Speedups are calculated over the `Vanilla` timings.*** + +### V100 + +| Resolution | Batch size | Vanilla | ToMe | ToMe + xFormers | ToMe speedup (%) | ToMe + xFormers speedup (%) | +| --- | --- | --- | --- | --- | --- | --- | +| 512 | 10 | OOM | 10.03 | 9.29 | | | +| | 8 | OOM | 8.05 | 7.47 | | | +| | 4 | 5.7 | 4.3 | 3.98 | 24.56140351 | 30.1754386 | +| | 2 | 3.14 | 2.43 | 2.27 | 22.61146497 | 27.70700637 | +| | 1 | 1.88 | 1.57 | 1.57 | 16.4893617 | 16.4893617 | +| | | | | | | | +| 768 | 10 | OOM | OOM | 23.67 | | | +| | 8 | OOM | OOM | 18.81 | | | +| | 4 | OOM | 11.81 | 9.7 | | | +| | 2 | OOM | 6.27 | 5.2 | | | +| | 1 | 5.43 | 3.38 | 2.82 | 37.75322284 | 48.06629834 | +| | | | | | | | +| 1024 | 10 | OOM | OOM | OOM | | | +| | 8 | OOM | OOM | OOM | | | +| | 4 | OOM | OOM | 19.35 | | | +| | 2 | OOM | 13 | 10.78 | | | +| | 1 | OOM | 6.66 | 5.54 | | | + +As seen in the tables above, the speedup with `tomesd` becomes more pronounced for larger image resolutions. It is also interesting to note that with `tomesd`, it becomes possible to run the pipeline on a higher resolution, like 1024x1024. + +It might be possible to speed up inference even further with [`torch.compile()`](https://huggingface.co/docs/diffusers/optimization/torch2.0). + +## Quality + +As reported in [the paper](https://arxiv.org/abs/2303.17604), ToMe can preserve the quality of the generated images to a great extent while speeding up inference. By increasing the `ratio`, it is possible to further speed up inference, but that might come at the cost of a deterioration in the image quality. + +To test the quality of the generated samples using our setup, we sampled a few prompts from the “Parti Prompts” (introduced in [Parti](https://parti.research.google/)) and performed inference with the [`StableDiffusionPipeline`] in the following settings: + +- Vanilla [`StableDiffusionPipeline`] +- [`StableDiffusionPipeline`] + ToMe +- [`StableDiffusionPipeline`] + ToMe + xformers + +We didn’t notice any significant decrease in the quality of the generated samples. Here are samples: + +![tome-samples](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/tome/tome_samples.png) + +You can check out the generated samples [here](https://wandb.ai/sayakpaul/tomesd-results/runs/23j4bj3i?workspace=). We used [this script](https://gist.github.com/sayakpaul/8cac98d7f22399085a060992f411ecbd) for conducting this experiment. 
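If you want to run a quick side-by-side check of your own, below is a minimal sketch of the three settings compared above. The checkpoint, prompt, `ratio`, and seed are illustrative placeholders rather than the exact configuration we used (that lives in the linked script):

```python
import torch
import tomesd
from diffusers import StableDiffusionPipeline

prompt = "a photo of an astronaut riding a horse on mars"  # placeholder prompt
images = {}

for setting in ["vanilla", "tome", "tome_xformers"]:
    # Reload the pipeline so every run starts from an unpatched model.
    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")
    if setting != "vanilla":
        tomesd.apply_patch(pipe, ratio=0.5)
    if setting == "tome_xformers":
        pipe.enable_xformers_memory_efficient_attention()  # requires xformers to be installed

    # Fix the seed so differences come from the setting, not from the sampled noise.
    generator = torch.Generator(device="cuda").manual_seed(0)
    images[setting] = pipe(prompt, generator=generator).images[0]
    del pipe
    torch.cuda.empty_cache()

for name, image in images.items():
    image.save(f"{name}.png")
```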
\ No newline at end of file From 46ceba5b350bbf3d9272e9614f17e5edbeb0e1ef Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Wed, 26 Apr 2023 11:33:08 +0000 Subject: [PATCH 66/71] [AudioLDM] Update docs to use updated ckpt (#3240) * [AudioLDM] Update docs to use updated ckpt * make style --- docs/source/en/api/pipelines/audioldm.mdx | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/source/en/api/pipelines/audioldm.mdx b/docs/source/en/api/pipelines/audioldm.mdx index f3987d2263ac..25a5bb8bce13 100644 --- a/docs/source/en/api/pipelines/audioldm.mdx +++ b/docs/source/en/api/pipelines/audioldm.mdx @@ -25,14 +25,14 @@ This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit ## Text-to-Audio -The [`AudioLDMPipeline`] can be used to load pre-trained weights from [cvssp/audioldm](https://huggingface.co/cvssp/audioldm) and generate text-conditional audio outputs: +The [`AudioLDMPipeline`] can be used to load pre-trained weights from [cvssp/audioldm-s-full-v2](https://huggingface.co/cvssp/audioldm-s-full-v2) and generate text-conditional audio outputs: ```python from diffusers import AudioLDMPipeline import torch import scipy -repo_id = "cvssp/audioldm" +repo_id = "cvssp/audioldm-s-full-v2" pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16) pipe = pipe.to("cuda") @@ -56,7 +56,7 @@ Inference: ### How to load and use different schedulers The AudioLDM pipeline uses [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers -that can be used with the AudioLDM pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], +that can be used with the AudioLDM pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc. We recommend using the [`DPMSolverMultistepScheduler`] as it's currently the fastest scheduler there is. @@ -68,12 +68,14 @@ method, or pass the `scheduler` argument to the `from_pretrained` method of the >>> from diffusers import AudioLDMPipeline, DPMSolverMultistepScheduler >>> import torch ->>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm", torch_dtype=torch.float16) +>>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm-s-full-v2", torch_dtype=torch.float16) >>> pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) >>> # or ->>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained("cvssp/audioldm", subfolder="scheduler") ->>> pipeline = AudioLDMPipeline.from_pretrained("cvssp/audioldm", scheduler=dpm_scheduler, torch_dtype=torch.float16) +>>> dpm_scheduler = DPMSolverMultistepScheduler.from_pretrained("cvssp/audioldm-s-full-v2", subfolder="scheduler") +>>> pipeline = AudioLDMPipeline.from_pretrained( +... "cvssp/audioldm-s-full-v2", scheduler=dpm_scheduler, torch_dtype=torch.float16 +... 
) ``` ## AudioLDMPipeline From 6ba0efb9a188b08f5b46565a87c0b3da7ff46af4 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 26 Apr 2023 13:35:01 +0200 Subject: [PATCH 67/71] Release: v0.16.0 --- examples/controlnet/train_controlnet.py | 2 +- examples/controlnet/train_controlnet_flax.py | 2 +- examples/custom_diffusion/train_custom_diffusion.py | 2 +- examples/dreambooth/train_dreambooth.py | 2 +- examples/dreambooth/train_dreambooth_flax.py | 2 +- examples/dreambooth/train_dreambooth_lora.py | 2 +- examples/instruct_pix2pix/train_instruct_pix2pix.py | 2 +- examples/text_to_image/train_text_to_image.py | 2 +- examples/text_to_image/train_text_to_image_flax.py | 2 +- examples/text_to_image/train_text_to_image_lora.py | 2 +- examples/textual_inversion/textual_inversion.py | 2 +- examples/textual_inversion/textual_inversion_flax.py | 2 +- examples/unconditional_image_generation/train_unconditional.py | 2 +- setup.py | 2 +- src/diffusers/__init__.py | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index d52e610ca52d..9b9ba5ab737f 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -55,7 +55,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index b25f9325403f..aff361cb6e01 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -59,7 +59,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = logging.getLogger(__name__) diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py index 49b05e6b5db3..0954f3d6e789 100644 --- a/examples/custom_diffusion/train_custom_diffusion.py +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -56,7 +56,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.15.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 4f731aa1f776..a9449002ca80 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -56,7 +56,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py index 8583f64c6fbd..1a4ca9153c80 100644 --- a/examples/dreambooth/train_dreambooth_flax.py +++ b/examples/dreambooth/train_dreambooth_flax.py @@ -36,7 +36,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") # Cache compiled models across invocations of this script. 
cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache")) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 367a3422de33..805a8d1eea4d 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -55,7 +55,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index 155c370614dc..dc5a1c3081c0 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -51,7 +51,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 67724698c099..1d6db2a6f1da 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -50,7 +50,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index d44731896c1d..c5dc71f0536e 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -33,7 +33,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = logging.getLogger(__name__) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 8dfd96904bd0..39bdb4e59a52 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -47,7 +47,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index d7be58bdb9ba..824759cc4ca9 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -77,7 +77,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__) diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py index 1d77753791f9..19553ceb92ec 100644 --- a/examples/textual_inversion/textual_inversion_flax.py +++ b/examples/textual_inversion/textual_inversion_flax.py @@ -56,7 +56,7 @@ # ------------------------------------------------------------------------------ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = logging.getLogger(__name__) diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index c004acc2d850..836a38f96286 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -28,7 +28,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.16.0.dev0") +check_min_version("0.16.0") logger = get_logger(__name__, log_level="INFO") diff --git a/setup.py b/setup.py index 19cc1dca73bb..ea98b5d10277 100644 --- a/setup.py +++ b/setup.py @@ -226,7 +226,7 @@ def run(self): setup( name="diffusers", - version="0.16.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="0.16.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="Diffusers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index e9d12bdb7cca..d4dbf1145072 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.16.0.dev0" +__version__ = "0.16.0" from .configuration_utils import ConfigMixin from .utils import ( From 9c876a5915fe6621a3d21d7d9146f58be0a8610e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?apolin=C3=A1rio?= Date: Thu, 27 Apr 2023 15:26:58 +0200 Subject: [PATCH 68/71] merge conflict --- docs/source/en/api/pipelines/if.mdx | 34 +++++++++---------- .../pipelines/deepfloyd_if/pipeline_if.py | 2 +- .../deepfloyd_if/pipeline_if_img2img.py | 2 +- .../pipeline_if_img2img_superresolution.py | 2 +- .../deepfloyd_if/pipeline_if_inpainting.py | 2 +- .../pipeline_if_inpainting_superresolution.py | 2 +- .../pipeline_if_superresolution.py | 2 +- tests/pipelines/deepfloyd_if/test_if.py | 2 +- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/source/en/api/pipelines/if.mdx b/docs/source/en/api/pipelines/if.mdx index 5d3b292587f6..d79c7035fb75 100644 --- a/docs/source/en/api/pipelines/if.mdx +++ b/docs/source/en/api/pipelines/if.mdx @@ -28,8 +28,8 @@ Our work underscores the potential of larger UNet architectures in the first sta ## Usage Before you can use IF, you need to accept its usage conditions. To do so: -1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be loggin in -2. Accept the license on the model card of [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0) and [DeepFloyd/IF-II-L-v1.0](https://huggingface.co/DeepFloyd/IF-II-L-v1.0) +1. Make sure to have a [Hugging Face account](https://huggingface.co/join) and be logged in +2. Accept the license on the model card of [DeepFloyd/IF-I-XL-v1.0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0). Accepting the license on the stage I model card will auto accept for the other IF models. 3. Make sure to login locally. Install `huggingface_hub` ```sh pip install huggingface_hub --upgrade @@ -62,7 +62,7 @@ The following sections give more in-detail examples of how to use IF. 
Specifical **Available checkpoints** - *Stage-1* - - [DeepFloyd/IF-I-IF-v1.0](https://huggingface.co/DeepFloyd/IF-I-IF-v1.0) + - [DeepFloyd/IF-I-XL-v1.0](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0) - [DeepFloyd/IF-I-L-v1.0](https://huggingface.co/DeepFloyd/IF-I-L-v1.0) - [DeepFloyd/IF-I-M-v1.0](https://huggingface.co/DeepFloyd/IF-I-M-v1.0) @@ -90,7 +90,7 @@ from diffusers.utils import pt_to_pil import torch # stage 1 -stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) stage_1.enable_model_cpu_offload() # stage 2 @@ -162,7 +162,7 @@ original_image = Image.open(BytesIO(response.content)).convert("RGB") original_image = original_image.resize((768, 512)) # stage 1 -stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) stage_1.enable_model_cpu_offload() # stage 2 @@ -244,7 +244,7 @@ mask_image = Image.open(BytesIO(response.content)) mask_image = mask_image # stage 1 -stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) stage_1.enable_model_cpu_offload() # stage 2 @@ -305,7 +305,7 @@ In addition to being loaded with `from_pretrained`, Pipelines can also be loaded ```python from diffusers import IFPipeline, IFSuperResolutionPipeline -pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0") +pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0") pipe_2 = IFSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0") @@ -326,7 +326,7 @@ pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components) The simplest optimization to run IF faster is to move all model components to the GPU. ```py -pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.to("cuda") ``` @@ -352,7 +352,7 @@ the input image which also determines how many steps to run in the denoising pro A smaller number will vary the image less but run faster. ```py -pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.to("cuda") image = pipe(image=image, prompt="", strength=0.3).images @@ -364,7 +364,7 @@ with IF and it might not give expected results. 
```py import torch -pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.to("cuda") pipe.text_encoder = torch.compile(pipe.text_encoder) @@ -378,14 +378,14 @@ When optimizing for GPU memory, we can use the standard diffusers cpu offloading Either the model based CPU offloading, ```py -pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.enable_model_cpu_offload() ``` or the more aggressive layer based CPU offloading. ```py -pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe.enable_sequential_cpu_offload() ``` @@ -395,13 +395,13 @@ Additionally, T5 can be loaded in 8bit precision from transformers import T5EncoderModel text_encoder = T5EncoderModel.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" + "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" ) from diffusers import DiffusionPipeline pipe = DiffusionPipeline.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", + "DeepFloyd/IF-I-XL-v1.0", text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder unet=None, device_map="auto", @@ -422,13 +422,13 @@ from transformers import T5EncoderModel from diffusers.utils import pt_to_pil text_encoder = T5EncoderModel.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" + "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit" ) # text to image pipe = DiffusionPipeline.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", + "DeepFloyd/IF-I-XL-v1.0", text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder unet=None, device_map="auto", @@ -444,7 +444,7 @@ gc.collect() torch.cuda.empty_cache() pipe = IFPipeline.from_pretrained( - "DeepFloyd/IF-I-IF-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto" + "DeepFloyd/IF-I-XL-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto" ) generator = torch.Generator().manual_seed(0) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py index a76e51a3ffe9..479ffa9e6635 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -41,7 +41,7 @@ >>> from diffusers.utils import pt_to_pil >>> import torch - >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) >>> pipe.enable_model_cpu_offload() >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index a31748450d4b..fac4adeea463 100644 --- 
a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -70,7 +70,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: >>> original_image = original_image.resize((768, 512)) >>> pipe = IFImg2ImgPipeline.from_pretrained( - ... "DeepFloyd/IF-I-IF-v1.0", + ... "DeepFloyd/IF-I-XL-v1.0", ... variant="fp16", ... torch_dtype=torch.float16, ... ) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 21e280654cf5..eed1bb43e5d8 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -73,7 +73,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: >>> original_image = original_image.resize((768, 512)) >>> pipe = IFImg2ImgPipeline.from_pretrained( - ... "DeepFloyd/IF-I-IF-v1.0", + ... "DeepFloyd/IF-I-XL-v1.0", ... variant="fp16", ... torch_dtype=torch.float16, ... ) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index 95eba1cc7d24..d3651f5169c1 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -76,7 +76,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: >>> mask_image = mask_image >>> pipe = IFInpaintingPipeline.from_pretrained( - ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16 + ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16 ... ) >>> pipe.enable_model_cpu_offload() diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index 4eb0bf300fa5..5ea6a47082ae 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -78,7 +78,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: >>> mask_image = mask_image >>> pipe = IFInpaintingPipeline.from_pretrained( - ... "DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16 + ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16 ... 
) >>> pipe.enable_model_cpu_offload() diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index bb1d4ee4ba66..a62a51b0972f 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -45,7 +45,7 @@ >>> from diffusers.utils import pt_to_pil >>> import torch - >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) >>> pipe.enable_model_cpu_offload() >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py index e2204cb601a6..bf01c2350d22 100644 --- a/tests/pipelines/deepfloyd_if/test_if.py +++ b/tests/pipelines/deepfloyd_if/test_if.py @@ -94,7 +94,7 @@ def tearDown(self): def test_all(self): # if - pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-IF-v1.0", variant="fp16", torch_dtype=torch.float16) + pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) pipe_2 = IFSuperResolutionPipeline.from_pretrained( "DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16, text_encoder=None, tokenizer=None From 4c476e99b5cf1cf5d0c84b8f96730f603e6f35cf Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 27 Apr 2023 18:12:08 +0200 Subject: [PATCH 69/71] Fix community pipelines (#3266) --- src/diffusers/utils/dynamic_modules_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index 1951c4fa2623..aa6c9c657a87 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -267,7 +267,7 @@ def get_cached_module_file( # retrieve github version that matches if revision is None: - revision = latest_version if latest_version in available_versions else "main" + revision = latest_version if latest_version[1:] in available_versions else "main" logger.info(f"Defaulting to latest_version: {revision}.") elif revision in available_versions: revision = f"v{revision}" From 23159f4adbbb41eba8c5af0b667de4a31e366500 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 28 Apr 2023 13:31:11 +0200 Subject: [PATCH 70/71] Allow disabling torch 2_0 attention (#3273) * Allow disabling torch 2_0 attention * make style * Update src/diffusers/models/attention.py --- src/diffusers/models/attention.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 8e537c6f3680..fb5f6f48b324 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -71,6 +71,7 @@ def __init__( self.proj_attn = nn.Linear(channels, channels, bias=True) self._use_memory_efficient_attention_xformers = False + self._use_2_0_attn = True self._attention_op = None def reshape_heads_to_batch_dim(self, tensor, merge_head_and_batch=True): @@ -142,9 +143,8 @@ def forward(self, hidden_states): scale = 1 / math.sqrt(self.channels / self.num_heads) - use_torch_2_0_attn = ( - hasattr(F, "scaled_dot_product_attention") and not 
self._use_memory_efficient_attention_xformers - ) + _use_2_0_attn = self._use_2_0_attn and not self._use_memory_efficient_attention_xformers + use_torch_2_0_attn = hasattr(F, "scaled_dot_product_attention") and _use_2_0_attn query_proj = self.reshape_heads_to_batch_dim(query_proj, merge_head_and_batch=not use_torch_2_0_attn) key_proj = self.reshape_heads_to_batch_dim(key_proj, merge_head_and_batch=not use_torch_2_0_attn) From 9b14ce397e53fc5f5b909b07b6e992a2afe8e3af Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 28 Apr 2023 14:03:50 +0200 Subject: [PATCH 71/71] Release: v0.16.1 --- setup.py | 2 +- src/diffusers/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ea98b5d10277..5c26b246aa01 100644 --- a/setup.py +++ b/setup.py @@ -226,7 +226,7 @@ def run(self): setup( name="diffusers", - version="0.16.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="0.16.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="Diffusers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d4dbf1145072..bb7381d65a54 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.16.0" +__version__ = "0.16.1" from .configuration_utils import ConfigMixin from .utils import (