From d9af6acc600744872559a0622cec7323a6db2db1 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Fri, 7 Apr 2023 00:27:12 -0700
Subject: [PATCH 01/34] diffusers==0.14.0 update

---
 examples/custom_diffusion/model_pipeline.py |  314 +++++
 examples/custom_diffusion/train.py          | 1272 +++++++++++++++++++
 2 files changed, 1586 insertions(+)
 create mode 100644 examples/custom_diffusion/model_pipeline.py
 create mode 100644 examples/custom_diffusion/train.py

diff --git a/examples/custom_diffusion/model_pipeline.py b/examples/custom_diffusion/model_pipeline.py
new file mode 100644
index 000000000000..613f2702b09e
--- /dev/null
+++ b/examples/custom_diffusion/model_pipeline.py
@@ -0,0 +1,314 @@
+# This code is built from the Huggingface repository: https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py, and
+# https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py
+
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 Custom Diffusion authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from typing import Callable, Optional
+import torch
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from accelerate.logging import get_logger
+
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.models.cross_attention import CrossAttention
+from diffusers.utils.import_utils import is_xformers_available
+
+if is_xformers_available():
+    import xformers
+    import xformers.ops
+else:
+    xformers = None  # keep the module-level name defined when xformers is not installed
+
+logger = get_logger(__name__)
+
+
+def set_use_memory_efficient_attention_xformers(
+    self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
+):
+    """Pick the attention processor for this layer: xformers-backed when requested, plain otherwise.
+
+    Written as a free function taking `self` so it can be bound onto an existing attention layer.
+    Raises NotImplementedError, ModuleNotFoundError or ValueError when xformers cannot be used.
+    """
+    if not use_memory_efficient_attention_xformers:
+        self.set_processor(CustomDiffusionAttnProcessor())
+        return
+
+    if self.added_kv_proj_dim is not None:
+        # TODO(Anton, Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
+        # which uses this type of cross attention ONLY because the attention mask of format
+        # [0, ..., -10.000, ..., 0, ...,] is not supported
+        raise NotImplementedError(
+            "Memory efficient attention with `xformers` is currently not supported when"
+            " `self.added_kv_proj_dim` is defined."
+        )
+    if not is_xformers_available():
+        raise ModuleNotFoundError(
+            "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+            " xformers",
+            name="xformers",
+        )
+    if not torch.cuda.is_available():
+        raise ValueError(
+            "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
+            " only available for GPU "
+        )
+
+    # Smoke test: make sure we can run the memory efficient attention; any failure propagates as-is.
+    probe_shape = (1, 2, 40)
+    xformers.ops.memory_efficient_attention(
+        torch.randn(probe_shape, device="cuda"),
+        torch.randn(probe_shape, device="cuda"),
+        torch.randn(probe_shape, device="cuda"),
+    )
+
+    self.set_processor(CustomDiffusionXFormersAttnProcessor(attention_op=attention_op))
+
+
+class CustomDiffusionAttnProcessor:
+    """Attention processor for Custom Diffusion that uses plain (non-xformers) attention.
+
+    In cross-attention, the key/value at sequence position 0 are replaced by detached copies, so no
+    gradient flows through that position.
+    """
+
+    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        query = attn.to_q(hidden_states)
+
+        # Cross-attention iff a separate conditioning tensor was supplied; otherwise attend to ourselves.
+        is_cross = encoder_hidden_states is not None
+        if not is_cross:
+            encoder_hidden_states = hidden_states
+        elif attn.cross_attention_norm:
+            encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        if is_cross:
+            # keep is 0 at sequence position 0 and 1 elsewhere; position 0 receives the detached copy.
+            keep = torch.ones_like(key)
+            keep[:, :1, :] = keep[:, :1, :]*0.
+            key = keep*key + (1-keep)*key.detach()
+            value = keep*value + (1-keep)*value.detach()
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
+class CustomDiffusionXFormersAttnProcessor:
+    """Custom Diffusion attention processor backed by xformers memory-efficient attention.
+
+    `attention_op` is forwarded as the `op` argument of `xformers.ops.memory_efficient_attention`.
+    """
+
+    def __init__(self, attention_op: Optional[Callable] = None):
+        self.attention_op = attention_op
+
+    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        query = attn.to_q(hidden_states)
+
+        is_cross = encoder_hidden_states is not None
+        if not is_cross:
+            encoder_hidden_states = hidden_states
+        elif attn.cross_attention_norm:
+            encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        if is_cross:
+            # keep is 0 at sequence position 0 and 1 elsewhere; position 0 receives the detached copy.
+            keep = torch.ones_like(key)
+            keep[:, :1, :] = keep[:, :1, :]*0.
+            key = keep*key + (1-keep)*key.detach()
+            value = keep*value + (1-keep)*value.detach()
+
+        query = attn.head_to_batch_dim(query).contiguous()
+        key = attn.head_to_batch_dim(key).contiguous()
+        value = attn.head_to_batch_dim(value).contiguous()
+
+        hidden_states = xformers.ops.memory_efficient_attention(
+            query, key, value, attn_bias=attention_mask, op=self.attention_op
+        )
+        hidden_states = hidden_states.to(query.dtype)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj (to_out[0]) followed by dropout (to_out[1])
+        hidden_states = attn.to_out[1](attn.to_out[0](hidden_states))
+        return hidden_states
+
+
+class CustomDiffusionPipeline(StableDiffusionPipeline):
+    r"""
+    Pipeline for custom diffusion model.
+
+    This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.).
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+        modifier_token (`list`, *optional*): new modifier tokens added or to be added to the tokenizer
+        modifier_token_id (`list`, *optional*): token ids of the modifier tokens; `add_token` appends to this list
+    """
+    _optional_components = ["safety_checker", "feature_extractor", "modifier_token"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: SchedulerMixin,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
+        modifier_token: Optional[list] = None,
+        modifier_token_id: Optional[list] = None,
+    ):
+        """Build the Stable Diffusion pipeline, then patch the U-Net attention layers for Custom Diffusion."""
+
+        super().__init__(
+            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
+        )
+
+        # change attn class: rebind the xformers toggle so it installs the Custom Diffusion processors
+        def change_attn(unet):
+            for layer in unet.children():
+                if isinstance(layer, CrossAttention):
+                    bound_method = set_use_memory_efficient_attention_xformers.__get__(layer, layer.__class__)
+                    setattr(layer, 'set_use_memory_efficient_attention_xformers', bound_method)
+                else:
+                    change_attn(layer)
+
+        change_attn(self.unet)
+        self.unet.set_attn_processor(CustomDiffusionAttnProcessor())
+
+        # Default to fresh per-instance lists: `add_token` appends to `modifier_token_id` in place, so a
+        # shared mutable default would leak token ids from one pipeline instance into the next.
+        self.modifier_token = [] if modifier_token is None else modifier_token
+        self.modifier_token_id = [] if modifier_token_id is None else modifier_token_id
+
+    def add_token(self, initializer_token):
+        """Add every modifier token to the tokenizer and seed its embedding from the paired initializer token."""
+        id_pairs = []  # (new modifier token id, initializer token id) for the tokens added by this call
+        for modifier_token_, initializer_token_ in zip(self.modifier_token, initializer_token):
+            # Add the placeholder token in tokenizer
+            num_added_tokens = self.tokenizer.add_tokens(modifier_token_)
+            if num_added_tokens == 0:
+                raise ValueError(
+                    f"The tokenizer already contains the token {modifier_token_}. Please pass a different"
+                    " `modifier_token` that is not already in the tokenizer."
+                )
+            # Convert the initializer_token to ids; it must be a single token, not a sequence of tokens
+            token_ids = self.tokenizer.encode([initializer_token_], add_special_tokens=False)
+            if len(token_ids) > 1:
+                raise ValueError("The initializer token must be a single token.")
+
+            new_id = self.tokenizer.convert_tokens_to_ids(modifier_token_)
+            self.modifier_token_id.append(new_id)
+            id_pairs.append((new_id, token_ids[0]))
+        # Resize the token embeddings as we are adding new special tokens to the tokenizer
+        self.text_encoder.resize_token_embeddings(len(self.tokenizer))
+
+        # Initialise only the tokens added by this call, so ids already in `modifier_token_id` stay aligned
+        token_embeds = self.text_encoder.get_input_embeddings().weight.data
+        for new_id, init_id in id_pairs:
+            token_embeds[new_id] = token_embeds[init_id]
+
+    def save_pretrained(self, save_path, freeze_model="crossattn_kv", save_text_encoder=False, all=False):
+        """Save the whole pipeline when `all` is set, otherwise a delta dict (token embeddings + selected attn2 weights)."""
+        if all:
+            super().save_pretrained(save_path)
+        else:
+            delta_dict = {'unet': {}, 'modifier_token': {}}
+            if self.modifier_token is not None:
+                for i in range(len(self.modifier_token_id)):
+                    learned_embeds = self.text_encoder.get_input_embeddings().weight[self.modifier_token_id[i]]
+                    delta_dict['modifier_token'][self.modifier_token[i]] = learned_embeds.detach().cpu()
+            if save_text_encoder:
+                delta_dict['text_encoder'] = self.text_encoder.state_dict()
+            # Keep only the cross-attention (attn2) weights selected by `freeze_model`
+            for name, params in self.unet.named_parameters():
+                if freeze_model == "crossattn":
+                    if 'attn2' in name:
+                        delta_dict['unet'][name] = params.cpu().clone()
+                elif freeze_model == "crossattn_kv":
+                    if 'attn2.to_k' in name or 'attn2.to_v' in name:
+                        delta_dict['unet'][name] = params.cpu().clone()
+                else:
+                    raise ValueError("freeze_model argument only supports crossattn_kv or crossattn")
+            torch.save(delta_dict, save_path)
+
+    def load_model(self, save_path, compress=False):
+        """Load a delta checkpoint (text encoder, modifier token embeddings, attn2 weights) into this pipeline."""
+        st = torch.load(save_path)  # NOTE(security): torch.load unpickles the file; only load trusted checkpoints
+        if 'text_encoder' in st:
+            self.text_encoder.load_state_dict(st['text_encoder'])
+        if 'modifier_token' in st:
+            modifier_tokens = list(st['modifier_token'].keys())
+            modifier_token_id = []
+            for modifier_token in modifier_tokens:
+                num_added_tokens = self.tokenizer.add_tokens(modifier_token)
+                if num_added_tokens == 0:
+                    raise ValueError(
+                        f"The tokenizer already contains the token {modifier_token}. Please pass a different `modifier_token` that is not already in the tokenizer."
+                    )
+                modifier_token_id.append(self.tokenizer.convert_tokens_to_ids(modifier_token))
+            # Resize the token embeddings as we are adding new special tokens to the tokenizer
+            self.text_encoder.resize_token_embeddings(len(self.tokenizer))
+            token_embeds = self.text_encoder.get_input_embeddings().weight.data
+            for i, id_ in enumerate(modifier_token_id):
+                token_embeds[id_] = st['modifier_token'][modifier_tokens[i]]
+
+        for name, params in self.unet.named_parameters():
+            if 'attn2' in name:
+                if compress and ('to_k' in name or 'to_v' in name):  # entry holds factors 'u' and 'v'; add u @ v
+                    params.data += st['unet'][name]['u']@st['unet'][name]['v']
+                elif name in st['unet']:
+                    params.data.copy_(st['unet'][f'{name}'])
diff --git a/examples/custom_diffusion/train.py b/examples/custom_diffusion/train.py
new file mode 100644
index 000000000000..5d9b8e62d963
--- /dev/null
+++ b/examples/custom_diffusion/train.py
@@ -0,0 +1,1272 @@
+# This code is built from the Huggingface repository: https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py, and
+# https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py
+
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 Custom Diffusion authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import argparse
+import hashlib
+import logging
+import math
+import os
+import warnings
+import random
+from pathlib import Path
+import json
+import requests
+import itertools
+from io import BytesIO
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import create_repo, upload_folder
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+from clip_retrieval.clip_client import ClipClient
+
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    UNet2DConditionModel,
+)
+
+from diffusers.models.cross_attention import CrossAttention
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version, is_wandb_available
+from diffusers.utils.import_utils import is_xformers_available
+
+from model_pipeline import CustomDiffusionAttnProcessor, CustomDiffusionPipeline, set_use_memory_efficient_attention_xformers
+
+
+# Will error if the minimal version of diffusers is not installed. Remove at your own risk.
+check_min_version("0.14.0")
+
+logger = get_logger(__name__)
+
+
+def create_custom_diffusion(unet, freeze_model):
+    """Freeze the U-Net except the selected attn2 weights and install the Custom Diffusion processor.
+
+    `freeze_model` is "crossattn" (every attn2 parameter) or "crossattn_kv" (attn2 to_k / to_v only).
+    The same `unet` object is modified in place and returned.
+    """
+    # Only the selected cross-attention weights keep requires_grad; their names are printed.
+    for name, params in unet.named_parameters():
+        if freeze_model == 'crossattn':
+            trainable = 'attn2' in name
+        elif freeze_model == "crossattn_kv":
+            trainable = 'attn2.to_k' in name or 'attn2.to_v' in name
+        else:
+            raise ValueError("freeze_model argument only supports crossattn_kv or crossattn")
+        params.requires_grad = trainable
+        if trainable:
+            print(name)
+
+    # Rebind the xformers toggle on each CrossAttention layer so it picks the Custom Diffusion processors
+    def change_attn(module):
+        for layer in module.children():
+            if type(layer) == CrossAttention:
+                bound_method = set_use_memory_efficient_attention_xformers.__get__(layer, layer.__class__)
+                setattr(layer, 'set_use_memory_efficient_attention_xformers', bound_method)
+            else:
+                change_attn(layer)
+
+    change_attn(unet)
+    unet.set_attn_processor(CustomDiffusionAttnProcessor())
+
+    return unet
+
+
+def freeze_params(params):  # sets requires_grad = False on every item of the iterable `params`
+    for param in params:
+        param.requires_grad = False
+
+
+def retrieve(class_prompt, class_images_dir, num_class_images):
+    """Download real regularization images matching `class_prompt` from the LAION knn service.
+
+    Writes up to `num_class_images` files to `{class_images_dir}/images` and lists their captions, urls and
+    paths in caption.txt, urls.txt and images.txt. Returns early if that folder is already full enough.
+    """
+    os.makedirs(f'{class_images_dir}/images', exist_ok=True)
+    if len(list(Path(f'{class_images_dir}/images').iterdir())) >= num_class_images:
+        return
+
+    factor = 1.5
+    num_images = int(factor*num_class_images)
+    while True:
+        # Over-request (and keep growing the request): failed downloads are skipped below.
+        client = ClipClient(url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
+        class_images = client.query(text=class_prompt)
+        if len(class_images) >= num_class_images or num_images > 1e4:
+            break
+        num_images = int(factor*num_images)
+
+    total = 0
+    pbar = tqdm(desc='downloading real regularization images', total=num_class_images)
+
+    with open(f'{class_images_dir}/caption.txt', 'w') as f1, open(f'{class_images_dir}/urls.txt', 'w') as f2, open(f'{class_images_dir}/images.txt', 'w') as f3:
+        for images in class_images:  # running out of results ends the loop instead of raising IndexError
+            if total >= num_class_images:
+                break
+            try:
+                img = requests.get(images['url'], timeout=30)
+                if img.status_code != 200:
+                    continue
+                _ = Image.open(BytesIO(img.content))
+                with open(f'{class_images_dir}/images/{total}.jpg', 'wb') as f:
+                    f.write(img.content)
+                f1.write(images['caption']+'\n')
+                f2.write(images['url']+'\n')
+                f3.write(f'{class_images_dir}/images/{total}.jpg'+'\n')
+                total += 1
+                pbar.update(1)
+            except Exception:  # best effort: skip dead links, timeouts and bad images, but let Ctrl-C through
+                continue
+
+
+def save_model_card(repo_id: str, images=None, base_model: str = None, prompt: str = None, repo_folder=None):
+    """Save the sample `images` (if any) into `repo_folder` and write a README.md model card that embeds them."""
+    img_str = ""
+    for i, image in enumerate(images or []):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+instance_prompt: {prompt}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- custom diffusion
+inference: true
+---
+    """
+    model_card = f"""
+# Custom Diffusion - {repo_id}
+
+These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
+{img_str}
+"""
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)
+
+
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    """Return the text-encoder class named by the first `architectures` entry of the `text_encoder` config."""
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path, subfolder="text_encoder", revision=revision
+    )
+    architecture = text_encoder_config.architectures[0]
+
+    if architecture == "CLIPTextModel":
+        from transformers import CLIPTextModel
+
+        return CLIPTextModel
+
+    if architecture == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+
+        return RobertaSeriesModelWithTransformation
+
+    raise ValueError(f"{architecture} is not supported.")
+
+
+def collate_fn(examples, with_prior_preservation):
+    """Collate dataset examples into one batch dict with keys "input_ids", "pixel_values" and "mask"."""
+    ids = [ex["instance_prompt_ids"] for ex in examples]
+    images = [ex["instance_images"] for ex in examples]
+    masks = [ex["mask"] for ex in examples]
+    # Prior preservation: append the class examples after the instance examples,
+    # so that a single forward pass covers both.
+    if with_prior_preservation:
+        ids.extend(ex["class_prompt_ids"] for ex in examples)
+        images.extend(ex["class_images"] for ex in examples)
+        masks.extend(ex["class_mask"] for ex in examples)
+
+    input_ids = torch.cat(ids, dim=0)
+    pixel_values = torch.stack(images)
+    mask = torch.stack(masks)
+    pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+    mask = mask.to(memory_format=torch.contiguous_format).float()
+
+    return {
+        "input_ids": input_ids,
+        "pixel_values": pixel_values,
+        "mask": mask.unsqueeze(1),
+    }
+
+
+class PromptDataset(Dataset):
+    """Dataset that serves one fixed prompt `num_samples` times, e.g. to generate class images on multiple GPUs."""
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        return {
+            "prompt": self.prompt,
+            "index": index,
+        }
+
+
+class CustomDiffusionDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and tokenizes the prompts.
+    """
+
+    def __init__(
+        self,
+        concepts_list,
+        tokenizer,
+        size=512,
+        center_crop=False,
+        with_prior_preservation=False,
+        num_class_images=200,
+        hflip=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+        self.interpolation = Image.BILINEAR
+
+        self.instance_images_path = []  # (image path, instance prompt) pairs over all concepts
+        self.class_images_path = []  # (image path, class prompt) pairs over all concepts
+        self.with_prior_preservation = with_prior_preservation
+        for concept in concepts_list:
+            inst_img_path = [(x, concept["instance_prompt"]) for x in Path(concept["instance_data_dir"]).iterdir() if x.is_file()]
+            self.instance_images_path.extend(inst_img_path)
+
+            if with_prior_preservation:
+                class_data_root = Path(concept["class_data_dir"])
+                if os.path.isdir(class_data_root):
+                    class_images_path = list(class_data_root.iterdir())
+                    class_prompt = [concept["class_prompt"] for _ in range(len(class_images_path))]
+                else:  # not a directory: both entries are read as text files, one image path / one prompt per line
+                    with open(class_data_root, "r") as f:
+                        class_images_path = f.read().splitlines()
+                    with open(concept["class_prompt"], "r") as f:
+                        class_prompt = f.read().splitlines()
+
+                class_img_path = [(x, y) for (x, y) in zip(class_images_path, class_prompt)]
+                self.class_images_path.extend(class_img_path[:num_class_images])  # at most num_class_images per concept
+
+        random.shuffle(self.instance_images_path)
+        self.num_instance_images = len(self.instance_images_path)
+        self.num_class_images = len(self.class_images_path)
+        self._length = max(self.num_class_images, self.num_instance_images)  # __getitem__ wraps the shorter list with modulo
+        self.flip = transforms.RandomHorizontalFlip(0.5 * hflip)  # flip probability is 0.5 when hflip is True, else 0
+
+        self.image_transforms = transforms.Compose(
+            [
+                self.flip,
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def preprocess(self, image, scale, resample):  # returns (float32 image array, mask on a size // 8 grid)
+        outer, inner = self.size, scale
+        if scale > self.size:
+            outer, inner = scale, self.size
+        top, left = np.random.randint(0, outer-inner+1), np.random.randint(0, outer-inner+1)  # random offset of the smaller square inside the larger
+        image = image.resize((scale, scale), resample=resample)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)  # maps pixel values 0..255 to -1..1
+        instance_image = np.zeros((self.size, self.size, 3), dtype=np.float32)
+        mask = np.zeros((self.size // 8, self.size // 8))
+        if scale > self.size:  # resized image is larger than the target: crop a size x size window, everything is valid
+            instance_image = image[top: top + inner, left: left + inner, :]
+            mask = np.ones((self.size // 8, self.size // 8))
+        else:  # resized image is not larger: paste it onto the zero canvas; mask marks its area shrunk by one cell per side
+            instance_image[top: top + inner, left: left + inner, :] = image
+            mask[top // 8 + 1: (top + scale) // 8 - 1, left // 8 + 1: (left + scale) // 8 - 1] = 1.
+        return instance_image, mask
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image, instance_prompt = self.instance_images_path[index % self.num_instance_images]
+        instance_image = Image.open(instance_image)
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        instance_image = self.flip(instance_image)
+
+        # apply resize augmentation and create a valid image region mask:
+        random_scale = np.random.randint(self.size // 3, self.size+1) if np.random.uniform() < 0.66 else np.random.randint(int(1.2*self.size), int(1.4*self.size))
+        instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation)
+
+        if random_scale < 0.6*self.size:  # small rendering: prefix the prompt accordingly
+            instance_prompt = np.random.choice(["a far away ", "very small "]) + instance_prompt
+        elif random_scale > self.size:  # cropped from an upscaled image: prefix the prompt accordingly
+            instance_prompt = np.random.choice(["zoomed in ", "close up "]) + instance_prompt
+
+        example["instance_images"] = torch.from_numpy(instance_image).permute(2, 0, 1)  # move the last axis first
+        example["mask"] = torch.from_numpy(mask)
+        example["instance_prompt_ids"] = self.tokenizer(
+            instance_prompt,
+            truncation=True,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids
+
+        if self.with_prior_preservation:
+            class_image, class_prompt = self.class_images_path[index % self.num_class_images]
+            class_image = Image.open(class_image)
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_mask"] = torch.ones_like(example["mask"])  # class images get an all-ones mask
+            example["class_prompt_ids"] = self.tokenizer(
+                class_prompt,
+                truncation=True,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                return_tensors="pt",
+            ).input_ids
+
+        return example
+
+
+def parse_args(input_args=None):
+    """Parse and validate the command-line arguments of the training script.
+
+    Args:
+        input_args: Optional list of argument strings; when `None`, `sys.argv[1:]` is parsed.
+
+    Returns:
+        The parsed `argparse.Namespace`; `local_rank` is replaced by the `LOCAL_RANK` env var when set (and not -1).
+
+    Raises:
+        ValueError: If `--with_prior_preservation` is given without `--concepts_list` and either
+            `--class_data_dir` or `--class_prompt` is missing.
+    """
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt", type=str, default=None, help="The prompt with identifier specifying the instance"
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--validation_prompt",
+        type=str,
+        default=None,
+        help="A prompt that is used during validation to verify that the model is learning.",
+    )
+    parser.add_argument(
+        "--num_validation_images",
+        type=int,
+        default=4,
+        help="Number of images that should be generated during validation with `validation_prompt`.",
+    )
+    parser.add_argument(
+        "--validation_epochs",
+        type=int,
+        default=50,
+        help=(
+            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
+            " `args.validation_prompt` multiple times: `args.num_validation_images`."
+        ),
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument(
+        "--real_prior",
+        action="store_true",
+        help="real images as prior.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=200,
+        help=(
+            "Minimal class images for prior preservation loss. If there are not enough images already present in"
+            " class_data_dir, additional images will be sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="custom-diffusion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        action="store_true",
+        help=(
+            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
+            " cropped. The images will be resized to the resolution first before cropping."
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--checkpointing_steps",
+        type=int,
+        default=250,
+        help=(
+            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
+            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
+            " training using `--resume_from_checkpoint`."
+        ),
+    )
+    parser.add_argument(
+        "--checkpoints_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
+    parser.add_argument(
+        "--resume_from_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
+            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--gradient_checkpointing",
+        action="store_true",
+        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=2,
+        help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.",
+    )
+    # Unknown modes are rejected here: the optimizer setup treats every value other than "crossattn" as
+    # "crossattn_kv", so a misspelt mode would otherwise silently train only the key/value projections.
+    parser.add_argument(
+        "--freeze_model",
+        type=str,
+        default='crossattn_kv',
+        choices=["crossattn_kv", "crossattn"],
+        help="crossattn to enable fine-tuning of all key, value, query matrices",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--allow_tf32",
+        action="store_true",
+        help=(
+            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
+            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
+            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
+    )
+    parser.add_argument(
+        "--prior_generation_precision",
+        type=str,
+        default=None,
+        choices=["no", "fp32", "fp16", "bf16"],
+        help=(
+            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to  fp16 if a GPU is available else fp32."
+        ),
+    )
+    parser.add_argument(
+        "--concepts_list",
+        type=str,
+        default=None,
+        help="Path to json containing multiple concepts, will overwrite parameters like instance_prompt, class_prompt, etc.",
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
+    )
+    parser.add_argument(
+        "--modifier_token", type=str, default=None, help="A token to use as a modifier for the concept."
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default='ktn+pll+ucd', help="A token to use as initializer word."
+    )
+    parser.add_argument("--hflip", action="store_true", help="Apply horizontal flip data augmentation.")
+
+    # `parse_args(None)` falls back to `sys.argv[1:]`, so both call styles share one code path.
+    args = parser.parse_args(input_args)
+
+    # A `LOCAL_RANK` exported by the launcher wins over the `--local_rank` flag.
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.with_prior_preservation:
+        if args.concepts_list is None:
+            if args.class_data_dir is None:
+                raise ValueError("You must specify a data directory for class images.")
+            if args.class_prompt is None:
+                raise ValueError("You must specify prompt for class images.")
+    else:
+        # logger is not available yet
+        if args.class_data_dir is not None:
+            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")
+
+    return args
+
+
+def main(args):
+    logging_dir = Path(args.output_dir, args.logging_dir)
+
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit)
+
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with=args.report_to,
+        logging_dir=logging_dir,
+        project_config=accelerator_project_config,
+    )
+
+    if args.report_to == "wandb":
+        if not is_wandb_available():
+            raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
+        import wandb
+
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state, main_process_only=False)
+    if accelerator.is_local_main_process:
+        transformers.utils.logging.set_verbosity_warning()
+        diffusers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+        diffusers.utils.logging.set_verbosity_error()
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initializes automatically on the main process.
+    if accelerator.is_main_process:
+        print(vars(args))
+        accelerator.init_trackers("custom-diffusion", config=vars(args))
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+    if args.concepts_list is None:
+        args.concepts_list = [
+            {
+                "instance_prompt": args.instance_prompt,
+                "class_prompt": args.class_prompt,
+                "instance_data_dir": args.instance_data_dir,
+                "class_data_dir": args.class_data_dir
+            }
+        ]
+    else:
+        with open(args.concepts_list, "r") as f:
+            args.concepts_list = json.load(f)
+
+
+    # Generate class images if prior preservation is enabled.
+    if args.with_prior_preservation:
+        for i, concept in enumerate(args.concepts_list):
+            class_images_dir = Path(concept['class_data_dir'])
+            if not class_images_dir.exists():
+                class_images_dir.mkdir(parents=True, exist_ok=True)
+            if args.real_prior:
+                if accelerator.is_main_process:
+                    name = '_'.join(concept['class_prompt'].split())
+                    if not Path(os.path.join(class_images_dir, name)).exists() or len(list(Path(os.path.join(class_images_dir, name)).iterdir())) < args.num_class_images:
+                        retrieve(concept['class_prompt'], class_images_dir, args.num_class_images)
+                concept['class_prompt'] = os.path.join(class_images_dir, 'caption.txt')
+                concept['class_data_dir'] = os.path.join(class_images_dir, 'images.txt')
+                args.concepts_list[i] = concept
+                accelerator.wait_for_everyone()
+            else:
+                cur_class_images = len(list(class_images_dir.iterdir()))
+
+                if cur_class_images < args.num_class_images:
+                    torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
+                    if args.prior_generation_precision == "fp32":
+                        torch_dtype = torch.float32
+                    elif args.prior_generation_precision == "fp16":
+                        torch_dtype = torch.float16
+                    elif args.prior_generation_precision == "bf16":
+                        torch_dtype = torch.bfloat16
+                    pipeline = DiffusionPipeline.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        torch_dtype=torch_dtype,
+                        safety_checker=None,
+                        revision=args.revision,
+                    )
+                    pipeline.set_progress_bar_config(disable=True)
+
+                    num_new_images = args.num_class_images - cur_class_images
+                    logger.info(f"Number of class images to sample: {num_new_images}.")
+
+                    sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+                    sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+                    sample_dataloader = accelerator.prepare(sample_dataloader)
+                    pipeline.to(accelerator.device)
+
+                    for example in tqdm(
+                        sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+                    ):
+                        images = pipeline(example["prompt"]).images
+
+                        for i, image in enumerate(images):
+                            hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                            image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                            image.save(image_filename)
+
+                    del pipeline
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+        if args.push_to_hub:
+            repo_id = create_repo(
+                repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
+            ).repo_id
+
+    # Load the tokenizer
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+            use_fast=False,
+        )
+
+    # import correct text encoder class
+    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
+
+    # Load scheduler and models
+    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
+    text_encoder = text_encoder_cls.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+    )
+    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
+    )
+
+    # Freeze the VAE, and the text encoder as well unless a modifier token is being learned
+    vae.requires_grad_(False)
+    if args.modifier_token is None:
+        text_encoder.requires_grad_(False)
+    unet = create_custom_diffusion(unet, args.freeze_model)
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warn(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            unet.enable_xformers_memory_efficient_attention()
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
+    if args.gradient_checkpointing:
+        unet.enable_gradient_checkpointing()
+        if args.modifier_token is not None:
+            text_encoder.gradient_checkpointing_enable()
+    # Enable TF32 for faster training on Ampere GPUs,
+    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
+    if args.allow_tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+    if args.scale_lr:
+        args.learning_rate = (
+            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+        )
+        if args.with_prior_preservation:
+            args.learning_rate = args.learning_rate*2.
+
+    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
+    if args.use_8bit_adam:
+        try:
+            import bitsandbytes as bnb
+        except ImportError:
+            raise ImportError(
+                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
+            )
+
+        optimizer_class = bnb.optim.AdamW8bit
+    else:
+        optimizer_class = torch.optim.AdamW
+
+    # Adding a modifier token which is optimized ####
+    # Code taken from https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py
+    modifier_token_id = []
+    initializer_token_id = []
+    if args.modifier_token is not None:
+        args.modifier_token = args.modifier_token.split('+')
+        args.initializer_token = args.initializer_token.split('+')
+        if len(args.modifier_token) > len(args.initializer_token):
+            raise ValueError("You must specify + separated initializer token for each modifier token.")
+        for modifier_token, initializer_token in zip(args.modifier_token, args.initializer_token[:len(args.modifier_token)]):
+            # Add the placeholder token in tokenizer
+            num_added_tokens = tokenizer.add_tokens(modifier_token)
+            if num_added_tokens == 0:
+                raise ValueError(
+                    f"The tokenizer already contains the token {modifier_token}. Please pass a different"
+                    " `modifier_token` that is not already in the tokenizer."
+                )
+
+            # Convert the initializer_token, placeholder_token to ids
+            token_ids = tokenizer.encode([initializer_token], add_special_tokens=False)
+            print(token_ids)
+            # Check if initializer_token is a single token or a sequence of tokens
+            if len(token_ids) > 1:
+                raise ValueError("The initializer token must be a single token.")
+
+            initializer_token_id.append(token_ids[0])
+            modifier_token_id.append(tokenizer.convert_tokens_to_ids(modifier_token))
+
+        # Resize the token embeddings as we are adding new special tokens to the tokenizer
+        text_encoder.resize_token_embeddings(len(tokenizer))
+
+        # Initialise the newly added placeholder token with the embeddings of the initializer token
+        token_embeds = text_encoder.get_input_embeddings().weight.data
+        for (x,y) in zip(modifier_token_id,initializer_token_id):
+            token_embeds[x] = token_embeds[y]
+
+        # Freeze all parameters except for the token embeddings in text encoder
+        params_to_freeze = itertools.chain(
+            text_encoder.text_model.encoder.parameters(),
+            text_encoder.text_model.final_layer_norm.parameters(),
+            text_encoder.text_model.embeddings.position_embedding.parameters(),
+        )
+        freeze_params(params_to_freeze)
+
+        if args.freeze_model == 'crossattn':
+            params_to_optimize = itertools.chain( text_encoder.get_input_embeddings().parameters() , [x[1] for x in unet.named_parameters() if 'attn2' in x[0]] )
+        else:
+            params_to_optimize = itertools.chain( text_encoder.get_input_embeddings().parameters() , [x[1] for x in unet.named_parameters() if ('attn2.to_k' in x[0] or 'attn2.to_v' in x[0])] )
+
+    ########################################################
+    ########################################################
+    else:
+        if args.freeze_model == 'crossattn':
+            params_to_optimize = (
+                itertools.chain([x[1] for x in unet.named_parameters() if 'attn2' in x[0]] ) 
+            )
+        else:
+            params_to_optimize = (
+                itertools.chain([x[1] for x in unet.named_parameters() if ('attn2.to_k' in x[0] or 'attn2.to_v' in x[0])] ) 
+            )
+
+    # Optimizer creation
+    optimizer = optimizer_class(
+        params_to_optimize,
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    # Dataset and DataLoaders creation:
+    train_dataset = CustomDiffusionDataset(
+        concepts_list=args.concepts_list,
+        tokenizer=tokenizer,
+        with_prior_preservation=args.with_prior_preservation,
+        size=args.resolution,
+        center_crop=args.center_crop,
+        num_class_images=args.num_class_images,
+        hflip=args.hflip
+    )
+
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
+        num_workers=args.dataloader_num_workers,
+    )
+
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
+        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
+    )
+
+    # Prepare everything with our `accelerator`.
+    if args.modifier_token is not None:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+
+    # Potentially load in the weights and states from a previous save
+    if args.resume_from_checkpoint:
+        if args.resume_from_checkpoint != "latest":
+            path = os.path.basename(args.resume_from_checkpoint)
+        else:
+            # Get the most recent checkpoint
+            dirs = os.listdir(args.output_dir)
+            dirs = [d for d in dirs if d.startswith("checkpoint")]
+            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
+            path = dirs[-1] if len(dirs) > 0 else None
+
+        if path is None:
+            accelerator.print(
+                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
+            )
+            args.resume_from_checkpoint = None
+        else:
+            accelerator.print(f"Resuming from checkpoint {path}")
+            accelerator.load_state(os.path.join(args.output_dir, path))
+            global_step = int(path.split("-")[1])
+
+            resume_global_step = global_step * args.gradient_accumulation_steps
+            first_epoch = global_step // num_update_steps_per_epoch
+            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
+    progress_bar.set_description("Steps")
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        unet.train()
+        if args.modifier_token is not None:
+            text_encoder.train()
+        for step, batch in enumerate(train_dataloader):
+            # Skip steps until we reach the resumed step
+            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
+                if step % args.gradient_accumulation_steps == 0:
+                    progress_bar.update(1)
+                continue
+
+            with accelerator.accumulate(unet):
+                # Convert images to latent space
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * vae.config.scaling_factor
+
+                # Sample noise that we'll add to the latents
+                noise = torch.randn_like(latents)
+                bsz = latents.shape[0]
+                # Sample a random timestep for each image
+                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
+                timesteps = timesteps.long()
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+
+                # Predict the noise residual
+                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
+
+                # Get the target for loss depending on the prediction type
+                if noise_scheduler.config.prediction_type == "epsilon":
+                    target = noise
+                elif noise_scheduler.config.prediction_type == "v_prediction":
+                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
+                else:
+                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+
+                if args.with_prior_preservation:
+                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
+                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
+                    target, target_prior = torch.chunk(target, 2, dim=0)
+                    mask = torch.chunk(batch["mask"], 2, dim=0)[0]
+                    # Compute instance loss
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = ((loss*mask).sum([1, 2, 3])/mask.sum([1, 2, 3])).mean()
+
+                    # Compute prior loss
+                    prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
+
+                    # Add the prior loss to the instance loss.
+                    loss = loss + args.prior_loss_weight * prior_loss
+                else:
+                    mask = batch["mask"]
+                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
+                    loss = ((loss*mask).sum([1, 2, 3])/mask.sum([1, 2, 3])).mean()
+
+                accelerator.backward(loss)
+                # Zero out the gradients for all token embeddings except the newly added
+                # embeddings for the concept, as we only want to optimize the concept embeddings
+                if args.modifier_token is not None:
+                    if accelerator.num_processes > 1:
+                        grads_text_encoder = text_encoder.module.get_input_embeddings().weight.grad
+                    else:
+                        grads_text_encoder = text_encoder.get_input_embeddings().weight.grad
+                    # Get the index for tokens that we want to zero the grads for
+                    index_grads_to_zero = torch.arange(len(tokenizer)) != modifier_token_id[0]
+                    for i in range(len(modifier_token_id[1:])):
+                        index_grads_to_zero = index_grads_to_zero & (torch.arange(len(tokenizer)) != modifier_token_id[i])
+                    grads_text_encoder.data[index_grads_to_zero, :] = grads_text_encoder.data[index_grads_to_zero, :].fill_(0)
+
+                if accelerator.sync_gradients:
+                    params_to_clip = (
+                        itertools.chain([x[1] for x in unet.named_parameters() if ('attn2' in x[0])], text_encoder.parameters())
+                        if  args.modifier_token is not None
+                        else itertools.chain([x[1] for x in unet.named_parameters() if ('attn2' in x[0])]) 
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+                if global_step % args.checkpointing_steps == 0:
+                    if accelerator.is_main_process:
+                        pipeline = CustomDiffusionPipeline.from_pretrained(
+                            args.pretrained_model_name_or_path,
+                            unet=accelerator.unwrap_model(unet),
+                            text_encoder=accelerator.unwrap_model(text_encoder),
+                            revision=args.revision,
+                            modifier_token=args.modifier_token,
+                            modifier_token_id=modifier_token_id,
+                        )
+                        save_path = os.path.join(args.output_dir, f"delta-{global_step}.bin")
+                        pipeline.save_pretrained(save_path)
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+            if global_step >= args.max_train_steps:
+                break
+
+        if accelerator.is_main_process:
+            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+                logger.info(
+                    f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
+                    f" {args.validation_prompt}."
+                )
+                # create pipeline
+                pipeline = CustomDiffusionPipeline.from_pretrained(
+                    args.pretrained_model_name_or_path,
+                    unet=accelerator.unwrap_model(unet),
+                    text_encoder=accelerator.unwrap_model(text_encoder),
+                    tokenizer=tokenizer,
+                    revision=args.revision,
+                    modifier_token=args.modifier_token,
+                    modifier_token_id=modifier_token_id,
+                )
+                pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+                pipeline = pipeline.to(accelerator.device)
+                pipeline.set_progress_bar_config(disable=True)
+
+                # run inference
+                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+                images = [
+                    pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                    for _ in range(args.num_validation_images)
+                ]
+
+                for tracker in accelerator.trackers:
+                    if tracker.name == "tensorboard":
+                        np_images = np.stack([np.asarray(img) for img in images])
+                        tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
+                    if tracker.name == "wandb":
+                        tracker.log(
+                            {
+                                "validation": [
+                                    wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                    for i, image in enumerate(images)
+                                ]
+                            }
+                        )
+
+                del pipeline
+                torch.cuda.empty_cache()
+
+    # Save the lora layers
+    accelerator.wait_for_everyone()
+    if accelerator.is_main_process:
+        unet = unet.to(torch.float32)
+        pipeline = CustomDiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path,
+            unet=accelerator.unwrap_model(unet),
+            text_encoder=accelerator.unwrap_model(text_encoder),
+            tokenizer=tokenizer,
+            revision=args.revision,
+            modifier_token=args.modifier_token,
+            modifier_token_id=modifier_token_id,
+        )
+        save_path = os.path.join(args.output_dir, f"delta.bin")
+        pipeline.save_pretrained(save_path)
+
+        # run inference
+        if args.validation_prompt and args.num_validation_images > 0:
+            pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+            pipeline = pipeline.to(accelerator.device)
+            pipeline.set_progress_bar_config(disable=True)
+
+            # run inference
+            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+            images = [
+                pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                for _ in range(args.num_validation_images)
+            ]
+
+            for tracker in accelerator.trackers:
+                if tracker.name == "tensorboard":
+                    np_images = np.stack([np.asarray(img) for img in images])
+                    tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
+                if tracker.name == "wandb":
+                    tracker.log(
+                        {
+                            "test": [
+                                wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
+                                for i, image in enumerate(images)
+                            ]
+                        }
+                    )
+
+        if args.push_to_hub:
+            save_model_card(
+                repo_id,
+                images=images,
+                base_model=args.pretrained_model_name_or_path,
+                prompt=args.instance_prompt,
+                repo_folder=args.output_dir,
+            )
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
\ No newline at end of file

From 68e350db42d1fbca45b71ca15276164ddb940ea9 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Fri, 7 Apr 2023 18:54:50 -0700
Subject: [PATCH 02/34] custom diffusion update

---
 examples/custom_diffusion/README.md         | 142 ++++++++++++++++++++
 examples/custom_diffusion/model_pipeline.py |  17 ++-
 examples/custom_diffusion/train.py          | 136 ++++++++++---------
 3 files changed, 220 insertions(+), 75 deletions(-)
 create mode 100644 examples/custom_diffusion/README.md

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
new file mode 100644
index 000000000000..55a58bcf821e
--- /dev/null
+++ b/examples/custom_diffusion/README.md
@@ -0,0 +1,142 @@
+# Custom Diffusion training example (modified from https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README.md)
+
+[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text2image models like stable diffusion given just a few(4~5) images of a subject.
+The `train.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the example folder and run
+```bash
+pip install -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+write_basic_config()
+```
+
+### Cat example
+
+Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip). 
+
+We also collect 200 real images using `clip-retrieval`, which are combined with the target images in the training dataset. The real images are similar to the target concept (e.g., cat) and prevent overfitting to the given target cat. The following flags control the regularization loss: `with_prior_preservation`, `real_prior`, `prior_loss_weight`, `class_prompt`, and `num_class_images`. The `class_prompt` should be the category name, since the collected real images have text captions similar to the `class_prompt`. The retrieved images are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization instead.
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+export INSTANCE_DIR="./data/cat"
+## training script (2 GPUs recommended, requires 27 GB VRAM. Increase --max_train_steps to 500 if training on 1 GPU)
+
+accelerate launch train.py \
+          --pretrained_model_name_or_path=$MODEL_NAME  \
+          --instance_data_dir=$INSTANCE_DIR \
+          --output_dir=$OUTPUT_DIR \
+          --class_data_dir=./real_reg/samples_cat/ \
+          --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+          --class_prompt="cat" --num_class_images=200 \
+          --instance_prompt="photo of a <new1> cat"  \
+          --resolution=512  \
+          --train_batch_size=2  \
+          --learning_rate=1e-5  \
+          --lr_warmup_steps=0 \
+          --max_train_steps=250 \
+          --scale_lr --hflip  \
+          --modifier_token "<new1>"
+```
+
+**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU).**
+
+
+### Training on multiple concepts
+
+Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py).
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+
+## launch training script (2 GPUs recommended, increase --max_train_steps to 1000 if 1 GPU)
+
+accelerate launch train.py \
+          --pretrained_model_name_or_path=$MODEL_NAME  \
+          --output_dir=$OUTPUT_DIR \
+          --concepts_list=./assets/concept_list.json \
+          --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+          --resolution=512  \
+          --train_batch_size=2  \
+          --learning_rate=1e-5  \
+          --lr_warmup_steps=0 \
+          --max_train_steps=500 \
+          --num_class_images=200 \
+          --scale_lr --hflip  \
+          --modifier_token "<new1>+<new2>" 
+```
+
+
+### Inference
+
+Once you have trained a model using the above command, you can run inference using the `CustomDiffusionPipeline` from `model_pipeline.py`. Make sure to include the modifier token (e.g. `<new1>` in the above example) in your prompt.
+
+```python
+from model_pipeline import CustomDiffusionPipeline
+import torch
+
+pipe = CustomDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
+pipe.load_model('<path-to-your-trained-model>/delta.bin')
+image = pipe("<new1> cat sitting in a bucket", num_inference_steps=50, guidance_scale=7.5, eta=1.).images[0]
+
+image.save("cat.png")
+```
+
+### Inference from a training checkpoint
+
+You can also perform inference from one of the complete checkpoints saved during the training process, if you used the `--checkpointing_steps` argument.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+pipe = StableDiffusionPipeline.from_pretrained('<path-to-your-trained-model>/checkpoint-<step>', torch_dtype=torch.float16).to("cuda")
+image = pipe("<new1> cat sitting in a bucket", num_inference_steps=50, guidance_scale=7.5, eta=1.).images[0]
+
+image.save("cat.png")
+```
+
+### Set grads to none
+
+To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
+
+More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
+
+### Experimental results
+You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. 
\ No newline at end of file
diff --git a/examples/custom_diffusion/model_pipeline.py b/examples/custom_diffusion/model_pipeline.py
index 613f2702b09e..3f425414cfab 100644
--- a/examples/custom_diffusion/model_pipeline.py
+++ b/examples/custom_diffusion/model_pipeline.py
@@ -15,7 +15,6 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-
 from typing import Callable, Optional
 import torch
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
@@ -103,10 +102,10 @@ def __call__(
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
         if crossattn:
-            modifier = torch.ones_like(key)
-            modifier[:, :1, :] = modifier[:, :1, :]*0.
-            key = modifier*key + (1-modifier)*key.detach()
-            value = modifier*value + (1-modifier)*value.detach()
+            detach = torch.ones_like(key)
+            detach[:, :1, :] = detach[:, :1, :] * 0.
+            key = detach * key + (1 - detach) * key.detach()
+            value = detach * value + (1 - detach) * value.detach()
 
         query = attn.head_to_batch_dim(query)
         key = attn.head_to_batch_dim(key)
@@ -146,10 +145,10 @@ def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=No
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)
         if crossattn:
-            modifier = torch.ones_like(key)
-            modifier[:, :1, :] = modifier[:, :1, :]*0.
-            key = modifier*key + (1-modifier)*key.detach()
-            value = modifier*value + (1-modifier)*value.detach()
+            detach = torch.ones_like(key)
+            detach[:, :1, :] = detach[:, :1, :] * 0.
+            key = detach * key + (1 - detach) * key.detach()
+            value = detach * value + (1 - detach) * value.detach()
 
         query = attn.head_to_batch_dim(query).contiguous()
         key = attn.head_to_batch_dim(key).contiguous()
diff --git a/examples/custom_diffusion/train.py b/examples/custom_diffusion/train.py
index 5d9b8e62d963..0f17409663c2 100644
--- a/examples/custom_diffusion/train.py
+++ b/examples/custom_diffusion/train.py
@@ -37,7 +37,8 @@
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo, upload_folder
+from huggingface_hub import create_repo
+from huggingface_hub import HfApi
 from packaging import version
 from PIL import Image
 from torch.utils.data import Dataset
@@ -60,6 +61,7 @@
 from diffusers.utils import check_min_version, is_wandb_available
 from diffusers.utils.import_utils import is_xformers_available
 
+# sys.path.append('./')
 from model_pipeline import CustomDiffusionAttnProcessor, CustomDiffusionPipeline, set_use_memory_efficient_attention_xformers
 
 
@@ -85,8 +87,8 @@ def create_custom_diffusion(unet, freeze_model):
                 params.requires_grad = False
         else:
             raise ValueError(
-                    "freeze_model argument only supports crossattn_kv or crossattn"
-                )
+                "freeze_model argument only supports crossattn_kv or crossattn"
+            )
 
     # change attn class
     def change_attn(unet):
@@ -109,8 +111,8 @@ def freeze_params(params):
 
 def retrieve(class_prompt, class_images_dir, num_class_images):
     factor = 1.5
-    num_images = int(factor*num_class_images)
-    client = ClipClient(url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images,  aesthetic_weight=0.1)
+    num_images = int(factor * num_class_images)
+    client = ClipClient(url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
 
     os.makedirs(f'{class_images_dir}/images', exist_ok=True)
     if len(list(Path(f'{class_images_dir}/images').iterdir())) >= num_class_images:
@@ -121,8 +123,8 @@ def retrieve(class_prompt, class_images_dir, num_class_images):
         if len(class_images) >= num_class_images or num_images > 1e4:
             break
         else:
-            num_images = int(factor*num_images)
-            client = ClipClient(url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images,  aesthetic_weight=0.1)
+            num_images = int(factor * num_images)
+            client = ClipClient(url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
 
     count = 0
     total = 0
@@ -138,9 +140,9 @@ def retrieve(class_prompt, class_images_dir, num_class_images):
                     _ = Image.open(BytesIO(img.content))
                     with open(f'{class_images_dir}/images/{total}.jpg', 'wb') as f:
                         f.write(img.content)
-                    f1.write(images['caption']+'\n')
-                    f2.write(images['url']+'\n')
-                    f3.write(f'{class_images_dir}/images/{total}.jpg'+'\n')
+                    f1.write(images['caption'] + '\n')
+                    f2.write(images['url'] + '\n')
+                    f3.write(f'{class_images_dir}/images/{total}.jpg' + '\n')
                     total += 1
                     pbar.update(1)
                 else:
@@ -154,28 +156,28 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_
     img_str = ""
     for i, image in enumerate(images):
         image.save(os.path.join(repo_folder, f"image_{i}.png"))
-        img_str += f"![img_{i}](./image_{i}.png)\n"
+        img_str += f"./image_{i}.png\n"
 
     yaml = f"""
----
-license: creativeml-openrail-m
-base_model: {base_model}
-instance_prompt: {prompt}
-tags:
-- stable-diffusion
-- stable-diffusion-diffusers
-- text-to-image
-- diffusers
-- custom diffusion
-inference: true
----
-    """
+        ---
+        license: creativeml-openrail-m
+        base_model: {base_model}
+        instance_prompt: {prompt}
+        tags:
+        - stable-diffusion
+        - stable-diffusion-diffusers
+        - text-to-image
+        - diffusers
+        - custom diffusion
+        inference: true
+        ---
+            """
     model_card = f"""
-# Custom Diffusion - {repo_id}
+        # Custom Diffusion - {repo_id}
 
-These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
-{img_str}
-"""
+        These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
+        {img_str[0]}
+        """
     with open(os.path.join(repo_folder, "README.md"), "w") as f:
         f.write(yaml + model_card)
 
@@ -307,7 +309,7 @@ def preprocess(self, image, scale, resample):
         outer, inner = self.size, scale
         if scale > self.size:
             outer, inner = scale, self.size
-        top, left = np.random.randint(0, outer-inner+1), np.random.randint(0, outer-inner+1)
+        top, left = np.random.randint(0, outer - inner + 1), np.random.randint(0, outer - inner + 1)
         image = image.resize((scale, scale), resample=resample)
         image = np.array(image).astype(np.uint8)
         image = (image / 127.5 - 1.0).astype(np.float32)
@@ -330,10 +332,10 @@ def __getitem__(self, index):
         instance_image = self.flip(instance_image)
 
         # apply resize augmentation and create a valid image region mask
-        random_scale = np.random.randint(self.size // 3, self.size+1) if np.random.uniform() < 0.66 else np.random.randint(int(1.2*self.size), int(1.4*self.size))
+        random_scale = np.random.randint(self.size // 3, self.size + 1) if np.random.uniform() < 0.66 else np.random.randint(int(1.2 * self.size), int(1.4 * self.size))
         instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation)
 
-        if random_scale < 0.6*self.size:
+        if random_scale < 0.6 * self.size:
             instance_prompt = np.random.choice(["a far away ", "very small "]) + instance_prompt
         elif random_scale > self.size:
             instance_prompt = np.random.choice(["zoomed in ", "close up "]) + instance_prompt
@@ -421,13 +423,13 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--num_validation_images",
         type=int,
-        default=4,
+        default=2,
         help="Number of images that should be generated during validation with `validation_prompt`.",
     )
     parser.add_argument(
-        "--validation_epochs",
+        "--validation_steps",
         type=int,
-        default=50,
+        default=500,
         help=(
             "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
             " `args.validation_prompt` multiple times: `args.num_validation_images`."
@@ -737,7 +739,6 @@ def main(args):
         with open(args.concepts_list, "r") as f:
             args.concepts_list = json.load(f)
 
-
     # Generate class images if prior preservation is enabled.
     if args.with_prior_preservation:
         for i, concept in enumerate(args.concepts_list):
@@ -801,13 +802,20 @@ def main(args):
             os.makedirs(args.output_dir, exist_ok=True)
 
         if args.push_to_hub:
+            print(args.hub_model_id or Path(args.output_dir).name)
             repo_id = create_repo(
                 repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
-            ).repo_id
+            )
+            print(repo_id)
+            repo_id = args.hub_model_id
 
     # Load the tokenizer
     if args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.tokenizer_name,
+            revision=args.revision,
+            use_fast=False,
+        )
     elif args.pretrained_model_name_or_path:
         tokenizer = AutoTokenizer.from_pretrained(
             args.pretrained_model_name_or_path,
@@ -844,9 +852,10 @@ def main(args):
         weight_dtype = torch.bfloat16
 
     # Move unet, vae and text_encoder to device and cast to weight_dtype
-    unet.to(accelerator.device, dtype=weight_dtype)
+    if accelerator.mixed_precision != "fp16":
+        unet.to(accelerator.device, dtype=weight_dtype)
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
     vae.to(accelerator.device, dtype=weight_dtype)
-    text_encoder.to(accelerator.device, dtype=weight_dtype)
 
     if args.enable_xformers_memory_efficient_attention:
         if is_xformers_available():
@@ -875,7 +884,7 @@ def main(args):
             args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
         )
         if args.with_prior_preservation:
-            args.learning_rate = args.learning_rate*2.
+            args.learning_rate = args.learning_rate * 2.
 
     # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
     if args.use_8bit_adam:
@@ -923,7 +932,7 @@ def main(args):
 
         # Initialise the newly added placeholder token with the embeddings of the initializer token
         token_embeds = text_encoder.get_input_embeddings().weight.data
-        for (x,y) in zip(modifier_token_id,initializer_token_id):
+        for (x, y) in zip(modifier_token_id, initializer_token_id):
             token_embeds[x] = token_embeds[y]
 
         # Freeze all parameters except for the token embeddings in text encoder
@@ -935,20 +944,20 @@ def main(args):
         freeze_params(params_to_freeze)
 
         if args.freeze_model == 'crossattn':
-            params_to_optimize = itertools.chain( text_encoder.get_input_embeddings().parameters() , [x[1] for x in unet.named_parameters() if 'attn2' in x[0]] )
+            params_to_optimize = itertools.chain(text_encoder.get_input_embeddings().parameters() , [x[1] for x in unet.named_parameters() if 'attn2' in x[0]])
         else:
-            params_to_optimize = itertools.chain( text_encoder.get_input_embeddings().parameters() , [x[1] for x in unet.named_parameters() if ('attn2.to_k' in x[0] or 'attn2.to_v' in x[0])] )
+            params_to_optimize = itertools.chain(text_encoder.get_input_embeddings().parameters() , [x[1] for x in unet.named_parameters() if ('attn2.to_k' in x[0] or 'attn2.to_v' in x[0])])
 
     ########################################################
     ########################################################
     else:
         if args.freeze_model == 'crossattn':
             params_to_optimize = (
-                itertools.chain([x[1] for x in unet.named_parameters() if 'attn2' in x[0]] ) 
+                itertools.chain([x[1] for x in unet.named_parameters() if 'attn2' in x[0]])
             )
         else:
             params_to_optimize = (
-                itertools.chain([x[1] for x in unet.named_parameters() if ('attn2.to_k' in x[0] or 'attn2.to_v' in x[0])] ) 
+                itertools.chain([x[1] for x in unet.named_parameters() if ('attn2.to_k' in x[0] or 'attn2.to_v' in x[0])])
             )
 
     # Optimizer creation
@@ -1101,7 +1110,7 @@ def main(args):
                     mask = torch.chunk(batch["mask"], 2, dim=0)[0]
                     # Compute instance loss
                     loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
-                    loss = ((loss*mask).sum([1, 2, 3])/mask.sum([1, 2, 3])).mean()
+                    loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean()
 
                     # Compute prior loss
                     prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
@@ -1111,7 +1120,7 @@ def main(args):
                 else:
                     mask = batch["mask"]
                     loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
-                    loss = ((loss*mask).sum([1, 2, 3])/mask.sum([1, 2, 3])).mean()
+                    loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean()
 
                 accelerator.backward(loss)
                 # Zero out the gradients for all token embeddings except the newly added
@@ -1130,7 +1139,7 @@ def main(args):
                 if accelerator.sync_gradients:
                     params_to_clip = (
                         itertools.chain([x[1] for x in unet.named_parameters() if ('attn2' in x[0])], text_encoder.parameters())
-                        if  args.modifier_token is not None
+                        if args.modifier_token is not None
                         else itertools.chain([x[1] for x in unet.named_parameters() if ('attn2' in x[0])]) 
                     )
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
@@ -1145,16 +1154,10 @@ def main(args):
 
                 if global_step % args.checkpointing_steps == 0:
                     if accelerator.is_main_process:
-                        pipeline = CustomDiffusionPipeline.from_pretrained(
-                            args.pretrained_model_name_or_path,
-                            unet=accelerator.unwrap_model(unet),
-                            text_encoder=accelerator.unwrap_model(text_encoder),
-                            revision=args.revision,
-                            modifier_token=args.modifier_token,
-                            modifier_token_id=modifier_token_id,
-                        )
-                        save_path = os.path.join(args.output_dir, f"delta-{global_step}.bin")
-                        pipeline.save_pretrained(save_path)
+                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                        accelerator.save_state(save_path)
+                        logger.info(f"Saved state to {save_path}")
+
             logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
             progress_bar.set_postfix(**logs)
             accelerator.log(logs, step=global_step)
@@ -1163,7 +1166,7 @@ def main(args):
                 break
 
         if accelerator.is_main_process:
-            if args.validation_prompt is not None and epoch % args.validation_epochs == 0:
+            if args.validation_prompt is not None and global_step % args.validation_steps == 0:
                 logger.info(
                     f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
                     f" {args.validation_prompt}."
@@ -1185,7 +1188,7 @@ def main(args):
                 # run inference
                 generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
                 images = [
-                    pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                    pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.).images[0]
                     for _ in range(args.num_validation_images)
                 ]
 
@@ -1219,7 +1222,7 @@ def main(args):
             modifier_token=args.modifier_token,
             modifier_token_id=modifier_token_id,
         )
-        save_path = os.path.join(args.output_dir, f"delta.bin")
+        save_path = os.path.join(args.output_dir, "delta.bin")
         pipeline.save_pretrained(save_path)
 
         # run inference
@@ -1231,7 +1234,7 @@ def main(args):
             # run inference
             generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
             images = [
-                pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
+                pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.).images[0]
                 for _ in range(args.num_validation_images)
             ]
 
@@ -1257,11 +1260,12 @@ def main(args):
                 prompt=args.instance_prompt,
                 repo_folder=args.output_dir,
             )
-            upload_folder(
+            api = HfApi(token=args.hub_token)
+            api.upload_folder(
                 repo_id=repo_id,
                 folder_path=args.output_dir,
-                commit_message="End of training",
-                ignore_patterns=["step_*", "epoch_*"],
+                path_in_repo='.',
+                repo_type='model'
             )
 
     accelerator.end_training()
@@ -1269,4 +1273,4 @@ def main(args):
 
 if __name__ == "__main__":
     args = parse_args()
-    main(args)
\ No newline at end of file
+    main(args)

From 68d20e4d80547129258a7a382d402ad1f7318096 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Fri, 7 Apr 2023 18:56:02 -0700
Subject: [PATCH 03/34] custom diffusion update

---
 examples/custom_diffusion/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index 55a58bcf821e..222a26293103 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -1,4 +1,5 @@
-# Custom Diffusion training example (modified from https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README.md)
+# Custom Diffusion training example 
+(modified from https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README.md)
 
 [Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text2image models like stable diffusion given just a few(4~5) images of a subject.
 The `train.py` script shows how to implement the training procedure and adapt it for stable diffusion.

From d8f1ade0eee60e39762194d5bd6573c5216a2995 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Fri, 7 Apr 2023 18:56:53 -0700
Subject: [PATCH 04/34] custom diffusion update

---
 examples/custom_diffusion/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index 222a26293103..d2df94c45884 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -46,7 +46,7 @@ write_basic_config()
 
 ### Cat example
 
-Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip). 
+Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. 
 
 We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset. The real images are similar to the target concept, e.g., cat and prevent overfitting to the the given target cat. The following flags control the regularization loss `with_prior_preservation`, `real_prior`, `prior_loss_weight`, `class_prompt`, and `num_class_images`. The `class_prompt` should be the category name as the collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated image as regularization.
 

From 992317ef59b1221bb1383b93a001f74b0653d5c0 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Fri, 7 Apr 2023 19:01:04 -0700
Subject: [PATCH 05/34] custom diffusion update

---
 examples/custom_diffusion/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index d2df94c45884..5bd51296de13 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -48,7 +48,8 @@ write_basic_config()
 
 Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. 
 
-We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset. The real images are similar to the target concept, e.g., cat and prevent overfitting to the the given target cat. The following flags control the regularization loss `with_prior_preservation`, `real_prior`, `prior_loss_weight`, `class_prompt`, and `num_class_images`. The `class_prompt` should be the category name as the collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated image as regularization.
+We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the given target image. The following flags enable the regularization: `with_prior_preservation` and `real_prior`, with `prior_loss_weight=1.`.
+The `class_prompt` should be the same category name as the target image. The collected real images have text captions similar to the `class_prompt`. The retrieved images are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization.
 
 **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
 
@@ -106,7 +107,7 @@ accelerate launch train.py \
 
 ### Inference
 
-Once you have trained a model using the above command, you can run inference simply using the `StableDiffusionPipeline`. Make sure to include the `identifier` (e.g. sks in above example) in your prompt.
+Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \<new1\> in above example) in your prompt.
 
 ```python
 from model_pipeline import CustomDiffusionPipeline

From 7000ea2d89d01ebf2c90fa199d6edb8850c4641b Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Fri, 7 Apr 2023 19:02:50 -0700
Subject: [PATCH 06/34] custom diffusion update

---
 examples/custom_diffusion/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index 5bd51296de13..422d76a4c29c 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -23,6 +23,7 @@ pip install -e .
 Then cd in the example folder and run
 ```bash
 pip install -r requirements.txt
+pip install clip-retrieval 
 ```
 
 And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:

From 52600016039eb20a910b6e3b8b31d49a9f2668d2 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Sun, 9 Apr 2023 12:30:33 -0700
Subject: [PATCH 07/34] custom diffusion update

---
 examples/custom_diffusion/README.md         | 54 ++++++++++++++++++---
 examples/custom_diffusion/model_pipeline.py | 14 +-----
 examples/custom_diffusion/train.py          | 40 ++++++++++++---
 3 files changed, 82 insertions(+), 26 deletions(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index 422d76a4c29c..b7eed93fcb60 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -58,7 +58,7 @@ The `class_prompt` should be the category name same as target image. The collect
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export OUTPUT_DIR="path-to-save-model"
 export INSTANCE_DIR="./data/cat"
-## training script (2 GPUs recommended, requires 27 GB VRAM. Increase --max_train_steps to 500 if training on 1 GPU)
+## launch training script (2 GPUs recommended, increase --max_train_steps to 500 if 1 GPU, or increase --train_batch_size=4)
 
 accelerate launch train.py \
           --pretrained_model_name_or_path=$MODEL_NAME  \
@@ -88,12 +88,12 @@ Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/as
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export OUTPUT_DIR="path-to-save-model"
 
-## launch training script (2 GPUs recommended, increase --max_train_steps to 1000 if 1 GPU)
+## launch training script (2 GPUs recommended, increase --max_train_steps to 1000 if 1 GPU, or increase --train_batch_size=4)
 
 accelerate launch train.py \
           --pretrained_model_name_or_path=$MODEL_NAME  \
           --output_dir=$OUTPUT_DIR \
-          --concepts_list=./assets/concept_list.json \
+          --concepts_list=./concept_list.json \
           --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
           --resolution=512  \
           --train_batch_size=2  \
@@ -105,6 +105,35 @@ accelerate launch train.py \
           --modifier_token "<new1>+<new2>" 
 ```
 
+### Training on human faces
+
+For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+export INSTANCE_DIR="path-to-images"
+
+## launch training script (2 GPUs recommended, increase --max_train_steps to 1000 if 1 GPU, or increase --train_batch_size=4)
+
+CUDA_VISIBLE_DEVICES=1 accelerate launch train.py \
+          --pretrained_model_name_or_path=$MODEL_NAME  \
+          --instance_data_dir=$INSTANCE_DIR \
+          --output_dir=$OUTPUT_DIR \
+          --class_data_dir=./real_reg/samples_person/ \
+          --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+          --class_prompt="person" --num_class_images=200 \
+          --instance_prompt="photo of a <new1> person"  \
+          --resolution=512  \
+          --train_batch_size=2  \
+          --learning_rate=5e-6  \
+          --lr_warmup_steps=0 \
+          --max_train_steps=1000 \
+          --scale_lr --hflip --noaug \
+          --freeze_model crossattn \
+          --modifier_token "<new1>" \
+          --enable_xformers_memory_efficient_attention \
+```
 
 ### Inference
 
@@ -116,7 +145,7 @@ import torch
 
 pipe = CustomDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
 pipe.load_model('<path-to-your-trained-model>/delta.bin')
-image = pipe("<new1> cat sitting in a bucket", num_inference_steps=50, guidance_scale=7.5, eta=1.).images[0]
+image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=7.5, eta=1.).images[0]
 
 image.save("cat.png")
 ```
@@ -129,12 +158,25 @@ You can also perform inference from one of the complete checkpoint saved during
 from diffusers import StableDiffusionPipeline
 import torch
 
-pipe = StableDiffusionPipeline.from_pretrained('<path-to-your-trained-model>/checkpoint-<step>', torch_dtype=torch.float16).to("cuda")
-image = pipe("<new1> cat sitting in a bucket", num_inference_steps=50, guidance_scale=7.5, eta=1.).images[0]
+pipe = StableDiffusionPipeline.from_pretrained('path-to-the-model/checkpoint-<global-step>/', torch_dtype=torch.float16).to("cuda")
+image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=7.5, eta=1.).images[0]
 
 image.save("cat.png")
 ```
 
+### Converting delta.bin to diffusers pipeline
+
+You can also convert the `delta.bin` file saved at the end of training into a complete diffusers pipeline by loading it and calling `save_pretrained` with `all=True`.
+
+```python
+from model_pipeline import CustomDiffusionPipeline
+import torch
+
+pipe = CustomDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
+pipe.load_model('<path-to-your-trained-model>/delta.bin')
+pipe.save_pretrained('<path-to-your-save-model>', all=True)
+```
+
 ### Set grads to none
 
 To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
diff --git a/examples/custom_diffusion/model_pipeline.py b/examples/custom_diffusion/model_pipeline.py
index 3f425414cfab..c5d2097ecc3b 100644
--- a/examples/custom_diffusion/model_pipeline.py
+++ b/examples/custom_diffusion/model_pipeline.py
@@ -220,16 +220,6 @@ def __init__(
                          requires_safety_checker)
 
         # change attn class
-        def change_attn(unet):
-            for layer in unet.children():
-                if type(layer) == CrossAttention:
-                    bound_method = set_use_memory_efficient_attention_xformers.__get__(layer, layer.__class__)
-                    setattr(layer, 'set_use_memory_efficient_attention_xformers', bound_method)
-                else:
-                    change_attn(layer)
-
-        change_attn(self.unet)
-        self.unet.set_attn_processor(CustomDiffusionAttnProcessor())
         self.modifier_token = modifier_token
         self.modifier_token_id = modifier_token_id
 
@@ -280,8 +270,8 @@ def save_pretrained(self, save_path, freeze_model="crossattn_kv", save_text_enco
                         delta_dict['unet'][name] = params.cpu().clone()
                 else:
                     raise ValueError(
-                            "freeze_model argument only supports crossattn_kv or crossattn"
-                        )
+                        "freeze_model argument only supports crossattn_kv or crossattn"
+                    )
             torch.save(delta_dict, save_path)
 
     def load_model(self, save_path, compress=False):
diff --git a/examples/custom_diffusion/train.py b/examples/custom_diffusion/train.py
index 0f17409663c2..5cd2e72ca4c4 100644
--- a/examples/custom_diffusion/train.py
+++ b/examples/custom_diffusion/train.py
@@ -120,7 +120,7 @@ def retrieve(class_prompt, class_images_dir, num_class_images):
 
     while True:
         class_images = client.query(text=class_prompt)
-        if len(class_images) >= num_class_images or num_images > 1e4:
+        if len(class_images) >= factor*num_class_images or num_images > 1e4:
             break
         else:
             num_images = int(factor * num_images)
@@ -259,11 +259,13 @@ def __init__(
         with_prior_preservation=False,
         num_class_images=200,
         hflip=False,
+        aug=True,
     ):
         self.size = size
         self.center_crop = center_crop
         self.tokenizer = tokenizer
         self.interpolation = Image.BILINEAR
+        self.aug = aug
 
         self.instance_images_path = []
         self.class_images_path = []
@@ -332,7 +334,9 @@ def __getitem__(self, index):
         instance_image = self.flip(instance_image)
 
         # apply resize augmentation and create a valid image region mask
-        random_scale = np.random.randint(self.size // 3, self.size + 1) if np.random.uniform() < 0.66 else np.random.randint(int(1.2 * self.size), int(1.4 * self.size))
+        random_scale = self.size
+        if self.aug:
+            random_scale = np.random.randint(self.size // 3, self.size + 1) if np.random.uniform() < 0.66 else np.random.randint(int(1.2 * self.size), int(1.4 * self.size))
         instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation)
 
         if random_scale < 0.6 * self.size:
@@ -646,6 +650,15 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
     )
+    parser.add_argument(
+        "--set_grads_to_none",
+        action="store_true",
+        help=(
+            "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain"
+            " behaviors, so disable this argument if it causes any problems. More info:"
+            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
+        ),
+    )
     parser.add_argument(
         "--modifier_token",
         type=str,
@@ -656,6 +669,7 @@ def parse_args(input_args=None):
         "--initializer_token", type=str, default='ktn+pll+ucd', help="A token to use as initializer word."
     )
     parser.add_argument("--hflip", action="store_true", help="Apply horizontal flip data augmentation.")
+    parser.add_argument("--noaug", action="store_true", help="Dont apply augmentation during data augmentation when this flag is enabled.")
 
     if input_args is not None:
         args = parser.parse_args(input_args)
@@ -837,7 +851,6 @@ def main(args):
         args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
     )
 
-    # We only train the additional adapter LoRA layers
     vae.requires_grad_(False)
     if args.modifier_token is None:
         text_encoder.requires_grad_(False)
@@ -977,7 +990,7 @@ def main(args):
         size=args.resolution,
         center_crop=args.center_crop,
         num_class_images=args.num_class_images,
-        hflip=args.hflip
+        hflip=args.hflip, aug=not args.noaug,
     )
 
     train_dataloader = torch.utils.data.DataLoader(
@@ -1145,7 +1158,7 @@ def main(args):
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                 optimizer.step()
                 lr_scheduler.step()
-                optimizer.zero_grad()
+                optimizer.zero_grad(set_to_none=args.set_grads_to_none)
 
             # Checks if the accelerator has performed an optimization step behind the scenes
             if accelerator.sync_gradients:
@@ -1154,9 +1167,20 @@ def main(args):
 
                 if global_step % args.checkpointing_steps == 0:
                     if accelerator.is_main_process:
+                        pipeline = CustomDiffusionPipeline.from_pretrained(
+                            args.pretrained_model_name_or_path,
+                            unet=accelerator.unwrap_model(unet),
+                            text_encoder=accelerator.unwrap_model(text_encoder),
+                            tokenizer=tokenizer,
+                            revision=args.revision,
+                            modifier_token=args.modifier_token,
+                            modifier_token_id=modifier_token_id,
+                        )
                         save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                        accelerator.save_state(save_path)
+                        pipeline.save_pretrained(save_path, all=True)
                         logger.info(f"Saved state to {save_path}")
+                        del pipeline
+                        torch.cuda.empty_cache()
 
             logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
             progress_bar.set_postfix(**logs)
@@ -1209,7 +1233,7 @@ def main(args):
                 del pipeline
                 torch.cuda.empty_cache()
 
-    # Save the lora layers
+    # Save the updated weights 
     accelerator.wait_for_everyone()
     if accelerator.is_main_process:
         unet = unet.to(torch.float32)
@@ -1223,7 +1247,7 @@ def main(args):
             modifier_token_id=modifier_token_id,
         )
         save_path = os.path.join(args.output_dir, "delta.bin")
-        pipeline.save_pretrained(save_path)
+        pipeline.save_pretrained(save_path, freeze_model=args.freeze_model)
 
         # run inference
         if args.validation_prompt and args.num_validation_images > 0:

From 74bf2888a3b6fc7af7a556b916bac5cddf950853 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Mon, 17 Apr 2023 01:48:47 -0400
Subject: [PATCH 08/34] custom diffusion

---
 examples/custom_diffusion/README.md         |   2 +-
 examples/custom_diffusion/train.py          | 366 ++++++++------------
 src/diffusers/loaders.py                    |  37 +-
 src/diffusers/models/attention_processor.py | 178 ++++++++++
 4 files changed, 362 insertions(+), 221 deletions(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index b7eed93fcb60..cdbf63f28f54 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -74,7 +74,7 @@ accelerate launch train.py \
           --lr_warmup_steps=0 \
           --max_train_steps=250 \
           --scale_lr --hflip  \
-          --modifier_token "<new1>"
+          --modifier_token "<new1>" 
 ```
 
 **Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU).**
diff --git a/examples/custom_diffusion/train.py b/examples/custom_diffusion/train.py
index 5cd2e72ca4c4..0f417b547d73 100644
--- a/examples/custom_diffusion/train.py
+++ b/examples/custom_diffusion/train.py
@@ -18,36 +18,28 @@
 
 import argparse
 import hashlib
+import itertools
+import json
 import logging
 import math
 import os
-import warnings
 import random
-from pathlib import Path
-import json
-import requests
-import itertools
+import warnings
 from io import BytesIO
+from pathlib import Path
 
 import numpy as np
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
+from torch.utils.data import Dataset
+
+import diffusers
+import requests
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
-from huggingface_hub import create_repo
-from huggingface_hub import HfApi
-from packaging import version
-from PIL import Image
-from torch.utils.data import Dataset
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
-from clip_retrieval.clip_client import ClipClient
-
-import diffusers
 from diffusers import (
     AutoencoderKL,
     DDPMScheduler,
@@ -55,15 +47,18 @@
     DPMSolverMultistepScheduler,
     UNet2DConditionModel,
 )
-
-from diffusers.models.cross_attention import CrossAttention
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, AttnProcessor
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version, is_wandb_available
 from diffusers.utils.import_utils import is_xformers_available
-
-# sys.path.append('./')
-from model_pipeline import CustomDiffusionAttnProcessor, CustomDiffusionPipeline, set_use_memory_efficient_attention_xformers
-
+from huggingface_hub import HfApi, create_repo
+from packaging import version
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+import ipdb
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
 check_min_version("0.14.0")
@@ -71,87 +66,11 @@
 logger = get_logger(__name__)
 
 
-def create_custom_diffusion(unet, freeze_model):
-    for name, params in unet.named_parameters():
-        if freeze_model == 'crossattn':
-            if 'attn2' in name:
-                params.requires_grad = True
-                print(name)
-            else:
-                params.requires_grad = False
-        elif freeze_model == "crossattn_kv":
-            if 'attn2.to_k' in name or 'attn2.to_v' in name:
-                params.requires_grad = True
-                print(name)
-            else:
-                params.requires_grad = False
-        else:
-            raise ValueError(
-                "freeze_model argument only supports crossattn_kv or crossattn"
-            )
-
-    # change attn class
-    def change_attn(unet):
-        for layer in unet.children():
-            if type(layer) == CrossAttention:
-                bound_method = set_use_memory_efficient_attention_xformers.__get__(layer, layer.__class__)
-                setattr(layer, 'set_use_memory_efficient_attention_xformers', bound_method)
-            else:
-                change_attn(layer)
-
-    change_attn(unet)
-    unet.set_attn_processor(CustomDiffusionAttnProcessor())
-    return unet
-
-
 def freeze_params(params):
     for param in params:
         param.requires_grad = False
 
 
-def retrieve(class_prompt, class_images_dir, num_class_images):
-    factor = 1.5
-    num_images = int(factor * num_class_images)
-    client = ClipClient(url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
-
-    os.makedirs(f'{class_images_dir}/images', exist_ok=True)
-    if len(list(Path(f'{class_images_dir}/images').iterdir())) >= num_class_images:
-        return
-
-    while True:
-        class_images = client.query(text=class_prompt)
-        if len(class_images) >= factor*num_class_images or num_images > 1e4:
-            break
-        else:
-            num_images = int(factor * num_images)
-            client = ClipClient(url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
-
-    count = 0
-    total = 0
-    pbar = tqdm(desc='downloading real regularization images', total=num_class_images)
-
-    with open(f'{class_images_dir}/caption.txt', 'w') as f1, open(f'{class_images_dir}/urls.txt', 'w') as f2, open(f'{class_images_dir}/images.txt', 'w') as f3:
-        while total < num_class_images:
-            images = class_images[count]
-            count += 1
-            try:
-                img = requests.get(images['url'])
-                if img.status_code == 200:
-                    _ = Image.open(BytesIO(img.content))
-                    with open(f'{class_images_dir}/images/{total}.jpg', 'wb') as f:
-                        f.write(img.content)
-                    f1.write(images['caption'] + '\n')
-                    f2.write(images['url'] + '\n')
-                    f3.write(f'{class_images_dir}/images/{total}.jpg' + '\n')
-                    total += 1
-                    pbar.update(1)
-                else:
-                    continue
-            except:
-                continue
-    return
-
-
 def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_folder=None):
     img_str = ""
     for i, image in enumerate(images):
@@ -372,6 +291,15 @@ def __getitem__(self, index):
         return example
 
 
+def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_dir):
+    logger.info("Saving embeddings")
+    learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight
+    for (x, y) in zip(modifier_token_id, args.modifier_token):
+        learned_embeds_dict = {}
+        learned_embeds_dict[y] = learned_embeds[x]
+        torch.save(learned_embeds_dict, f'{output_dir}/{y}.bin')
+
+
 def parse_args(input_args=None):
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
     parser.add_argument(
@@ -563,7 +491,8 @@ def parse_args(input_args=None):
         "--freeze_model",
         type=str,
         default='crossattn_kv',
-        help="crossattn to enable fine-tuning of all key, value, query matrices",
+        choices=['crossattn_kv', 'crossattn'],
+        help="crossattn to enable fine-tuning of all params in the cross attention",
     )
     parser.add_argument(
         "--lr_scheduler",
@@ -760,10 +689,10 @@ def main(args):
             if not class_images_dir.exists():
                 class_images_dir.mkdir(parents=True, exist_ok=True)
             if args.real_prior:
-                if accelerator.is_main_process:
-                    name = '_'.join(concept['class_prompt'].split())
-                    if not Path(os.path.join(class_images_dir, name)).exists() or len(list(Path(os.path.join(class_images_dir, name)).iterdir())) < args.num_class_images:
-                        retrieve(concept['class_prompt'], class_images_dir, args.num_class_images)
+                assert (class_images_dir / 'images').exists(), print(f"Please run python retrieve.py --target {concept['class_prompt']} --outpath {class_images_dir}")
+                assert len(list((class_images_dir / 'images').iterdir())) == args.num_class_images, print(f"Please run python retrieve.py --target {concept['class_prompt']} --outpath {class_images_dir}")
+                assert (class_images_dir / 'caption.txt').exists(), print(f"Please run python retrieve.py --target {concept['class_prompt']} --outpath {class_images_dir}")
+                assert (class_images_dir / 'images.txt').exists(), print(f"Please run python retrieve.py --target {concept['class_prompt']} --outpath {class_images_dir}")
                 concept['class_prompt'] = os.path.join(class_images_dir, 'caption.txt')
                 concept['class_data_dir'] = os.path.join(class_images_dir, 'images.txt')
                 args.concepts_list[i] = concept
@@ -851,11 +780,56 @@ def main(args):
         args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
     )
 
+    # Adding a modifier token which is optimized ####
+    # Code taken from https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py
+    modifier_token_id = []
+    initializer_token_id = []
+    if args.modifier_token is not None:
+        args.modifier_token = args.modifier_token.split('+')
+        args.initializer_token = args.initializer_token.split('+')
+        if len(args.modifier_token) > len(args.initializer_token):
+            raise ValueError("You must specify + separated initializer token for each modifier token.")
+        for modifier_token, initializer_token in zip(args.modifier_token, args.initializer_token[:len(args.modifier_token)]):
+            # Add the placeholder token in tokenizer
+            num_added_tokens = tokenizer.add_tokens(modifier_token)
+            if num_added_tokens == 0:
+                raise ValueError(
+                    f"The tokenizer already contains the token {modifier_token}. Please pass a different"
+                    " `modifier_token` that is not already in the tokenizer."
+                )
+
+            # Convert the initializer_token, placeholder_token to ids
+            token_ids = tokenizer.encode([initializer_token], add_special_tokens=False)
+            print(token_ids)
+            # Check if initializer_token is a single token or a sequence of tokens
+            if len(token_ids) > 1:
+                raise ValueError("The initializer token must be a single token.")
+
+            initializer_token_id.append(token_ids[0])
+            modifier_token_id.append(tokenizer.convert_tokens_to_ids(modifier_token))
+
+        # Resize the token embeddings as we are adding new special tokens to the tokenizer
+        text_encoder.resize_token_embeddings(len(tokenizer))
+
+        # Initialise the newly added placeholder token with the embeddings of the initializer token
+        token_embeds = text_encoder.get_input_embeddings().weight.data
+        for (x, y) in zip(modifier_token_id, initializer_token_id):
+            token_embeds[x] = token_embeds[y]
+
+        # Freeze all parameters except for the token embeddings in text encoder
+        params_to_freeze = itertools.chain(
+            text_encoder.text_model.encoder.parameters(),
+            text_encoder.text_model.final_layer_norm.parameters(),
+            text_encoder.text_model.embeddings.position_embedding.parameters(),
+        )
+        freeze_params(params_to_freeze)
+    ########################################################
+    ########################################################
+
     vae.requires_grad_(False)
     if args.modifier_token is None:
         text_encoder.requires_grad_(False)
-    unet = create_custom_diffusion(unet, args.freeze_model)
-
+    unet.requires_grad_(False)
     # For mixed precision training we cast the text_encoder and vae weights to half-precision
     # as these models are only used for inference, keeping weights in full precision is not required.
     weight_dtype = torch.float32
@@ -864,10 +838,56 @@ def main(args):
     elif accelerator.mixed_precision == "bf16":
         weight_dtype = torch.bfloat16
 
+    # now we will add new Custom Diffusion weights to the attention layers
+    # It's important to realize here how many attention weights will be added and of which sizes
+    # The sizes of the attention layers consist only of two different variables:
+    # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
+    # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.
+
+    # Let's first see how many attention processors we will have to set.
+    # For Stable Diffusion, it should be equal to:
+    # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
+    # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
+    # - up blocks (2x attention layers) * (3x transformer layers) * (3x up blocks) = 18
+    # => 32 layers
+
+    # Only train key, value projection layers if freeze_model = 'crossattn_kv' else train all params in the cross attention layer
+    train_kv = True
+    train_q_out = False if args.freeze_model == 'crossattn_kv' else True
+    custom_diffusion_attn_procs = {}
+
+    st = unet.state_dict()
+    for name, attn in unet.attn_processors.items():
+        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = unet.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = unet.config.block_out_channels[block_id]
+        layer_name = name.split('.processor')[0]
+        # ipdb.set_trace()
+        weights = {'to_k': st[layer_name + '.to_k.weight'],
+                    'to_v': st[layer_name + '.to_v.weight'],
+                    'to_q': st[layer_name + '.to_q.weight'],
+                    'to_out.weight': st[layer_name + '.to_out.0.weight'],
+                    'to_out.bias': st[layer_name + '.to_out.0.bias'], }
+        if cross_attention_dim is not None:
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=train_kv, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+        else:
+            custom_diffusion_attn_procs[name] = attn
+    del st
+    unet.set_attn_processor(custom_diffusion_attn_procs)
+    custom_diffusion_layers = AttnProcsLayers({y: x for (y, x) in unet.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)})
+
+    accelerator.register_for_checkpointing(custom_diffusion_layers)
+
     # Move unet, vae and text_encoder to device and cast to weight_dtype
-    if accelerator.mixed_precision != "fp16":
-        unet.to(accelerator.device, dtype=weight_dtype)
+    if accelerator.mixed_precision != "fp16" and args.modifier_token is not None:
         text_encoder.to(accelerator.device, dtype=weight_dtype)
+    unet.to(accelerator.device, dtype=weight_dtype)
     vae.to(accelerator.device, dtype=weight_dtype)
 
     if args.enable_xformers_memory_efficient_attention:
@@ -912,70 +932,9 @@ def main(args):
     else:
         optimizer_class = torch.optim.AdamW
 
-    # Adding a modifier token which is optimized ####
-    # Code taken from https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py
-    modifier_token_id = []
-    initializer_token_id = []
-    if args.modifier_token is not None:
-        args.modifier_token = args.modifier_token.split('+')
-        args.initializer_token = args.initializer_token.split('+')
-        if len(args.modifier_token) > len(args.initializer_token):
-            raise ValueError("You must specify + separated initializer token for each modifier token.")
-        for modifier_token, initializer_token in zip(args.modifier_token, args.initializer_token[:len(args.modifier_token)]):
-            # Add the placeholder token in tokenizer
-            num_added_tokens = tokenizer.add_tokens(modifier_token)
-            if num_added_tokens == 0:
-                raise ValueError(
-                    f"The tokenizer already contains the token {modifier_token}. Please pass a different"
-                    " `modifier_token` that is not already in the tokenizer."
-                )
-
-            # Convert the initializer_token, placeholder_token to ids
-            token_ids = tokenizer.encode([initializer_token], add_special_tokens=False)
-            print(token_ids)
-            # Check if initializer_token is a single token or a sequence of tokens
-            if len(token_ids) > 1:
-                raise ValueError("The initializer token must be a single token.")
-
-            initializer_token_id.append(token_ids[0])
-            modifier_token_id.append(tokenizer.convert_tokens_to_ids(modifier_token))
-
-        # Resize the token embeddings as we are adding new special tokens to the tokenizer
-        text_encoder.resize_token_embeddings(len(tokenizer))
-
-        # Initialise the newly added placeholder token with the embeddings of the initializer token
-        token_embeds = text_encoder.get_input_embeddings().weight.data
-        for (x, y) in zip(modifier_token_id, initializer_token_id):
-            token_embeds[x] = token_embeds[y]
-
-        # Freeze all parameters except for the token embeddings in text encoder
-        params_to_freeze = itertools.chain(
-            text_encoder.text_model.encoder.parameters(),
-            text_encoder.text_model.final_layer_norm.parameters(),
-            text_encoder.text_model.embeddings.position_embedding.parameters(),
-        )
-        freeze_params(params_to_freeze)
-
-        if args.freeze_model == 'crossattn':
-            params_to_optimize = itertools.chain(text_encoder.get_input_embeddings().parameters() , [x[1] for x in unet.named_parameters() if 'attn2' in x[0]])
-        else:
-            params_to_optimize = itertools.chain(text_encoder.get_input_embeddings().parameters() , [x[1] for x in unet.named_parameters() if ('attn2.to_k' in x[0] or 'attn2.to_v' in x[0])])
-
-    ########################################################
-    ########################################################
-    else:
-        if args.freeze_model == 'crossattn':
-            params_to_optimize = (
-                itertools.chain([x[1] for x in unet.named_parameters() if 'attn2' in x[0]])
-            )
-        else:
-            params_to_optimize = (
-                itertools.chain([x[1] for x in unet.named_parameters() if ('attn2.to_k' in x[0] or 'attn2.to_v' in x[0])])
-            )
-
     # Optimizer creation
     optimizer = optimizer_class(
-        params_to_optimize,
+        itertools.chain(text_encoder.get_input_embeddings().parameters(), custom_diffusion_layers.parameters()) if args.modifier_token is not None else custom_diffusion_layers.parameters(),
         lr=args.learning_rate,
         betas=(args.adam_beta1, args.adam_beta2),
         weight_decay=args.adam_weight_decay,
@@ -1017,12 +976,12 @@ def main(args):
 
     # Prepare everything with our `accelerator`.
     if args.modifier_token is not None:
-        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        custom_diffusion_layers, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            custom_diffusion_layers, text_encoder, optimizer, train_dataloader, lr_scheduler
         )
     else:
-        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            unet, optimizer, train_dataloader, lr_scheduler
+        custom_diffusion_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            custom_diffusion_layers, optimizer, train_dataloader, lr_scheduler
         )
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
@@ -1051,7 +1010,7 @@ def main(args):
         if args.resume_from_checkpoint != "latest":
             path = os.path.basename(args.resume_from_checkpoint)
         else:
-            # Get the mos recent checkpoint
+            # Get the most recent checkpoint
             dirs = os.listdir(args.output_dir)
             dirs = [d for d in dirs if d.startswith("checkpoint")]
             dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
@@ -1086,7 +1045,7 @@ def main(args):
                     progress_bar.update(1)
                 continue
 
-            with accelerator.accumulate(unet):
+            with accelerator.accumulate(unet), accelerator.accumulate(text_encoder):
                 # Convert images to latent space
                 latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
                 latents = latents * vae.config.scaling_factor
@@ -1134,7 +1093,6 @@ def main(args):
                     mask = batch["mask"]
                     loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
                     loss = ((loss * mask).sum([1, 2, 3]) / mask.sum([1, 2, 3])).mean()
-
                 accelerator.backward(loss)
                 # Zero out the gradients for all token embeddings except the newly added
                 # embeddings for the concept, as we only want to optimize the concept embeddings
@@ -1150,11 +1108,7 @@ def main(args):
                     grads_text_encoder.data[index_grads_to_zero, :] = grads_text_encoder.data[index_grads_to_zero, :].fill_(0)
 
                 if accelerator.sync_gradients:
-                    params_to_clip = (
-                        itertools.chain([x[1] for x in unet.named_parameters() if ('attn2' in x[0])], text_encoder.parameters())
-                        if args.modifier_token is not None
-                        else itertools.chain([x[1] for x in unet.named_parameters() if ('attn2' in x[0])]) 
-                    )
+                    params_to_clip = itertools.chain(text_encoder.parameters(), custom_diffusion_layers.parameters()) if args.modifier_token is not None else custom_diffusion_layers.parameters()
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                 optimizer.step()
                 lr_scheduler.step()
@@ -1167,20 +1121,9 @@ def main(args):
 
                 if global_step % args.checkpointing_steps == 0:
                     if accelerator.is_main_process:
-                        pipeline = CustomDiffusionPipeline.from_pretrained(
-                            args.pretrained_model_name_or_path,
-                            unet=accelerator.unwrap_model(unet),
-                            text_encoder=accelerator.unwrap_model(text_encoder),
-                            tokenizer=tokenizer,
-                            revision=args.revision,
-                            modifier_token=args.modifier_token,
-                            modifier_token_id=modifier_token_id,
-                        )
                         save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
-                        pipeline.save_pretrained(save_path, all=True)
+                        accelerator.save_state(save_path)
                         logger.info(f"Saved state to {save_path}")
-                        del pipeline
-                        torch.cuda.empty_cache()
 
             logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
             progress_bar.set_postfix(**logs)
@@ -1196,14 +1139,12 @@ def main(args):
                     f" {args.validation_prompt}."
                 )
                 # create pipeline
-                pipeline = CustomDiffusionPipeline.from_pretrained(
+                pipeline = DiffusionPipeline.from_pretrained(
                     args.pretrained_model_name_or_path,
                     unet=accelerator.unwrap_model(unet),
                     text_encoder=accelerator.unwrap_model(text_encoder),
                     tokenizer=tokenizer,
                     revision=args.revision,
-                    modifier_token=args.modifier_token,
-                    modifier_token_id=modifier_token_id,
                 )
                 pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
                 pipeline = pipeline.to(accelerator.device)
@@ -1233,30 +1174,29 @@ def main(args):
                 del pipeline
                 torch.cuda.empty_cache()
 
-    # Save the updated weights 
+    # Save the custom diffusion layers
     accelerator.wait_for_everyone()
     if accelerator.is_main_process:
         unet = unet.to(torch.float32)
-        pipeline = CustomDiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path,
-            unet=accelerator.unwrap_model(unet),
-            text_encoder=accelerator.unwrap_model(text_encoder),
-            tokenizer=tokenizer,
-            revision=args.revision,
-            modifier_token=args.modifier_token,
-            modifier_token_id=modifier_token_id,
+        unet.save_attn_procs(args.output_dir)
+        save_new_embed(text_encoder, modifier_token_id, accelerator, args, args.output_dir)
+
+        # Final inference
+        # Load previous pipeline
+        pipeline = DiffusionPipeline.from_pretrained(
+            args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
         )
-        save_path = os.path.join(args.output_dir, "delta.bin")
-        pipeline.save_pretrained(save_path, freeze_model=args.freeze_model)
+        pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+        pipeline = pipeline.to(accelerator.device)
+
+        # load attention processors
+        pipeline.unet.load_attn_procs(args.output_dir, weight_name="pytorch_custom_diffusion_weights.bin")
+        for token in args.modifier_token:
+            pipeline.load_textual_inversion(args.output_dir, weight_name=f'{token}.bin')
 
         # run inference
         if args.validation_prompt and args.num_validation_images > 0:
-            pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
-            pipeline = pipeline.to(accelerator.device)
-            pipeline.set_progress_bar_config(disable=True)
-
-            # run inference
-            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
+            generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
             images = [
                 pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.).images[0]
                 for _ in range(args.num_validation_images)
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index a262833938e7..e77f34a4bbf3 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -17,7 +17,7 @@
 
 import torch
 
-from .models.attention_processor import LoRAAttnProcessor
+from .models.attention_processor import CustomDiffusionAttnProcessor, LoRAAttnProcessor
 from .utils import (
     DIFFUSERS_CACHE,
     HF_HUB_OFFLINE,
@@ -45,6 +45,9 @@
 TEXT_INVERSION_NAME = "learned_embeds.bin"
 TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors"
 
+CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin"
+CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors"
+
 
 class AttnProcsLayers(torch.nn.Module):
     def __init__(self, state_dict: Dict[str, torch.Tensor]):
@@ -213,6 +216,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         attn_processors = {}
 
         is_lora = all("lora" in k for k in state_dict.keys())
+        is_custom_diffusion = all("custom_diffusion" in k for k in state_dict.keys())
 
         if is_lora:
             lora_grouped_dict = defaultdict(dict)
@@ -229,9 +233,25 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
                     hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank
                 )
                 attn_processors[key].load_state_dict(value_dict)
-
+        elif is_custom_diffusion:
+            custom_diffusion_grouped_dict = defaultdict(dict)
+            for key, value in state_dict.items():
+                if 'to_out' in key:
+                    attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+                else:
+                    attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
+                custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value
+
+            for key, value_dict in custom_diffusion_grouped_dict.items():
+                cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1]
+                hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
+                train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
+                attn_processors[key] = CustomDiffusionAttnProcessor(
+                    weights=None, train_kv=True, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
+                )
+                attn_processors[key].load_state_dict(value_dict)
         else:
-            raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.")
+            raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training.")
 
         # set correct dtype & device
         attn_processors = {k: v.to(device=self.device, dtype=self.dtype) for k, v in attn_processors.items()}
@@ -285,16 +305,19 @@ def save_function(weights, filename):
 
         os.makedirs(save_directory, exist_ok=True)
 
-        model_to_save = AttnProcsLayers(self.attn_processors)
-
+        is_custom_diffusion = any(isinstance(x, CustomDiffusionAttnProcessor) for (_, x) in self.attn_processors.items())
+        if is_custom_diffusion:
+            model_to_save = AttnProcsLayers({y: x for (y, x) in self.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)})
+        else:
+            model_to_save = AttnProcsLayers(self.attn_processors)
         # Save the model
         state_dict = model_to_save.state_dict()
 
         if weight_name is None:
             if safe_serialization:
-                weight_name = LORA_WEIGHT_NAME_SAFE
+                weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else LORA_WEIGHT_NAME_SAFE
             else:
-                weight_name = LORA_WEIGHT_NAME
+                weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else LORA_WEIGHT_NAME
 
         # Save the model
         save_function(state_dict, os.path.join(save_directory, weight_name))
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 30026cd89ff9..132da4860a25 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -116,6 +116,9 @@ def set_use_memory_efficient_attention_xformers(
         is_lora = hasattr(self, "processor") and isinstance(
             self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor)
         )
+        is_custom_diffusion = hasattr(self, "processor") and isinstance(
+            self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)
+        )
 
         if use_memory_efficient_attention_xformers:
             if self.added_kv_proj_dim is not None:
@@ -159,6 +162,20 @@ def set_use_memory_efficient_attention_xformers(
                 )
                 processor.load_state_dict(self.processor.state_dict())
                 processor.to(self.processor.to_q_lora.up.weight.device)
+            elif is_custom_diffusion:
+                if hasattr(self.processor, "to_k_custom_diffusion"):
+                    processor = CustomDiffusionXFormersAttnProcessor(
+                        train_kv=self.processor.train_kv,
+                        train_q_out=self.processor.train_q_out,
+                        hidden_size=self.processor.hidden_size,
+                        cross_attention_dim=self.processor.cross_attention_dim,
+                        attention_op=attention_op,
+                    )
+                    processor.load_state_dict(self.processor.state_dict())
+                    print(self.processor.to_k_custom_diffusion.weight.device, "device")
+                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
+                else:
+                    processor = XFormersAttnProcessor(attention_op=attention_op)
             else:
                 processor = XFormersAttnProcessor(attention_op=attention_op)
         else:
@@ -170,6 +187,8 @@ def set_use_memory_efficient_attention_xformers(
                 )
                 processor.load_state_dict(self.processor.state_dict())
                 processor.to(self.processor.to_q_lora.up.weight.device)
+            elif is_custom_diffusion:
+                processor = self.processor
             else:
                 processor = AttnProcessor()
 
@@ -395,6 +414,85 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
         return hidden_states
 
 
+class CustomDiffusionAttnProcessor(nn.Module):
+    def __init__(self, weights=None, train_kv=True, train_q_out=True, hidden_size=None, cross_attention_dim=None, out_bias=True, dropout=0.0,):
+        super().__init__()
+        self.train_kv = train_kv
+        self.train_q_out = train_q_out
+
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+
+        # `_custom_diffusion` id for easy serialization and loading.
+        if self.train_kv:
+            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
+            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
+            if weights is not None:
+                with torch.no_grad():
+                    self.to_k_custom_diffusion.weight.copy_(weights['to_k'])
+                    self.to_v_custom_diffusion.weight.copy_(weights['to_v'])
+        if self.train_q_out:
+            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size)
+            self.to_out_custom_diffusion = nn.ModuleList([])
+            self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
+            self.to_out_custom_diffusion.append(nn.Dropout(dropout))
+            if weights is not None:
+                with torch.no_grad():
+                    self.to_q_custom_diffusion.weight.copy_(weights['to_q'])
+                    self.to_out_custom_diffusion[0].weight.copy_(weights['to_out.weight'])
+                    self.to_out_custom_diffusion[0].bias.copy_(weights['to_out.bias'])
+
+    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if self.train_q_out:
+            query = self.to_q_custom_diffusion(hidden_states)
+        else:
+            query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            crossattn = False
+            encoder_hidden_states = hidden_states
+        else:
+            crossattn = True
+            if attn.cross_attention_norm:
+                encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
+
+        if self.train_kv:
+            key = self.to_k_custom_diffusion(encoder_hidden_states)
+            value = self.to_v_custom_diffusion(encoder_hidden_states)
+        else:
+            key = attn.to_k(encoder_hidden_states)
+            value = attn.to_v(encoder_hidden_states)
+
+        if crossattn:
+            detach = torch.ones_like(key)
+            detach[:, :1, :] = detach[:, :1, :] * 0.0
+            key = detach * key + (1 - detach) * key.detach()
+            value = detach * value + (1 - detach) * value.detach()
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        if self.train_q_out:
+            # linear proj
+            hidden_states = self.to_out_custom_diffusion[0](hidden_states)
+            # dropout
+            hidden_states = self.to_out_custom_diffusion[1](hidden_states)
+        else:
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
 class AttnAddedKVProcessor:
     def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
         residual = hidden_states
@@ -567,6 +665,84 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
         return hidden_states
 
 
+class CustomDiffusionXFormersAttnProcessor(nn.Module):
+    def __init__(self, train_kv=True, train_q_out=False, hidden_size=None, cross_attention_dim=None, out_bias=True, dropout=0.0, attention_op: Optional[Callable] = None):
+        super().__init__()
+        self.train_kv = train_kv
+        self.train_q_out = train_q_out
+
+        self.attention_op = attention_op
+        self.hidden_size = hidden_size
+        self.cross_attention_dim = cross_attention_dim
+
+        # `_custom_diffusion` id for easy serialization and loading.
+        if self.train_kv:
+            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
+            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
+        if self.train_q_out:
+            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size)
+            self.to_out_custom_diffusion = nn.ModuleList([])
+            self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
+            self.to_out_custom_diffusion.append(nn.Dropout(dropout))
+
+    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        batch_size, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        print(hidden_states.device)
+        if self.train_q_out:
+            print("to_q_custom", self.to_q_custom_diffusion.weight.device)
+            query = self.to_q_custom_diffusion(hidden_states)
+        else:
+            print("to_q", attn.to_q.weight.device)
+            query = attn.to_q(hidden_states)
+
+        if encoder_hidden_states is None:
+            crossattn = False
+            encoder_hidden_states = hidden_states
+        else:
+            crossattn = True
+            if attn.cross_attention_norm:
+                encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
+
+        if self.train_kv:
+            print("to_k_custom", self.to_k_custom_diffusion.weight.device)
+            key = self.to_k_custom_diffusion(encoder_hidden_states)
+            value = self.to_v_custom_diffusion(encoder_hidden_states)
+        else:
+            print("to_k", attn.to_k.weight.device)
+            key = attn.to_k(encoder_hidden_states)
+            value = attn.to_v(encoder_hidden_states)
+
+        if crossattn:
+            detach = torch.ones_like(key)
+            detach[:, :1, :] = detach[:, :1, :] * 0.0
+            key = detach * key + (1 - detach) * key.detach()
+            value = detach * value + (1 - detach) * value.detach()
+
+        query = attn.head_to_batch_dim(query).contiguous()
+        key = attn.head_to_batch_dim(key).contiguous()
+        value = attn.head_to_batch_dim(value).contiguous()
+
+        hidden_states = xformers.ops.memory_efficient_attention(
+            query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
+        )
+        hidden_states = hidden_states.to(query.dtype)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        if self.train_q_out:
+            # linear proj
+            hidden_states = self.to_out_custom_diffusion[0](hidden_states)
+            # dropout
+            hidden_states = self.to_out_custom_diffusion[1](hidden_states)
+        else:
+            # linear proj
+            hidden_states = attn.to_out[0](hidden_states)
+            # dropout
+            hidden_states = attn.to_out[1](hidden_states)
+
+        return hidden_states
+
+
 class SlicedAttnProcessor:
     def __init__(self, slice_size):
         self.slice_size = slice_size
@@ -692,4 +868,6 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None,
     SlicedAttnAddedKVProcessor,
     LoRAAttnProcessor,
     LoRAXFormersAttnProcessor,
+    CustomDiffusionAttnProcessor,
+    CustomDiffusionXFormersAttnProcessor,
 ]

From 37cf524456da65cd2be641e9e513d4f068ea787c Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Mon, 17 Apr 2023 02:54:54 -0400
Subject: [PATCH 09/34] custom diffusion

---
 examples/custom_diffusion/README.md         |  16 +-
 examples/custom_diffusion/model_pipeline.py | 303 --------------------
 examples/custom_diffusion/train.py          |   2 +-
 src/diffusers/loaders.py                    |  39 ++-
 src/diffusers/models/attention_processor.py |  24 +-
 5 files changed, 43 insertions(+), 341 deletions(-)
 delete mode 100644 examples/custom_diffusion/model_pipeline.py

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index cdbf63f28f54..74e3bfeb1a01 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -140,11 +140,12 @@ CUDA_VISIBLE_DEVICES=1 accelerate launch train.py \
 Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \<new1\> in above example) in your prompt.
 
 ```python
-from model_pipeline import CustomDiffusionPipeline
+from diffusers import DiffusionPipeline
 import torch
 
-pipe = CustomDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
-pipe.load_model('<path-to-your-trained-model>/delta.bin')
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
+pipe.unet.load_attn_procs('path-to-save-model', weight_name='pytorch_custom_diffusion_weights.bin')
+pipe.load_textual_inversion('path-to-save-model', weight_name='<new1>.bin')
 image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=7.5, eta=1.).images[0]
 
 image.save("cat.png")
@@ -169,12 +170,13 @@ image.save("cat.png")
 You can also perform inference from one of the complete checkpoint saved during the training process, if you used the `--checkpointing_steps` argument. 
 
 ```python
-from model_pipeline import CustomDiffusionPipeline
+from diffusers import DiffusionPipeline
 import torch
 
-pipe = CustomDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
-pipe.load_model('<path-to-your-trained-model>/delta.bin')
-pipe.save_pretrained('<path-to-your-save-model>', all=True)
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
+pipe.unet.load_attn_procs('path-to-save-model', weight_name='pytorch_custom_diffusion_weights.bin')
+pipe.load_textual_inversion('path-to-save-model', weight_name='<new1>.bin')
+pipe.save_pretrained('<path-to-your-save-model>')
 ```
 
 ### Set grads to none
diff --git a/examples/custom_diffusion/model_pipeline.py b/examples/custom_diffusion/model_pipeline.py
deleted file mode 100644
index c5d2097ecc3b..000000000000
--- a/examples/custom_diffusion/model_pipeline.py
+++ /dev/null
@@ -1,303 +0,0 @@
-# This code is built from the Huggingface repository: https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py, and
-# https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py
-
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2023 Custom Diffusion authors and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-from typing import Callable, Optional
-import torch
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-from accelerate.logging import get_logger
-
-from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.schedulers.scheduling_utils import SchedulerMixin
-from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from diffusers.models.cross_attention import CrossAttention
-from diffusers.utils.import_utils import is_xformers_available
-
-if is_xformers_available():
-    import xformers
-    import xformers.ops
-else:
-    xformers = None
-
-logger = get_logger(__name__)
-
-
-def set_use_memory_efficient_attention_xformers(
-    self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
-):
-    if use_memory_efficient_attention_xformers:
-        if self.added_kv_proj_dim is not None:
-            # TODO(Anton, Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
-            # which uses this type of cross attention ONLY because the attention mask of format
-            # [0, ..., -10.000, ..., 0, ...,] is not supported
-            raise NotImplementedError(
-                "Memory efficient attention with `xformers` is currently not supported when"
-                " `self.added_kv_proj_dim` is defined."
-            )
-        elif not is_xformers_available():
-            raise ModuleNotFoundError(
-                (
-                    "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
-                    " xformers"
-                ),
-                name="xformers",
-            )
-        elif not torch.cuda.is_available():
-            raise ValueError(
-                "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
-                " only available for GPU "
-            )
-        else:
-            try:
-                # Make sure we can run the memory efficient attention
-                _ = xformers.ops.memory_efficient_attention(
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
-                    torch.randn((1, 2, 40), device="cuda"),
-                )
-            except Exception as e:
-                raise e
-
-        processor = CustomDiffusionXFormersAttnProcessor(attention_op=attention_op)
-    else:
-        processor = CustomDiffusionAttnProcessor()
-
-    self.set_processor(processor)
-
-
-class CustomDiffusionAttnProcessor:
-    def __call__(
-        self,
-        attn: CrossAttention,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
-    ):
-        batch_size, sequence_length, _ = hidden_states.shape
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-        query = attn.to_q(hidden_states)
-
-        crossattn = False
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        else:
-            crossattn = True
-            if attn.cross_attention_norm:
-                encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-        if crossattn:
-            detach = torch.ones_like(key)
-            detach[:, :1, :] = detach[:, :1, :] * 0.
-            key = detach * key + (1 - detach) * key.detach()
-            value = detach * value + (1 - detach) * value.detach()
-
-        query = attn.head_to_batch_dim(query)
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        attention_probs = attn.get_attention_scores(query, key, attention_mask)
-        hidden_states = torch.bmm(attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        return hidden_states
-
-
-class CustomDiffusionXFormersAttnProcessor:
-    def __init__(self, attention_op: Optional[Callable] = None):
-        self.attention_op = attention_op
-
-    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
-        batch_size, sequence_length, _ = hidden_states.shape
-
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        query = attn.to_q(hidden_states)
-
-        crossattn = False
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        else:
-            crossattn = True
-            if attn.cross_attention_norm:
-                encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-        if crossattn:
-            detach = torch.ones_like(key)
-            detach[:, :1, :] = detach[:, :1, :] * 0.
-            key = detach * key + (1 - detach) * key.detach()
-            value = detach * value + (1 - detach) * value.detach()
-
-        query = attn.head_to_batch_dim(query).contiguous()
-        key = attn.head_to_batch_dim(key).contiguous()
-        value = attn.head_to_batch_dim(value).contiguous()
-
-        hidden_states = xformers.ops.memory_efficient_attention(
-            query, key, value, attn_bias=attention_mask, op=self.attention_op
-        )
-        hidden_states = hidden_states.to(query.dtype)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-        return hidden_states
-
-
-class CustomDiffusionPipeline(StableDiffusionPipeline):
-    r"""
-    Pipeline for custom diffusion model.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.).
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents.
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
-        feature_extractor ([`CLIPFeatureExtractor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-        modifier_token: list of new modifier tokens added or to be added to text_encoder
-        modifier_token_id: list of id of new modifier tokens added or to be added to text_encoder
-    """
-    _optional_components = ["safety_checker", "feature_extractor", "modifier_token"]
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: SchedulerMixin,
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-        requires_safety_checker: bool = True,
-        modifier_token: list = [],
-        modifier_token_id: list = [],
-    ):
-        super().__init__(vae,
-                         text_encoder,
-                         tokenizer,
-                         unet,
-                         scheduler,
-                         safety_checker,
-                         feature_extractor,
-                         requires_safety_checker)
-
-        # change attn class
-        self.modifier_token = modifier_token
-        self.modifier_token_id = modifier_token_id
-
-    def add_token(self, initializer_token):
-        initializer_token_id = []
-        for modifier_token_, initializer_token_ in zip(self.modifier_token, initializer_token):
-            # Add the placeholder token in tokenizer
-            num_added_tokens = self.tokenizer.add_tokens(modifier_token_)
-            if num_added_tokens == 0:
-                raise ValueError(
-                    f"The tokenizer already contains the token {modifier_token_}. Please pass a different"
-                    " `modifier_token` that is not already in the tokenizer."
-                )
-
-            # Convert the initializer_token, placeholder_token to ids
-            token_ids = self.tokenizer.encode([initializer_token_], add_special_tokens=False)
-            # Check if initializer_token is a single token or a sequence of tokens
-            if len(token_ids) > 1:
-                raise ValueError("The initializer token must be a single token.")
-
-            self.modifier_token_id.append(self.tokenizer.convert_tokens_to_ids(modifier_token_))
-            initializer_token_id.append(token_ids[0])
-        # Resize the token embeddings as we are adding new special tokens to the tokenizer
-        self.text_encoder.resize_token_embeddings(len(self.tokenizer))
-
-        # Initialise the newly added placeholder token with the embeddings of the initializer token
-        token_embeds = self.text_encoder.get_input_embeddings().weight.data
-        for (x, y) in zip(self.modifier_token_id, initializer_token_id):
-            token_embeds[x] = token_embeds[y]
-
-    def save_pretrained(self, save_path, freeze_model="crossattn_kv", save_text_encoder=False, all=False):
-        if all:
-            super().save_pretrained(save_path)
-        else:
-            delta_dict = {'unet': {}, 'modifier_token': {}}
-            if self.modifier_token is not None:
-                for i in range(len(self.modifier_token_id)):
-                    learned_embeds = self.text_encoder.get_input_embeddings().weight[self.modifier_token_id[i]]
-                    delta_dict['modifier_token'][self.modifier_token[i]] = learned_embeds.detach().cpu()
-            if save_text_encoder:
-                delta_dict['text_encoder'] = self.text_encoder.state_dict()
-            for name, params in self.unet.named_parameters():
-                if freeze_model == "crossattn":
-                    if 'attn2' in name:
-                        delta_dict['unet'][name] = params.cpu().clone()
-                elif freeze_model == "crossattn_kv":
-                    if 'attn2.to_k' in name or 'attn2.to_v' in name:
-                        delta_dict['unet'][name] = params.cpu().clone()
-                else:
-                    raise ValueError(
-                        "freeze_model argument only supports crossattn_kv or crossattn"
-                    )
-            torch.save(delta_dict, save_path)
-
-    def load_model(self, save_path, compress=False):
-        st = torch.load(save_path)
-        if 'text_encoder' in st:
-            self.text_encoder.load_state_dict(st['text_encoder'])
-        if 'modifier_token' in st:
-            modifier_tokens = list(st['modifier_token'].keys())
-            modifier_token_id = []
-            for modifier_token in modifier_tokens:
-                num_added_tokens = self.tokenizer.add_tokens(modifier_token)
-                if num_added_tokens == 0:
-                    raise ValueError(
-                        f"The tokenizer already contains the token {modifier_token}. Please pass a different"
-                        " `modifier_token` that is not already in the tokenizer."
-                    )
-                modifier_token_id.append(self.tokenizer.convert_tokens_to_ids(modifier_token))
-            # Resize the token embeddings as we are adding new special tokens to the tokenizer
-            self.text_encoder.resize_token_embeddings(len(self.tokenizer))
-            token_embeds = self.text_encoder.get_input_embeddings().weight.data
-            for i, id_ in enumerate(modifier_token_id):
-                token_embeds[id_] = st['modifier_token'][modifier_tokens[i]]
-
-        for name, params in self.unet.named_parameters():
-            if 'attn2' in name:
-                if compress and ('to_k' in name or 'to_v' in name):
-                    params.data += st['unet'][name]['u']@st['unet'][name]['v']
-                elif name in st['unet']:
-                    params.data.copy_(st['unet'][f'{name}'])
diff --git a/examples/custom_diffusion/train.py b/examples/custom_diffusion/train.py
index 0f417b547d73..25fe192bb761 100644
--- a/examples/custom_diffusion/train.py
+++ b/examples/custom_diffusion/train.py
@@ -877,7 +877,7 @@ def main(args):
         if cross_attention_dim is not None:
             custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=train_kv, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
         else:
-            custom_diffusion_attn_procs[name] = attn
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=False, train_q_out=False, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) #attn
     del st
     unet.set_attn_processor(custom_diffusion_attn_procs)
     custom_diffusion_layers = AttnProcsLayers({y: x for (y, x) in unet.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)})
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index e77f34a4bbf3..974d6494500d 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -216,7 +216,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         attn_processors = {}
 
         is_lora = all("lora" in k for k in state_dict.keys())
-        is_custom_diffusion = all("custom_diffusion" in k for k in state_dict.keys())
+        is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
 
         if is_lora:
             lora_grouped_dict = defaultdict(dict)
@@ -236,20 +236,28 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
         elif is_custom_diffusion:
             custom_diffusion_grouped_dict = defaultdict(dict)
             for key, value in state_dict.items():
-                if 'to_out' in key:
-                    attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+                if len(value) == 0:
+                    custom_diffusion_grouped_dict[key] = {}
                 else:
-                    attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
-                custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value
+                    if 'to_out' in key:
+                        attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+                    else:
+                        attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
+                    custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value
 
             for key, value_dict in custom_diffusion_grouped_dict.items():
-                cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1]
-                hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
-                train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
-                attn_processors[key] = CustomDiffusionAttnProcessor(
-                    weights=None, train_kv=True, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
-                )
-                attn_processors[key].load_state_dict(value_dict)
+                if len(value_dict) == 0:
+                    attn_processors[key] = CustomDiffusionAttnProcessor(
+                        weights=None, train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None
+                    )
+                else:
+                    cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1]
+                    hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
+                    train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
+                    attn_processors[key] = CustomDiffusionAttnProcessor(
+                        weights=None, train_kv=True, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
+                    )
+                    attn_processors[key].load_state_dict(value_dict)
         else:
             raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training.")
 
@@ -308,10 +316,13 @@ def save_function(weights, filename):
         is_custom_diffusion = any(isinstance(x, CustomDiffusionAttnProcessor) for (_, x) in self.attn_processors.items())
         if is_custom_diffusion:
             model_to_save = AttnProcsLayers({y: x for (y, x) in self.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)})
+            state_dict = model_to_save.state_dict()
+            for name, attn in self.attn_processors.items():
+                if len(attn.state_dict()) == 0:
+                    state_dict[name] = {}
         else:
             model_to_save = AttnProcsLayers(self.attn_processors)
-        # Save the model
-        state_dict = model_to_save.state_dict()
+            state_dict = model_to_save.state_dict()
 
         if weight_name is None:
             if safe_serialization:
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 132da4860a25..3daea8d46908 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -163,19 +163,16 @@ def set_use_memory_efficient_attention_xformers(
                 processor.load_state_dict(self.processor.state_dict())
                 processor.to(self.processor.to_q_lora.up.weight.device)
             elif is_custom_diffusion:
+                processor = CustomDiffusionXFormersAttnProcessor(
+                    train_kv=self.processor.train_kv,
+                    train_q_out=self.processor.train_q_out,
+                    hidden_size=self.processor.hidden_size,
+                    cross_attention_dim=self.processor.cross_attention_dim,
+                    attention_op=attention_op,
+                )
+                processor.load_state_dict(self.processor.state_dict())
                 if hasattr(self.processor, "to_k_custom_diffusion"):
-                    processor = CustomDiffusionXFormersAttnProcessor(
-                        train_kv=self.processor.train_kv,
-                        train_q_out=self.processor.train_q_out,
-                        hidden_size=self.processor.hidden_size,
-                        cross_attention_dim=self.processor.cross_attention_dim,
-                        attention_op=attention_op,
-                    )
-                    processor.load_state_dict(self.processor.state_dict())
-                    print(self.processor.to_k_custom_diffusion.weight.device, "device")
                     processor.to(self.processor.to_k_custom_diffusion.weight.device)
-                else:
-                    processor = XFormersAttnProcessor(attention_op=attention_op)
             else:
                 processor = XFormersAttnProcessor(attention_op=attention_op)
         else:
@@ -688,12 +685,9 @@ def __init__(self, train_kv=True, train_q_out=False, hidden_size=None, cross_att
     def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
         batch_size, sequence_length, _ = hidden_states.shape
         attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-        print(hidden_states.device)
         if self.train_q_out:
-            print("to_q_custom", self.to_q_custom_diffusion.weight.device)
             query = self.to_q_custom_diffusion(hidden_states)
         else:
-            print("to_q", attn.to_q.weight.device)
             query = attn.to_q(hidden_states)
 
         if encoder_hidden_states is None:
@@ -705,11 +699,9 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
                 encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
 
         if self.train_kv:
-            print("to_k_custom", self.to_k_custom_diffusion.weight.device)
             key = self.to_k_custom_diffusion(encoder_hidden_states)
             value = self.to_v_custom_diffusion(encoder_hidden_states)
         else:
-            print("to_k", attn.to_k.weight.device)
             key = attn.to_k(encoder_hidden_states)
             value = attn.to_v(encoder_hidden_states)
 

From 14818cb8902530d9a1c9570f8d0887d36e0680ec Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Mon, 17 Apr 2023 10:18:35 -0400
Subject: [PATCH 10/34] custom diffusion

---
 examples/custom_diffusion/retrieve.py | 71 +++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 examples/custom_diffusion/retrieve.py

diff --git a/examples/custom_diffusion/retrieve.py b/examples/custom_diffusion/retrieve.py
new file mode 100644
index 000000000000..877cab638b8f
--- /dev/null
+++ b/examples/custom_diffusion/retrieve.py
@@ -0,0 +1,71 @@
+import argparse
+import os
+from io import BytesIO
+from pathlib import Path
+
+import requests
+from clip_retrieval.clip_client import ClipClient
+from PIL import Image
+from tqdm import tqdm
+
+
+def retrieve(class_prompt, class_images_dir, num_class_images):
+    factor = 1.5
+    num_images = int(factor * num_class_images)
+    client = ClipClient(url="https://knn.laion.ai/knn-service",
+                        indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
+
+    os.makedirs(f'{class_images_dir}/images', exist_ok=True)
+    if len(list(Path(f'{class_images_dir}/images').iterdir())) >= num_class_images:
+        return
+
+    while True:
+        class_images = client.query(text=class_prompt)
+        if len(class_images) >= factor * num_class_images or num_images > 1e4:
+            break
+        else:
+            num_images = int(factor * num_images)
+            client = ClipClient(url="https://knn.laion.ai/knn-service",
+                                indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
+
+    count = 0
+    total = 0
+    pbar = tqdm(desc='downloading real regularization images',
+                total=num_class_images)
+
+    with open(f'{class_images_dir}/caption.txt', 'w') as f1, open(f'{class_images_dir}/urls.txt', 'w') as f2, open(f'{class_images_dir}/images.txt', 'w') as f3:
+        while total < num_class_images:
+            images = class_images[count]
+            count += 1
+            try:
+                img = requests.get(images['url'])
+                if img.status_code == 200:
+                    _ = Image.open(BytesIO(img.content))
+                    with open(f'{class_images_dir}/images/{total}.jpg', 'wb') as f:
+                        f.write(img.content)
+                    f1.write(images['caption'] + '\n')
+                    f2.write(images['url'] + '\n')
+                    f3.write(f'{class_images_dir}/images/{total}.jpg' + '\n')
+                    total += 1
+                    pbar.update(1)
+                else:
+                    continue
+            except:
+                continue
+    return
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('', add_help=False)
+    parser.add_argument('--class_prompt', help='text prompt to retrieve images', required=True,
+                        type=str)
+    parser.add_argument('--class_images_dir', help='path to save images', required=True,
+                        type=str)
+    parser.add_argument('--num_class_images', help='number of images to download', default=200,
+                        type=int)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    retrieve(args.class_prompt, args.class_images_dir, args.num_class_images)

From f9218b02f5dd1bb614d10223eb45d5561515b424 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Mon, 17 Apr 2023 23:30:30 -0400
Subject: [PATCH 11/34] custom diffusion

---
 examples/custom_diffusion/README.md        | 21 +++++++++++-
 examples/custom_diffusion/requirements.txt |  6 ++++
 examples/custom_diffusion/retrieve.py      | 16 ++++-----
 examples/custom_diffusion/train.py         | 39 ++++++++++------------
 4 files changed, 52 insertions(+), 30 deletions(-)
 create mode 100644 examples/custom_diffusion/requirements.txt

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index 74e3bfeb1a01..397cfe4c4978 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -50,7 +50,12 @@ write_basic_config()
 Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. 
 
 We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`. 
-The `class_prompt` should be the category name same as target image. The collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization.
+The `class_prompt` should be the same category name as the target image. The collected real images have text captions similar to the `class_prompt`. The retrieved images are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images, run this command before training.
+
+```
+pip install clip-retrieval
+python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --num_class_images 200
+```
 
 **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
 
@@ -84,6 +89,14 @@ accelerate launch train.py \
 
 Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py).
 
+To collect the real images, run this command for each concept in the JSON file.
+
+```
+pip install clip-retrieval
+python retrieve.py --class_prompt {} --class_data_dir {} --num_class_images 200
+```
+
+
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export OUTPUT_DIR="path-to-save-model"
@@ -108,6 +121,12 @@ accelerate launch train.py \
 ### Training on human faces
 
 For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with atleast 15-20 images. 
+To collect the real images, run this command before training.
+
+```
+pip install clip-retrieval
+python retrieve.py --class_prompt person --class_data_dir real_reg/samples_person --num_class_images 200
+```
 
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
diff --git a/examples/custom_diffusion/requirements.txt b/examples/custom_diffusion/requirements.txt
new file mode 100644
index 000000000000..7d93f3d03bd8
--- /dev/null
+++ b/examples/custom_diffusion/requirements.txt
@@ -0,0 +1,6 @@
+accelerate
+torchvision
+transformers>=4.25.1
+ftfy
+tensorboard
+Jinja2
diff --git a/examples/custom_diffusion/retrieve.py b/examples/custom_diffusion/retrieve.py
index 877cab638b8f..c705d586086b 100644
--- a/examples/custom_diffusion/retrieve.py
+++ b/examples/custom_diffusion/retrieve.py
@@ -9,14 +9,14 @@
 from tqdm import tqdm
 
 
-def retrieve(class_prompt, class_images_dir, num_class_images):
+def retrieve(class_prompt, class_data_dir, num_class_images):
     factor = 1.5
     num_images = int(factor * num_class_images)
     client = ClipClient(url="https://knn.laion.ai/knn-service",
                         indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
 
-    os.makedirs(f'{class_images_dir}/images', exist_ok=True)
-    if len(list(Path(f'{class_images_dir}/images').iterdir())) >= num_class_images:
+    os.makedirs(f'{class_data_dir}/images', exist_ok=True)
+    if len(list(Path(f'{class_data_dir}/images').iterdir())) >= num_class_images:
         return
 
     while True:
@@ -33,7 +33,7 @@ def retrieve(class_prompt, class_images_dir, num_class_images):
     pbar = tqdm(desc='downloading real regularization images',
                 total=num_class_images)
 
-    with open(f'{class_images_dir}/caption.txt', 'w') as f1, open(f'{class_images_dir}/urls.txt', 'w') as f2, open(f'{class_images_dir}/images.txt', 'w') as f3:
+    with open(f'{class_data_dir}/caption.txt', 'w') as f1, open(f'{class_data_dir}/urls.txt', 'w') as f2, open(f'{class_data_dir}/images.txt', 'w') as f3:
         while total < num_class_images:
             images = class_images[count]
             count += 1
@@ -41,11 +41,11 @@ def retrieve(class_prompt, class_images_dir, num_class_images):
                 img = requests.get(images['url'])
                 if img.status_code == 200:
                     _ = Image.open(BytesIO(img.content))
-                    with open(f'{class_images_dir}/images/{total}.jpg', 'wb') as f:
+                    with open(f'{class_data_dir}/images/{total}.jpg', 'wb') as f:
                         f.write(img.content)
                     f1.write(images['caption'] + '\n')
                     f2.write(images['url'] + '\n')
-                    f3.write(f'{class_images_dir}/images/{total}.jpg' + '\n')
+                    f3.write(f'{class_data_dir}/images/{total}.jpg' + '\n')
                     total += 1
                     pbar.update(1)
                 else:
@@ -59,7 +59,7 @@ def parse_args():
     parser = argparse.ArgumentParser('', add_help=False)
     parser.add_argument('--class_prompt', help='text prompt to retrieve images', required=True,
                         type=str)
-    parser.add_argument('--class_images_dir', help='path to save images', required=True,
+    parser.add_argument('--class_data_dir', help='path to save images', required=True,
                         type=str)
     parser.add_argument('--num_class_images', help='number of images to download', default=200,
                         type=int)
@@ -68,4 +68,4 @@ def parse_args():
 
 if __name__ == "__main__":
     args = parse_args()
-    retrieve(args.class_prompt, args.class_images_dir, args.num_class_images)
+    retrieve(args.class_prompt, args.class_data_dir, args.num_class_images)
diff --git a/examples/custom_diffusion/train.py b/examples/custom_diffusion/train.py
index 25fe192bb761..d77a326e4c8f 100644
--- a/examples/custom_diffusion/train.py
+++ b/examples/custom_diffusion/train.py
@@ -25,7 +25,6 @@
 import os
 import random
 import warnings
-from io import BytesIO
 from pathlib import Path
 
 import numpy as np
@@ -35,7 +34,6 @@
 from torch.utils.data import Dataset
 
 import diffusers
-import requests
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
@@ -48,7 +46,7 @@
     UNet2DConditionModel,
 )
 from diffusers.loaders import AttnProcsLayers
-from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, AttnProcessor
+from diffusers.models.attention_processor import CustomDiffusionAttnProcessor
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version, is_wandb_available
 from diffusers.utils.import_utils import is_xformers_available
@@ -58,7 +56,7 @@
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import AutoTokenizer, PretrainedConfig
-import ipdb
+
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
 check_min_version("0.14.0")
@@ -689,10 +687,10 @@ def main(args):
             if not class_images_dir.exists():
                 class_images_dir.mkdir(parents=True, exist_ok=True)
             if args.real_prior:
-                assert (class_images_dir / 'images').exists(), print(f"Please run python retrieve.py --target {concept['class_prompt']} --outpath {class_images_dir}")
-                assert len(list((class_images_dir / 'images').iterdir())) == args.num_class_images, print(f"Please run python retrieve.py --target {concept['class_prompt']} --outpath {class_images_dir}")
-                assert (class_images_dir / 'caption.txt').exists(), print(f"Please run python retrieve.py --target {concept['class_prompt']} --outpath {class_images_dir}")
-                assert (class_images_dir / 'images.txt').exists(), print(f"Please run python retrieve.py --target {concept['class_prompt']} --outpath {class_images_dir}")
+                assert (class_images_dir / 'images').exists(), print(f"Please run: python retrieve.py --class_prompt {concept['class_prompt']} --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}")
+                assert len(list((class_images_dir / 'images').iterdir())) == args.num_class_images, print(f"Please run: python retrieve.py --class_prompt {concept['class_prompt']} --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}")
+                assert (class_images_dir / 'caption.txt').exists(), print(f"Please run: python retrieve.py --class_prompt {concept['class_prompt']} --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}")
+                assert (class_images_dir / 'images.txt').exists(), print(f"Please run: python retrieve.py --class_prompt {concept['class_prompt']} --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}")
                 concept['class_prompt'] = os.path.join(class_images_dir, 'caption.txt')
                 concept['class_data_dir'] = os.path.join(class_images_dir, 'images.txt')
                 args.concepts_list[i] = concept
@@ -838,6 +836,12 @@ def main(args):
     elif accelerator.mixed_precision == "bf16":
         weight_dtype = torch.bfloat16
 
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    if accelerator.mixed_precision != "fp16" and args.modifier_token is not None:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+    unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+
     # now we will add new Custom Diffusion weights to the attention layers
     # It's important to realize here how many attention weights will be added and of which sizes
     # The sizes of the attention layers consist only of two different variables:
@@ -868,28 +872,21 @@ def main(args):
             block_id = int(name[len("down_blocks.")])
             hidden_size = unet.config.block_out_channels[block_id]
         layer_name = name.split('.processor')[0]
-        # ipdb.set_trace()
         weights = {'to_k': st[layer_name + '.to_k.weight'],
-                    'to_v': st[layer_name + '.to_v.weight'],
-                    'to_q': st[layer_name + '.to_q.weight'],
-                    'to_out.weight': st[layer_name + '.to_out.0.weight'],
-                    'to_out.bias': st[layer_name + '.to_out.0.bias'], }
+                   'to_v': st[layer_name + '.to_v.weight'],
+                   'to_q': st[layer_name + '.to_q.weight'],
+                   'to_out.weight': st[layer_name + '.to_out.0.weight'],
+                   'to_out.bias': st[layer_name + '.to_out.0.bias'], }
         if cross_attention_dim is not None:
-            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=train_kv, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=train_kv, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim).to(unet.device)
         else:
-            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=False, train_q_out=False, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) #attn
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=False, train_q_out=False, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)  # attn
     del st
     unet.set_attn_processor(custom_diffusion_attn_procs)
     custom_diffusion_layers = AttnProcsLayers({y: x for (y, x) in unet.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)})
 
     accelerator.register_for_checkpointing(custom_diffusion_layers)
 
-    # Move unet, vae and text_encoder to device and cast to weight_dtype
-    if accelerator.mixed_precision != "fp16" and args.modifier_token is not None:
-        text_encoder.to(accelerator.device, dtype=weight_dtype)
-    unet.to(accelerator.device, dtype=weight_dtype)
-    vae.to(accelerator.device, dtype=weight_dtype)
-
     if args.enable_xformers_memory_efficient_attention:
         if is_xformers_available():
             import xformers

From e26597e2d33e39a9d9067b0972374a263e094509 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Tue, 18 Apr 2023 00:36:51 -0400
Subject: [PATCH 12/34] custom diffusion

---
 examples/custom_diffusion/README.md |  6 +++---
 examples/custom_diffusion/train.py  |  8 ++++----
 src/diffusers/loaders.py            | 10 +++++++---
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index 397cfe4c4978..be3444c52ec3 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -165,7 +165,7 @@ import torch
 pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
 pipe.unet.load_attn_procs('path-to-save-model', weight_name='pytorch_custom_diffusion_weights.bin')
 pipe.load_textual_inversion('path-to-save-model', weight_name='<new1>.bin')
-image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=7.5, eta=1.).images[0]
+image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=6., eta=1.).images[0]
 
 image.save("cat.png")
 ```
@@ -178,8 +178,8 @@ You can also perform inference from one of the complete checkpoint saved during
 from diffusers import StableDiffusionPipeline
 import torch
 
-pipe = StableDiffusionPipeline.from_pretrained('path-to-the-model/checkpoint-<global-step>/', torch_dtype=torch.float16).to("cuda")
-image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=7.5, eta=1.).images[0]
+pipe = StableDiffusionPipeline.from_pretrained('path-to-the-model/checkpoint-250/', torch_dtype=torch.float16).to("cuda")
+image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=6., eta=1.).images[0]
 
 image.save("cat.png")
 ```
diff --git a/examples/custom_diffusion/train.py b/examples/custom_diffusion/train.py
index d77a326e4c8f..8f7690c14c4d 100644
--- a/examples/custom_diffusion/train.py
+++ b/examples/custom_diffusion/train.py
@@ -687,10 +687,10 @@ def main(args):
             if not class_images_dir.exists():
                 class_images_dir.mkdir(parents=True, exist_ok=True)
             if args.real_prior:
-                assert (class_images_dir / 'images').exists(), print(f"Please run: python retrieve.py --class_prompt {concept['class_prompt']} --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}")
-                assert len(list((class_images_dir / 'images').iterdir())) == args.num_class_images, print(f"Please run: python retrieve.py --class_prompt {concept['class_prompt']} --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}")
-                assert (class_images_dir / 'caption.txt').exists(), print(f"Please run: python retrieve.py --class_prompt {concept['class_prompt']} --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}")
-                assert (class_images_dir / 'images.txt').exists(), print(f"Please run: python retrieve.py --class_prompt {concept['class_prompt']} --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}")
+                assert (class_images_dir / 'images').exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert len(list((class_images_dir / 'images').iterdir())) == args.num_class_images, f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert (class_images_dir / 'caption.txt').exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert (class_images_dir / 'images.txt').exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
                 concept['class_prompt'] = os.path.join(class_images_dir, 'caption.txt')
                 concept['class_data_dir'] = os.path.join(class_images_dir, 'images.txt')
                 args.concepts_list[i] = concept
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index 974d6494500d..a4718e077d76 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -17,7 +17,11 @@
 
 import torch
 
-from .models.attention_processor import CustomDiffusionAttnProcessor, LoRAAttnProcessor
+from .models.attention_processor import (
+    CustomDiffusionAttnProcessor,
+    CustomDiffusionXFormersAttnProcessor,
+    LoRAAttnProcessor,
+)
 from .utils import (
     DIFFUSERS_CACHE,
     HF_HUB_OFFLINE,
@@ -313,9 +317,9 @@ def save_function(weights, filename):
 
         os.makedirs(save_directory, exist_ok=True)
 
-        is_custom_diffusion = any(isinstance(x, CustomDiffusionAttnProcessor) for (_, x) in self.attn_processors.items())
+        is_custom_diffusion = any(isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) for (_, x) in self.attn_processors.items())
         if is_custom_diffusion:
-            model_to_save = AttnProcsLayers({y: x for (y, x) in self.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)})
+            model_to_save = AttnProcsLayers({y: x for (y, x) in self.attn_processors.items() if isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))})
             state_dict = model_to_save.state_dict()
             for name, attn in self.attn_processors.items():
                 if len(attn.state_dict()) == 0:

From 80e03fd24a93d7b41c799137e6c4d336f89d58df Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 10:21:34 +0530
Subject: [PATCH 13/34] apply formatting and get rid of bare except.

---
 examples/custom_diffusion/retrieve.py         |  47 ++---
 .../{train.py => train_custom_diffusion.py}   | 170 +++++++++++-------
 src/diffusers/loaders.py                      |  25 ++-
 src/diffusers/models/attention_processor.py   |  32 +++-
 4 files changed, 176 insertions(+), 98 deletions(-)
 rename examples/custom_diffusion/{train.py => train_custom_diffusion.py} (90%)

diff --git a/examples/custom_diffusion/retrieve.py b/examples/custom_diffusion/retrieve.py
index c705d586086b..74db82764a38 100644
--- a/examples/custom_diffusion/retrieve.py
+++ b/examples/custom_diffusion/retrieve.py
@@ -12,11 +12,12 @@
 def retrieve(class_prompt, class_data_dir, num_class_images):
     factor = 1.5
     num_images = int(factor * num_class_images)
-    client = ClipClient(url="https://knn.laion.ai/knn-service",
-                        indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
+    client = ClipClient(
+        url="https://knn.laion.ai/knn-service", indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1
+    )
 
-    os.makedirs(f'{class_data_dir}/images', exist_ok=True)
-    if len(list(Path(f'{class_data_dir}/images').iterdir())) >= num_class_images:
+    os.makedirs(f"{class_data_dir}/images", exist_ok=True)
+    if len(list(Path(f"{class_data_dir}/images").iterdir())) >= num_class_images:
         return
 
     while True:
@@ -25,44 +26,46 @@ def retrieve(class_prompt, class_data_dir, num_class_images):
             break
         else:
             num_images = int(factor * num_images)
-            client = ClipClient(url="https://knn.laion.ai/knn-service",
-                                indice_name="laion_400m", num_images=num_images, aesthetic_weight=0.1)
+            client = ClipClient(
+                url="https://knn.laion.ai/knn-service",
+                indice_name="laion_400m",
+                num_images=num_images,
+                aesthetic_weight=0.1,
+            )
 
     count = 0
     total = 0
-    pbar = tqdm(desc='downloading real regularization images',
-                total=num_class_images)
+    pbar = tqdm(desc="downloading real regularization images", total=num_class_images)
 
-    with open(f'{class_data_dir}/caption.txt', 'w') as f1, open(f'{class_data_dir}/urls.txt', 'w') as f2, open(f'{class_data_dir}/images.txt', 'w') as f3:
+    with open(f"{class_data_dir}/caption.txt", "w") as f1, open(f"{class_data_dir}/urls.txt", "w") as f2, open(
+        f"{class_data_dir}/images.txt", "w"
+    ) as f3:
         while total < num_class_images:
             images = class_images[count]
             count += 1
             try:
-                img = requests.get(images['url'])
+                img = requests.get(images["url"])
                 if img.status_code == 200:
                     _ = Image.open(BytesIO(img.content))
-                    with open(f'{class_data_dir}/images/{total}.jpg', 'wb') as f:
+                    with open(f"{class_data_dir}/images/{total}.jpg", "wb") as f:
                         f.write(img.content)
-                    f1.write(images['caption'] + '\n')
-                    f2.write(images['url'] + '\n')
-                    f3.write(f'{class_data_dir}/images/{total}.jpg' + '\n')
+                    f1.write(images["caption"] + "\n")
+                    f2.write(images["url"] + "\n")
+                    f3.write(f"{class_data_dir}/images/{total}.jpg" + "\n")
                     total += 1
                     pbar.update(1)
                 else:
                     continue
-            except:
+            except Exception:
                 continue
     return
 
 
 def parse_args():
-    parser = argparse.ArgumentParser('', add_help=False)
-    parser.add_argument('--class_prompt', help='text prompt to retrieve images', required=True,
-                        type=str)
-    parser.add_argument('--class_data_dir', help='path to save images', required=True,
-                        type=str)
-    parser.add_argument('--num_class_images', help='number of images to download', default=200,
-                        type=int)
+    parser = argparse.ArgumentParser("", add_help=False)
+    parser.add_argument("--class_prompt", help="text prompt to retrieve images", required=True, type=str)
+    parser.add_argument("--class_data_dir", help="path to save images", required=True, type=str)
+    parser.add_argument("--num_class_images", help="number of images to download", default=200, type=int)
     return parser.parse_args()
 
 
diff --git a/examples/custom_diffusion/train.py b/examples/custom_diffusion/train_custom_diffusion.py
similarity index 90%
rename from examples/custom_diffusion/train.py
rename to examples/custom_diffusion/train_custom_diffusion.py
index 8f7690c14c4d..e616a822c9e6 100644
--- a/examples/custom_diffusion/train.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -31,13 +31,19 @@
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
-from torch.utils.data import Dataset
-
-import diffusers
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
+from huggingface_hub import HfApi, create_repo
+from packaging import version
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+
+import diffusers
 from diffusers import (
     AutoencoderKL,
     DDPMScheduler,
@@ -50,12 +56,6 @@
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version, is_wandb_available
 from diffusers.utils.import_utils import is_xformers_available
-from huggingface_hub import HfApi, create_repo
-from packaging import version
-from PIL import Image
-from torchvision import transforms
-from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
@@ -136,11 +136,7 @@ def collate_fn(examples, with_prior_preservation):
     pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
     mask = mask.to(memory_format=torch.contiguous_format).float()
 
-    batch = {
-        "input_ids": input_ids,
-        "pixel_values": pixel_values,
-        "mask": mask.unsqueeze(1)
-    }
+    batch = {"input_ids": input_ids, "pixel_values": pixel_values, "mask": mask.unsqueeze(1)}
     return batch
 
 
@@ -188,7 +184,9 @@ def __init__(
         self.class_images_path = []
         self.with_prior_preservation = with_prior_preservation
         for concept in concepts_list:
-            inst_img_path = [(x, concept["instance_prompt"]) for x in Path(concept["instance_data_dir"]).iterdir() if x.is_file()]
+            inst_img_path = [
+                (x, concept["instance_prompt"]) for x in Path(concept["instance_data_dir"]).iterdir() if x.is_file()
+            ]
             self.instance_images_path.extend(inst_img_path)
 
             if with_prior_preservation:
@@ -235,11 +233,11 @@ def preprocess(self, image, scale, resample):
         instance_image = np.zeros((self.size, self.size, 3), dtype=np.float32)
         mask = np.zeros((self.size // 8, self.size // 8))
         if scale > self.size:
-            instance_image = image[top: top + inner, left: left + inner, :]
+            instance_image = image[top : top + inner, left : left + inner, :]
             mask = np.ones((self.size // 8, self.size // 8))
         else:
-            instance_image[top: top + inner, left: left + inner, :] = image
-            mask[top // 8 + 1: (top + scale) // 8 - 1, left // 8 + 1: (left + scale) // 8 - 1] = 1.
+            instance_image[top : top + inner, left : left + inner, :] = image
+            mask[top // 8 + 1 : (top + scale) // 8 - 1, left // 8 + 1 : (left + scale) // 8 - 1] = 1.0
         return instance_image, mask
 
     def __getitem__(self, index):
@@ -253,7 +251,11 @@ def __getitem__(self, index):
         # apply resize augmentation and create a valid image region mask
         random_scale = self.size
         if self.aug:
-            random_scale = np.random.randint(self.size // 3, self.size + 1) if np.random.uniform() < 0.66 else np.random.randint(int(1.2 * self.size), int(1.4 * self.size))
+            random_scale = (
+                np.random.randint(self.size // 3, self.size + 1)
+                if np.random.uniform() < 0.66
+                else np.random.randint(int(1.2 * self.size), int(1.4 * self.size))
+            )
         instance_image, mask = self.preprocess(instance_image, random_scale, self.interpolation)
 
         if random_scale < 0.6 * self.size:
@@ -292,10 +294,10 @@ def __getitem__(self, index):
 def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_dir):
     logger.info("Saving embeddings")
     learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight
-    for (x, y) in zip(modifier_token_id, args.modifier_token):
+    for x, y in zip(modifier_token_id, args.modifier_token):
         learned_embeds_dict = {}
         learned_embeds_dict[y] = learned_embeds[x]
-        torch.save(learned_embeds_dict, f'{output_dir}/{y}.bin')
+        torch.save(learned_embeds_dict, f"{output_dir}/{y}.bin")
 
 
 def parse_args(input_args=None):
@@ -488,8 +490,8 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--freeze_model",
         type=str,
-        default='crossattn_kv',
-        choices=['crossattn_kv', 'crossattn'],
+        default="crossattn_kv",
+        choices=["crossattn_kv", "crossattn"],
         help="crossattn to enable fine-tuning of all params in the cross attention",
     )
     parser.add_argument(
@@ -593,10 +595,14 @@ def parse_args(input_args=None):
         help="A token to use as a modifier for the concept.",
     )
     parser.add_argument(
-        "--initializer_token", type=str, default='ktn+pll+ucd', help="A token to use as initializer word."
+        "--initializer_token", type=str, default="ktn+pll+ucd", help="A token to use as initializer word."
     )
     parser.add_argument("--hflip", action="store_true", help="Apply horizontal flip data augmentation.")
-    parser.add_argument("--noaug", action="store_true", help="Dont apply augmentation during data augmentation when this flag is enabled.")
+    parser.add_argument(
+        "--noaug",
+        action="store_true",
+        help="Dont apply augmentation during data augmentation when this flag is enabled.",
+    )
 
     if input_args is not None:
         args = parser.parse_args(input_args)
@@ -673,7 +679,7 @@ def main(args):
                 "instance_prompt": args.instance_prompt,
                 "class_prompt": args.class_prompt,
                 "instance_data_dir": args.instance_data_dir,
-                "class_data_dir": args.class_data_dir
+                "class_data_dir": args.class_data_dir,
             }
         ]
     else:
@@ -683,16 +689,24 @@ def main(args):
     # Generate class images if prior preservation is enabled.
     if args.with_prior_preservation:
         for i, concept in enumerate(args.concepts_list):
-            class_images_dir = Path(concept['class_data_dir'])
+            class_images_dir = Path(concept["class_data_dir"])
             if not class_images_dir.exists():
                 class_images_dir.mkdir(parents=True, exist_ok=True)
             if args.real_prior:
-                assert (class_images_dir / 'images').exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
-                assert len(list((class_images_dir / 'images').iterdir())) == args.num_class_images, f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
-                assert (class_images_dir / 'caption.txt').exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
-                assert (class_images_dir / 'images.txt').exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
-                concept['class_prompt'] = os.path.join(class_images_dir, 'caption.txt')
-                concept['class_data_dir'] = os.path.join(class_images_dir, 'images.txt')
+                assert (
+                    class_images_dir / "images"
+                ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert (
+                    len(list((class_images_dir / "images").iterdir())) == args.num_class_images
+                ), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert (
+                    class_images_dir / "caption.txt"
+                ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                assert (
+                    class_images_dir / "images.txt"
+                ).exists(), f"Please run: python retrieve.py --class_prompt \"{concept['class_prompt']}\" --class_data_dir {class_images_dir} --num_class_images {args.num_class_images}"
+                concept["class_prompt"] = os.path.join(class_images_dir, "caption.txt")
+                concept["class_data_dir"] = os.path.join(class_images_dir, "images.txt")
                 args.concepts_list[i] = concept
                 accelerator.wait_for_everyone()
             else:
@@ -724,13 +738,17 @@ def main(args):
                     pipeline.to(accelerator.device)
 
                     for example in tqdm(
-                        sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
+                        sample_dataloader,
+                        desc="Generating class images",
+                        disable=not accelerator.is_local_main_process,
                     ):
                         images = pipeline(example["prompt"]).images
 
                         for i, image in enumerate(images):
                             hash_image = hashlib.sha1(image.tobytes()).hexdigest()
-                            image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                            image_filename = (
+                                class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                            )
                             image.save(image_filename)
 
                     del pipeline
@@ -783,11 +801,13 @@ def main(args):
     modifier_token_id = []
     initializer_token_id = []
     if args.modifier_token is not None:
-        args.modifier_token = args.modifier_token.split('+')
-        args.initializer_token = args.initializer_token.split('+')
+        args.modifier_token = args.modifier_token.split("+")
+        args.initializer_token = args.initializer_token.split("+")
         if len(args.modifier_token) > len(args.initializer_token):
             raise ValueError("You must specify + separated initializer token for each modifier token.")
-        for modifier_token, initializer_token in zip(args.modifier_token, args.initializer_token[:len(args.modifier_token)]):
+        for modifier_token, initializer_token in zip(
+            args.modifier_token, args.initializer_token[: len(args.modifier_token)]
+        ):
             # Add the placeholder token in tokenizer
             num_added_tokens = tokenizer.add_tokens(modifier_token)
             if num_added_tokens == 0:
@@ -811,7 +831,7 @@ def main(args):
 
         # Initialise the newly added placeholder token with the embeddings of the initializer token
         token_embeds = text_encoder.get_input_embeddings().weight.data
-        for (x, y) in zip(modifier_token_id, initializer_token_id):
+        for x, y in zip(modifier_token_id, initializer_token_id):
             token_embeds[x] = token_embeds[y]
 
         # Freeze all parameters except for the token embeddings in text encoder
@@ -857,7 +877,7 @@ def main(args):
 
     # Only train key, value projection layers if freeze_model = 'crossattn_kv' else train all params in the cross attention layer
     train_kv = True
-    train_q_out = False if args.freeze_model == 'crossattn_kv' else True
+    train_q_out = False if args.freeze_model == "crossattn_kv" else True
     custom_diffusion_attn_procs = {}
 
     st = unet.state_dict()
@@ -871,19 +891,35 @@ def main(args):
         elif name.startswith("down_blocks"):
             block_id = int(name[len("down_blocks.")])
             hidden_size = unet.config.block_out_channels[block_id]
-        layer_name = name.split('.processor')[0]
-        weights = {'to_k': st[layer_name + '.to_k.weight'],
-                   'to_v': st[layer_name + '.to_v.weight'],
-                   'to_q': st[layer_name + '.to_q.weight'],
-                   'to_out.weight': st[layer_name + '.to_out.0.weight'],
-                   'to_out.bias': st[layer_name + '.to_out.0.bias'], }
+        layer_name = name.split(".processor")[0]
+        weights = {
+            "to_k": st[layer_name + ".to_k.weight"],
+            "to_v": st[layer_name + ".to_v.weight"],
+            "to_q": st[layer_name + ".to_q.weight"],
+            "to_out.weight": st[layer_name + ".to_out.0.weight"],
+            "to_out.bias": st[layer_name + ".to_out.0.bias"],
+        }
         if cross_attention_dim is not None:
-            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=train_kv, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim).to(unet.device)
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
+                weights,
+                train_kv=train_kv,
+                train_q_out=train_q_out,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            ).to(unet.device)
         else:
-            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(weights, train_kv=False, train_q_out=False, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)  # attn
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
+                weights,
+                train_kv=False,
+                train_q_out=False,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            )  # attn
     del st
     unet.set_attn_processor(custom_diffusion_attn_procs)
-    custom_diffusion_layers = AttnProcsLayers({y: x for (y, x) in unet.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)})
+    custom_diffusion_layers = AttnProcsLayers(
+        {y: x for (y, x) in unet.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)}
+    )
 
     accelerator.register_for_checkpointing(custom_diffusion_layers)
 
@@ -914,7 +950,7 @@ def main(args):
             args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
         )
         if args.with_prior_preservation:
-            args.learning_rate = args.learning_rate * 2.
+            args.learning_rate = args.learning_rate * 2.0
 
     # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
     if args.use_8bit_adam:
@@ -931,7 +967,9 @@ def main(args):
 
     # Optimizer creation
     optimizer = optimizer_class(
-        itertools.chain(text_encoder.get_input_embeddings().parameters(), custom_diffusion_layers.parameters()) if args.modifier_token is not None else custom_diffusion_layers.parameters(),
+        itertools.chain(text_encoder.get_input_embeddings().parameters(), custom_diffusion_layers.parameters())
+        if args.modifier_token is not None
+        else custom_diffusion_layers.parameters(),
         lr=args.learning_rate,
         betas=(args.adam_beta1, args.adam_beta2),
         weight_decay=args.adam_weight_decay,
@@ -946,7 +984,8 @@ def main(args):
         size=args.resolution,
         center_crop=args.center_crop,
         num_class_images=args.num_class_images,
-        hflip=args.hflip, aug=not args.noaug,
+        hflip=args.hflip,
+        aug=not args.noaug,
     )
 
     train_dataloader = torch.utils.data.DataLoader(
@@ -1101,11 +1140,19 @@ def main(args):
                     # Get the index for tokens that we want to zero the grads for
                     index_grads_to_zero = torch.arange(len(tokenizer)) != modifier_token_id[0]
                     for i in range(len(modifier_token_id[1:])):
-                        index_grads_to_zero = index_grads_to_zero & (torch.arange(len(tokenizer)) != modifier_token_id[i])
-                    grads_text_encoder.data[index_grads_to_zero, :] = grads_text_encoder.data[index_grads_to_zero, :].fill_(0)
+                        index_grads_to_zero = index_grads_to_zero & (
+                            torch.arange(len(tokenizer)) != modifier_token_id[i]
+                        )
+                    grads_text_encoder.data[index_grads_to_zero, :] = grads_text_encoder.data[
+                        index_grads_to_zero, :
+                    ].fill_(0)
 
                 if accelerator.sync_gradients:
-                    params_to_clip = itertools.chain(text_encoder.parameters(), custom_diffusion_layers.parameters()) if args.modifier_token is not None else custom_diffusion_layers.parameters()
+                    params_to_clip = (
+                        itertools.chain(text_encoder.parameters(), custom_diffusion_layers.parameters())
+                        if args.modifier_token is not None
+                        else custom_diffusion_layers.parameters()
+                    )
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                 optimizer.step()
                 lr_scheduler.step()
@@ -1150,7 +1197,7 @@ def main(args):
                 # run inference
                 generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
                 images = [
-                    pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.).images[0]
+                    pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.0).images[0]
                     for _ in range(args.num_validation_images)
                 ]
 
@@ -1189,13 +1236,13 @@ def main(args):
         # load attention processors
         pipeline.unet.load_attn_procs(args.output_dir, weight_name="pytorch_custom_diffusion_weights.bin")
         for token in args.modifier_token:
-            pipeline.load_textual_inversion(args.output_dir, weight_name=f'{token}.bin')
+            pipeline.load_textual_inversion(args.output_dir, weight_name=f"{token}.bin")
 
         # run inference
         if args.validation_prompt and args.num_validation_images > 0:
             generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
             images = [
-                pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.).images[0]
+                pipeline(args.validation_prompt, num_inference_steps=25, generator=generator, eta=1.0).images[0]
                 for _ in range(args.num_validation_images)
             ]
 
@@ -1222,12 +1269,7 @@ def main(args):
                 repo_folder=args.output_dir,
             )
             api = HfApi(token=args.hub_token)
-            api.upload_folder(
-                repo_id=repo_id,
-                folder_path=args.output_dir,
-                path_in_repo='.',
-                repo_type='model'
-            )
+            api.upload_folder(repo_id=repo_id, folder_path=args.output_dir, path_in_repo=".", repo_type="model")
 
     accelerator.end_training()
 
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index a4718e077d76..669aca462c3b 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -243,7 +243,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
                 if len(value) == 0:
                     custom_diffusion_grouped_dict[key] = {}
                 else:
-                    if 'to_out' in key:
+                    if "to_out" in key:
                         attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
                     else:
                         attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
@@ -259,11 +259,17 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
                     hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
                     train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
                     attn_processors[key] = CustomDiffusionAttnProcessor(
-                        weights=None, train_kv=True, train_q_out=train_q_out, hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
+                        weights=None,
+                        train_kv=True,
+                        train_q_out=train_q_out,
+                        hidden_size=hidden_size,
+                        cross_attention_dim=cross_attention_dim,
                     )
                     attn_processors[key].load_state_dict(value_dict)
         else:
-            raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training.")
+            raise ValueError(
+                f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training."
+            )
 
         # set correct dtype & device
         attn_processors = {k: v.to(device=self.device, dtype=self.dtype) for k, v in attn_processors.items()}
@@ -317,9 +323,18 @@ def save_function(weights, filename):
 
         os.makedirs(save_directory, exist_ok=True)
 
-        is_custom_diffusion = any(isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) for (_, x) in self.attn_processors.items())
+        is_custom_diffusion = any(
+            isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))
+            for (_, x) in self.attn_processors.items()
+        )
         if is_custom_diffusion:
-            model_to_save = AttnProcsLayers({y: x for (y, x) in self.attn_processors.items() if isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))})
+            model_to_save = AttnProcsLayers(
+                {
+                    y: x
+                    for (y, x) in self.attn_processors.items()
+                    if isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))
+                }
+            )
             state_dict = model_to_save.state_dict()
             for name, attn in self.attn_processors.items():
                 if len(attn.state_dict()) == 0:
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 3daea8d46908..0259924f54f9 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -412,7 +412,16 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
 
 
 class CustomDiffusionAttnProcessor(nn.Module):
-    def __init__(self, weights=None, train_kv=True, train_q_out=True, hidden_size=None, cross_attention_dim=None, out_bias=True, dropout=0.0,):
+    def __init__(
+        self,
+        weights=None,
+        train_kv=True,
+        train_q_out=True,
+        hidden_size=None,
+        cross_attention_dim=None,
+        out_bias=True,
+        dropout=0.0,
+    ):
         super().__init__()
         self.train_kv = train_kv
         self.train_q_out = train_q_out
@@ -426,8 +435,8 @@ def __init__(self, weights=None, train_kv=True, train_q_out=True, hidden_size=No
             self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
             if weights is not None:
                 with torch.no_grad():
-                    self.to_k_custom_diffusion.weight.copy_(weights['to_k'])
-                    self.to_v_custom_diffusion.weight.copy_(weights['to_v'])
+                    self.to_k_custom_diffusion.weight.copy_(weights["to_k"])
+                    self.to_v_custom_diffusion.weight.copy_(weights["to_v"])
         if self.train_q_out:
             self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size)
             self.to_out_custom_diffusion = nn.ModuleList([])
@@ -435,9 +444,9 @@ def __init__(self, weights=None, train_kv=True, train_q_out=True, hidden_size=No
             self.to_out_custom_diffusion.append(nn.Dropout(dropout))
             if weights is not None:
                 with torch.no_grad():
-                    self.to_q_custom_diffusion.weight.copy_(weights['to_q'])
-                    self.to_out_custom_diffusion[0].weight.copy_(weights['to_out.weight'])
-                    self.to_out_custom_diffusion[0].bias.copy_(weights['to_out.bias'])
+                    self.to_q_custom_diffusion.weight.copy_(weights["to_q"])
+                    self.to_out_custom_diffusion[0].weight.copy_(weights["to_out.weight"])
+                    self.to_out_custom_diffusion[0].bias.copy_(weights["to_out.bias"])
 
     def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
         batch_size, sequence_length, _ = hidden_states.shape
@@ -663,7 +672,16 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
 
 
 class CustomDiffusionXFormersAttnProcessor(nn.Module):
-    def __init__(self, train_kv=True, train_q_out=False, hidden_size=None, cross_attention_dim=None, out_bias=True, dropout=0.0, attention_op: Optional[Callable] = None):
+    def __init__(
+        self,
+        train_kv=True,
+        train_q_out=False,
+        hidden_size=None,
+        cross_attention_dim=None,
+        out_bias=True,
+        dropout=0.0,
+        attention_op: Optional[Callable] = None,
+    ):
         super().__init__()
         self.train_kv = train_kv
         self.train_q_out = train_q_out

From 08483fbd5f298c60c903d0d89630caf644b8f030 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 10:37:01 +0530
Subject: [PATCH 14/34] refactor readme and other minor changes.

---
 examples/custom_diffusion/README.md           | 169 +++++++++---------
 examples/custom_diffusion/retrieve.py         |  13 ++
 .../train_custom_diffusion.py                 |   3 -
 3 files changed, 93 insertions(+), 92 deletions(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index be3444c52ec3..980d69eb8e17 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -1,9 +1,7 @@
 # Custom Diffusion training example 
-(modified from https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README.md)
-
-[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text2image models like stable diffusion given just a few(4~5) images of a subject.
-The `train.py` script shows how to implement the training procedure and adapt it for stable diffusion.
 
+[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject.
+The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
 
 ## Running locally with PyTorch
 
@@ -14,6 +12,7 @@ Before running the scripts, make sure to install the library's training dependen
 **Important**
 
 To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
 ```bash
 git clone https://github.com/huggingface/diffusers
 cd diffusers
@@ -21,6 +20,7 @@ pip install -e .
 ```
 
 Then cd in the example folder and run
+
 ```bash
 pip install -r requirements.txt
 pip install clip-retrieval 
@@ -44,7 +44,6 @@ Or if your environment doesn't support an interactive shell e.g. a notebook
 from accelerate.utils import write_basic_config
 write_basic_config()
 ```
-
 ### Cat example
 
 Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. 
@@ -52,7 +51,7 @@ Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~
 We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`. 
 The `class_prompt` should be the category name same as target image. The collected real images are with text captions similar to the `class_prompt`. The retrieved image are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images use this command first before training. 
 
-```
+```bash
 pip install clip-retrieval
 python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --num_class_images 200
 ```
@@ -63,27 +62,25 @@ python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --nu
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export OUTPUT_DIR="path-to-save-model"
 export INSTANCE_DIR="./data/cat"
-## launch training script (2 GPUs recommended, increase --max_train_steps to 500 if 1 GPU, or increase --train_batch_size=4)
-
-accelerate launch train.py \
-          --pretrained_model_name_or_path=$MODEL_NAME  \
-          --instance_data_dir=$INSTANCE_DIR \
-          --output_dir=$OUTPUT_DIR \
-          --class_data_dir=./real_reg/samples_cat/ \
-          --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
-          --class_prompt="cat" --num_class_images=200 \
-          --instance_prompt="photo of a <new1> cat"  \
-          --resolution=512  \
-          --train_batch_size=2  \
-          --learning_rate=1e-5  \
-          --lr_warmup_steps=0 \
-          --max_train_steps=250 \
-          --scale_lr --hflip  \
-          --modifier_token "<new1>" 
-```
-
-**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU).**
 
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_cat/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="cat" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> cat"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=250 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>" 
+```
+
+**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.**
 
 ### Training on multiple concepts
 
@@ -91,82 +88,91 @@ Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/as
 
 To collect the real images run this command for each concept in the json file. 
 
-```
+```bash
 pip install clip-retrieval
 python retrieve.py --class_prompt {} --class_data_dir {} --num_class_images 200
 ```
 
+And then we're ready to start training!
 
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export OUTPUT_DIR="path-to-save-model"
 
-## launch training script (2 GPUs recommended, increase --max_train_steps to 1000 if 1 GPU, or increase --train_batch_size=4)
-
-accelerate launch train.py \
-          --pretrained_model_name_or_path=$MODEL_NAME  \
-          --output_dir=$OUTPUT_DIR \
-          --concepts_list=./concept_list.json \
-          --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
-          --resolution=512  \
-          --train_batch_size=2  \
-          --learning_rate=1e-5  \
-          --lr_warmup_steps=0 \
-          --max_train_steps=500 \
-          --num_class_images=200 \
-          --scale_lr --hflip  \
-          --modifier_token "<new1>+<new2>" 
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --output_dir=$OUTPUT_DIR \
+  --concepts_list=./concept_list.json \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --num_class_images=200 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>+<new2>" 
 ```
 
 ### Training on human faces
 
-For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with atleast 15-20 images. 
+For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images. 
+
 To collect the real images use this command first before training. 
 
-```
+```bash
 pip install clip-retrieval
 python retrieve.py --class_prompt person --class_data_dir real_reg/samples_person --num_class_images 200
 ```
 
+Then start training!
+
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export OUTPUT_DIR="path-to-save-model"
 export INSTANCE_DIR="path-to-images"
 
-## launch training script (2 GPUs recommended, increase --max_train_steps to 1000 if 1 GPU, or increase --train_batch_size=4)
-
-CUDA_VISIBLE_DEVICES=1 accelerate launch train.py \
-          --pretrained_model_name_or_path=$MODEL_NAME  \
-          --instance_data_dir=$INSTANCE_DIR \
-          --output_dir=$OUTPUT_DIR \
-          --class_data_dir=./real_reg/samples_person/ \
-          --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
-          --class_prompt="person" --num_class_images=200 \
-          --instance_prompt="photo of a <new1> person"  \
-          --resolution=512  \
-          --train_batch_size=2  \
-          --learning_rate=5e-6  \
-          --lr_warmup_steps=0 \
-          --max_train_steps=1000 \
-          --scale_lr --hflip --noaug \
-          --freeze_model crossattn \
-          --modifier_token "<new1>" \
-          --enable_xformers_memory_efficient_attention \
-```
-
-### Inference
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_person/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="person" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> person"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=5e-6  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=1000 \
+  --scale_lr --hflip --noaug \
+  --freeze_model crossattn \
+  --modifier_token "<new1>" \
+  --enable_xformers_memory_efficient_attention 
+```
+
+## Inference
 
 Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \<new1\> in above example) in your prompt.
 
 ```python
-from diffusers import DiffusionPipeline
 import torch
+from diffusers import DiffusionPipeline
 
-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
-pipe.unet.load_attn_procs('path-to-save-model', weight_name='pytorch_custom_diffusion_weights.bin')
-pipe.load_textual_inversion('path-to-save-model', weight_name='<new1>.bin')
-image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=6., eta=1.).images[0]
-
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
+).to("cuda")
+pipe.unet.load_attn_procs(
+    "path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin"
+)
+pipe.load_textual_inversion("path-to-save-model", weight_name="<new1>.bin")
+
+image = pipe(
+    "<new1> cat sitting in a bucket",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
 image.save("cat.png")
 ```
 
@@ -184,25 +190,10 @@ image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance
 image.save("cat.png")
 ```
 
-### Converting delta.bin to diffusers pipeline
-
-You can also perform inference from one of the complete checkpoint saved during the training process, if you used the `--checkpointing_steps` argument. 
-
-```python
-from diffusers import DiffusionPipeline
-import torch
-
-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
-pipe.unet.load_attn_procs('path-to-save-model', weight_name='pytorch_custom_diffusion_weights.bin')
-pipe.load_textual_inversion('path-to-save-model', weight_name='<new1>.bin')
-pipe.save_pretrained('<path-to-your-save-model>')
-```
-
-### Set grads to none
-
+## Set grads to none
 To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
 
 More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
 
-### Experimental results
+## Experimental results
 You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. 
\ No newline at end of file
diff --git a/examples/custom_diffusion/retrieve.py b/examples/custom_diffusion/retrieve.py
index 74db82764a38..7b7635c1887d 100644
--- a/examples/custom_diffusion/retrieve.py
+++ b/examples/custom_diffusion/retrieve.py
@@ -1,3 +1,16 @@
+#  Copyright 2023 Custom Diffusion authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import argparse
 import os
 from io import BytesIO
diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index e616a822c9e6..9344db6ef910 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -1,6 +1,3 @@
-# This code is built from the Huggingface repository: https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py, and
-# https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py
-
 #!/usr/bin/env python
 # coding=utf-8
 # Copyright 2023 Custom Diffusion authors and the HuggingFace Inc. team. All rights reserved.

From da2055aad21ca56656343a12654b70a52d907646 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 11:20:26 +0530
Subject: [PATCH 15/34] misc refactor.

---
 examples/custom_diffusion/README.md           | 91 +++++++++++++++++--
 .../train_custom_diffusion.py                 | 16 +++-
 2 files changed, 94 insertions(+), 13 deletions(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index 980d69eb8e17..486e22ac41ba 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -44,7 +44,7 @@ Or if your environment doesn't support an interactive shell e.g. a notebook
 from accelerate.utils import write_basic_config
 write_basic_config()
 ```
-### Cat example
+### Cat example 😺
 
 Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. 
 
@@ -82,7 +82,37 @@ accelerate launch train_custom_diffusion.py \
 
 **Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.**
 
-### Training on multiple concepts
+To track your experiments using Weights and Biases (`wandb`) and to save intermediate results (which we HIGHLY recommend), follow these steps:
+
+* Install `wandb`: `pip install wandb`.
+* Authorize: `wandb login`. 
+* Then specify a `validation_prompt` and set `report_to` to `wandb` while launching training. You can also configure the following related arguments:
+    * `num_validation_images`
+    * `validation_steps`
+
+Here is an example command:
+
+```bash
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_cat/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="cat" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> cat"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=250 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>" \
+  --validation_prompt="<new1> cat sitting in a bucket" \
+  --report_to="wandb"
+```
+
+### Training on multiple concepts 🐱🪵
 
 Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py).
 
@@ -176,20 +206,65 @@ image = pipe(
 image.save("cat.png")
 ```
 
-### Inference from a training checkpoint
-
-You can also perform inference from one of the complete checkpoint saved during the training process, if you used the `--checkpointing_steps` argument. 
+It's possible to directly load these parameters from a Hub repository:
 
 ```python
-from diffusers import StableDiffusionPipeline
 import torch
+from huggingface_hub.repocard import RepoCard
+from diffusers import DiffusionPipeline
+
+model_id = "TODO"
+card = RepoCard.load(model_id)
+base_model_id = card.data.to_dict()["base_model"]
 
-pipe = StableDiffusionPipeline.from_pretrained('path-to-the-model/checkpoint-250/', torch_dtype=torch.float16).to("cuda")
-image = pipe("<new1> cat sitting in a bucket", num_inference_steps=100, guidance_scale=6., eta=1.).images[0]
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to(
+"cuda")
+pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new1>.bin")
 
+image = pipe(
+    "<new1> cat sitting in a bucket",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
 image.save("cat.png")
 ```
 
+Here's an example of performing inference with multiple concepts:
+
+```python
+import torch
+from huggingface_hub.repocard import RepoCard
+from diffusers import DiffusionPipeline
+
+model_id = "TODO"
+card = RepoCard.load(model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to(
+"cuda")
+pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new1>.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new2>.bin")
+
+image = pipe(
+    "the <new1> cat sculpture in the style of a <new2> wooden pot",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
+image.save("multi-subject.png")
+```
+
+Here, `cat` and `wooden pot` refer to the multiple concepts.
+
+### Inference from a training checkpoint
+
+You can also perform inference from one of the complete checkpoints saved during the training process, if you used the `--checkpointing_steps` argument. 
+
+TODO.
+
 ## Set grads to none
 To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
 
diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 9344db6ef910..bb8865404d1e 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -56,7 +56,7 @@
 
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.14.0")
+check_min_version("0.15.0.dev0")
 
 logger = get_logger(__name__)
 
@@ -82,7 +82,7 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_
         - stable-diffusion-diffusers
         - text-to-image
         - diffusers
-        - custom diffusion
+        - custom-diffusion
         inference: true
         ---
             """
@@ -289,6 +289,7 @@ def __getitem__(self, index):
 
 
 def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_dir):
+    """Saves the new token embeddings from the text encoder."""
     logger.info("Saving embeddings")
     learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight
     for x, y in zip(modifier_token_id, args.modifier_token):
@@ -298,7 +299,7 @@ def save_new_embed(text_encoder, modifier_token_id, accelerator, args, output_di
 
 
 def parse_args(input_args=None):
-    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser = argparse.ArgumentParser(description="Custom Diffusion training script.")
     parser.add_argument(
         "--pretrained_model_name_or_path",
         type=str,
@@ -358,7 +359,7 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--validation_steps",
         type=int,
-        default=500,
+        default=50,
         help=(
             "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
             " `args.validation_prompt` multiple times: `args.num_validation_images`."
@@ -1266,7 +1267,12 @@ def main(args):
                 repo_folder=args.output_dir,
             )
             api = HfApi(token=args.hub_token)
-            api.upload_folder(repo_id=repo_id, folder_path=args.output_dir, path_in_repo=".", repo_type="model")
+            api.upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
 
     accelerator.end_training()
 

From c7d5487605f93457ab46258b528604d16cb0f4f8 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 11:34:07 +0530
Subject: [PATCH 16/34] fix: repo_id issue and loaders logging bug.

---
 examples/custom_diffusion/README.md                 | 4 +++-
 examples/custom_diffusion/train_custom_diffusion.py | 5 +----
 src/diffusers/loaders.py                            | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index 486e22ac41ba..b4c8ab1bfda5 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -108,10 +108,12 @@ accelerate launch train_custom_diffusion.py \
   --max_train_steps=250 \
   --scale_lr --hflip  \
   --modifier_token "<new1>" \
-  --validation_promot="<new1> cat sitting in a bucket" \
+  --validation_prompt="<new1> cat sitting in a bucket" \
   --report_to="wandb"
 ```
 
+If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub.
+
 ### Training on multiple concepts 🐱🪵
 
 Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py).
diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index bb8865404d1e..78ce25192b9f 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -759,12 +759,9 @@ def main(args):
             os.makedirs(args.output_dir, exist_ok=True)
 
         if args.push_to_hub:
-            print(args.hub_model_id or Path(args.output_dir).name)
             repo_id = create_repo(
                 repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
-            )
-            print(repo_id)
-            repo_id = args.hub_model_id
+            ).repo_id
 
     # Load the tokenizer
     if args.tokenizer_name:
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index 669aca462c3b..d6730960fba6 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -619,4 +619,4 @@ def load_textual_inversion(
         for token_id, embedding in zip(token_ids, embeddings):
             self.text_encoder.get_input_embeddings().weight.data[token_id] = embedding
 
-        logger.info("Loaded textual inversion embedding for {token}.")
+        logger.info(f"Loaded textual inversion embedding for {token}.")

From 04072b49beb8cf6e613f9e13d78cf7ca89a2bdf5 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 11:49:46 +0530
Subject: [PATCH 17/34] fix: save_model_card.

---
 .../train_custom_diffusion.py                 | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 78ce25192b9f..ecb76ab6695c 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -70,27 +70,29 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_
     img_str = ""
     for i, image in enumerate(images):
         image.save(os.path.join(repo_folder, f"image_{i}.png"))
-        img_str += f"./image_{i}.png\n"
+        img_str += f"![img_{i}](./image_{i}.png)\n"
 
     yaml = f"""
-        ---
-        license: creativeml-openrail-m
-        base_model: {base_model}
-        instance_prompt: {prompt}
-        tags:
-        - stable-diffusion
-        - stable-diffusion-diffusers
-        - text-to-image
-        - diffusers
-        - custom-diffusion
-        inference: true
-        ---
-            """
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+instance_prompt: {prompt}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- custom-diffusion
+inference: true
+---
+    """
     model_card = f"""
         # Custom Diffusion - {repo_id}
 
         These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
-        {img_str[0]}
+        {img_str}
+        
+        For more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
         """
     with open(os.path.join(repo_folder, "README.md"), "w") as f:
         f.write(yaml + model_card)

From 0788ca914c846f85e4f8f00b5d563a8dc2b8e87e Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 11:56:50 +0530
Subject: [PATCH 18/34] fix: save_model_card.

---
 examples/custom_diffusion/train_custom_diffusion.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index ecb76ab6695c..8c4873e8ecc6 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -87,13 +87,13 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_
 ---
     """
     model_card = f"""
-        # Custom Diffusion - {repo_id}
+# Custom Diffusion - {repo_id}
 
-        These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
-        {img_str}
-        
-        For more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
-        """
+These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
+{img_str}
+
+For more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
+"""
     with open(os.path.join(repo_folder, "README.md"), "w") as f:
         f.write(yaml + model_card)
 

From 5e22bc71b184166989f7176b6de36d3472a20da5 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 11:58:18 +0530
Subject: [PATCH 19/34] fix: save_model_card.

---
 examples/custom_diffusion/train_custom_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 8c4873e8ecc6..d959f1dacac4 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -92,7 +92,7 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_
 These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
 {img_str}
 
-For more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
+\nFor more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
 """
     with open(os.path.join(repo_folder, "README.md"), "w") as f:
         f.write(yaml + model_card)

From 76c1acd5306ce7b5dbb97ad3da9c8c1b2748578b Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 12:54:33 +0530
Subject: [PATCH 20/34] add: doc entry.

---
 docs/source/en/_toctree.yml                  |   2 +
 docs/source/en/training/custom_diffusion.mdx | 285 +++++++++++++++++++
 docs/source/en/training/overview.mdx         |   4 +
 examples/custom_diffusion/README.md          |  12 +-
 4 files changed, 299 insertions(+), 4 deletions(-)
 create mode 100644 docs/source/en/training/custom_diffusion.mdx

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index dc40d9b142ba..065c74dc120e 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -74,6 +74,8 @@
       title: ControlNet
     - local: training/instructpix2pix
       title: InstructPix2Pix Training
+    - local: training/custom_diffusion
+      title: Custom Diffusion
     title: Training
   - sections:
     - local: using-diffusers/rl
diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx
new file mode 100644
index 000000000000..46c3cab64260
--- /dev/null
+++ b/docs/source/en/training/custom_diffusion.mdx
@@ -0,0 +1,285 @@
+<!--Copyright 2023 Custom Diffusion authors and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Custom Diffusion training example 
+
+[Custom Diffusion](https://arxiv.org/abs/2212.04488) is a method to customize text-to-image models like Stable Diffusion given just a few (4~5) images of a subject.
+The `train_custom_diffusion.py` script shows how to implement the training procedure and adapt it for stable diffusion.
+
+## Running locally with PyTorch
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install -e .
+```
+
+Then cd in the example folder and run
+
+```bash
+pip install -r requirements.txt
+pip install clip-retrieval 
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+Or for a default accelerate configuration without answering questions about your environment
+
+```bash
+accelerate config default
+```
+
+Or if your environment doesn't support an interactive shell e.g. a notebook
+
+```python
+from accelerate.utils import write_basic_config
+
+write_basic_config()
+```
+### Cat example 😺
+
+Now let's get our dataset. Download dataset from [here](https://www.cs.cmu.edu/~custom-diffusion/assets/data.zip) and unzip it. 
+
+We also collect 200 real images using `clip-retrieval` which are combined with the target images in the training dataset as a regularization. This prevents overfitting to the given target image. The following flags enable the regularization `with_prior_preservation`, `real_prior` with `prior_loss_weight=1.`. 
+The `class_prompt` should be the same category name as the target image. The collected real images are with text captions similar to the `class_prompt`. The retrieved images are saved in `class_data_dir`. You can disable `real_prior` to use generated images as regularization. To collect the real images use this command first before training. 
+
+```bash
+pip install clip-retrieval
+python retrieve.py --class_prompt cat --class_data_dir real_reg/samples_cat --num_class_images 200
+```
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+export INSTANCE_DIR="./data/cat"
+
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_cat/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="cat" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> cat"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=250 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>" 
+```
+
+**Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.**
+
+To track your experiments using Weights and Biases (`wandb`) and to save intermediate results (which we HIGHLY recommend), follow these steps:
+
+* Install `wandb`: `pip install wandb`.
+* Authorize: `wandb login`. 
+* Then specify a `validation_prompt` and set `report_to` to `wandb` while launching training. You can also configure the following related arguments:
+    * `num_validation_images`
+    * `validation_steps`
+
+Here is an example command:
+
+```bash
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_cat/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="cat" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> cat"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=250 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>" \
+  --validation_prompt="<new1> cat sitting in a bucket" \
+  --report_to="wandb"
+```
+
+Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/26ghrcau) where you can check out the intermediate results along with other training details.  
+
+If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub. Here is an [example repository](https://huggingface.co/sayakpaul/custom-diffusion-cat).
+
+### Training on multiple concepts 🐱🪵
+
+Provide a [json](https://github.com/adobe-research/custom-diffusion/blob/main/assets/concept_list.json) file with the info about each concept, similar to [this](https://github.com/ShivamShrirao/diffusers/blob/main/examples/dreambooth/train_dreambooth.py).
+
+To collect the real images run this command for each concept in the json file. 
+
+```bash
+pip install clip-retrieval
+python retrieve.py --class_prompt {} --class_data_dir {} --num_class_images 200
+```
+
+And then we're ready to start training!
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --output_dir=$OUTPUT_DIR \
+  --concepts_list=./concept_list.json \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=1e-5  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=500 \
+  --num_class_images=200 \
+  --scale_lr --hflip  \
+  --modifier_token "<new1>+<new2>" 
+```
+
+### Training on human faces
+
+For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images. 
+
+To collect the real images use this command first before training. 
+
+```bash
+pip install clip-retrieval
+python retrieve.py --class_prompt person --class_data_dir real_reg/samples_person --num_class_images 200
+```
+
+Then start training!
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export OUTPUT_DIR="path-to-save-model"
+export INSTANCE_DIR="path-to-images"
+
+accelerate launch train_custom_diffusion.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --class_data_dir=./real_reg/samples_person/ \
+  --with_prior_preservation --real_prior --prior_loss_weight=1.0 \
+  --class_prompt="person" --num_class_images=200 \
+  --instance_prompt="photo of a <new1> person"  \
+  --resolution=512  \
+  --train_batch_size=2  \
+  --learning_rate=5e-6  \
+  --lr_warmup_steps=0 \
+  --max_train_steps=1000 \
+  --scale_lr --hflip --noaug \
+  --freeze_model crossattn \
+  --modifier_token "<new1>" \
+  --enable_xformers_memory_efficient_attention 
+```
+
+## Inference
+
+Once you have trained a model using the above command, you can run inference using the below command. Make sure to include the `modifier token` (e.g. \<new1\> in above example) in your prompt.
+
+```python
+import torch
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
+pipe.unet.load_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin")
+pipe.load_textual_inversion("path-to-save-model", weight_name="<new1>.bin")
+
+image = pipe(
+    "<new1> cat sitting in a bucket",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
+image.save("cat.png")
+```
+
+It's possible to directly load these parameters from a Hub repository:
+
+```python
+import torch
+from huggingface_hub.repocard import RepoCard
+from diffusers import DiffusionPipeline
+
+model_id = "sayakpaul/custom-diffusion-cat"
+card = RepoCard.load(model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda")
+pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new1>.bin")
+
+image = pipe(
+    "<new1> cat sitting in a bucket",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
+image.save("cat.png")
+```
+
+Here is an example of performing inference with multiple concepts:
+
+```python
+import torch
+from huggingface_hub.repocard import RepoCard
+from diffusers import DiffusionPipeline
+
+model_id = "sayakpaul/custom-diffusion-cat-wooden-pot"
+card = RepoCard.load(model_id)
+base_model_id = card.data.to_dict()["base_model"]
+
+pipe = DiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16).to("cuda")
+pipe.unet.load_attn_procs(model_id, weight_name="pytorch_custom_diffusion_weights.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new1>.bin")
+pipe.load_textual_inversion(model_id, weight_name="<new2>.bin")
+
+image = pipe(
+    "the <new1> cat sculpture in the style of a <new2> wooden pot",
+    num_inference_steps=100,
+    guidance_scale=6.0,
+    eta=1.0,
+).images[0]
+image.save("multi-subject.png")
+```
+
+Here, `cat` and `wooden pot` refer to the multiple concepts.
+
+### Inference from a training checkpoint
+
+You can also perform inference from one of the complete checkpoints saved during the training process, if you used the `--checkpointing_steps` argument. 
+
+TODO.
+
+## Set grads to none
+To save even more memory, pass the `--set_grads_to_none` argument to the script. This will set grads to None instead of zero. However, be aware that it changes certain behaviors, so if you start experiencing any problems, remove this argument.
+
+More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html
+
+## Experimental results
+You can refer to [our webpage](https://www.cs.cmu.edu/~custom-diffusion/) that discusses our experiments in detail. 
diff --git a/docs/source/en/training/overview.mdx b/docs/source/en/training/overview.mdx
index 5ad3a1f06cc1..c5cea3bb0a96 100644
--- a/docs/source/en/training/overview.mdx
+++ b/docs/source/en/training/overview.mdx
@@ -39,6 +39,8 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 - [Dreambooth](./dreambooth)
 - [LoRA Support](./lora)
 - [ControlNet](./controlnet)
+- [InstructPix2Pix](./instructpix2pix)
+- [Custom Diffusion](./custom_diffusion)
 
 If possible, please [install xFormers](../optimization/xformers) for memory efficient attention. This could help make your training faster and less memory intensive.
 
@@ -50,6 +52,8 @@ If possible, please [install xFormers](../optimization/xformers) for memory effi
 | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
 | [**Training with LoRA**](./lora) | ✅ | - | - |
 | [**ControlNet**](./controlnet) | ✅ | ✅ | - |
+| [**InstructPix2Pix**](./instructpix2pix) | ✅ | ✅ | - |
+| [**Custom Diffusion**](./custom_diffusion) | ✅ | ✅ | - |
 
 ## Community
 
diff --git a/examples/custom_diffusion/README.md b/examples/custom_diffusion/README.md
index b4c8ab1bfda5..ecd972737bc3 100644
--- a/examples/custom_diffusion/README.md
+++ b/examples/custom_diffusion/README.md
@@ -112,7 +112,9 @@ accelerate launch train_custom_diffusion.py \
   --report_to="wandb"
 ```
 
-If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub.
+Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/26ghrcau) where you can check out the intermediate results along with other training details.  
+
+If you specify `--push_to_hub`, the learned parameters will be pushed to a repository on the Hugging Face Hub. Here is an [example repository](https://huggingface.co/sayakpaul/custom-diffusion-cat).
 
 ### Training on multiple concepts 🐱🪵
 
@@ -146,6 +148,8 @@ accelerate launch train_custom_diffusion.py \
   --modifier_token "<new1>+<new2>" 
 ```
 
+Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/3990tzkg) where you can check out the intermediate results along with other training details.  
+
 ### Training on human faces
 
 For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images. 
@@ -215,7 +219,7 @@ import torch
 from huggingface_hub.repocard import RepoCard
 from diffusers import DiffusionPipeline
 
-model_id = "TODO"
+model_id = "sayakpaul/custom-diffusion-cat"
 card = RepoCard.load(model_id)
 base_model_id = card.data.to_dict()["base_model"]
 
@@ -233,14 +237,14 @@ image = pipe(
 image.save("cat.png")
 ```
 
-Here's an example of performing inference with multiple concepts:
+Here is an example of performing inference with multiple concepts:
 
 ```python
 import torch
 from huggingface_hub.repocard import RepoCard
 from diffusers import DiffusionPipeline
 
-model_id = "TODO"
+model_id = "sayakpaul/custom-diffusion-cat-wooden-pot"
 card = RepoCard.load(model_id)
 base_model_id = card.data.to_dict()["base_model"]
 

From 861f8d7165fcf944830ede9ed58d75384e5e0c98 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Tue, 18 Apr 2023 12:55:08 +0530
Subject: [PATCH 21/34] refactor doc,.

---
 docs/source/en/training/custom_diffusion.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx
index 46c3cab64260..1e1958e1c946 100644
--- a/docs/source/en/training/custom_diffusion.mdx
+++ b/docs/source/en/training/custom_diffusion.mdx
@@ -161,6 +161,8 @@ accelerate launch train_custom_diffusion.py \
   --modifier_token "<new1>+<new2>" 
 ```
 
+Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/3990tzkg) where you can check out the intermediate results along with other training details.  
+
 ### Training on human faces
 
 For fine-tuning on human faces we found the following configuration to work better: `learning_rate=5e-6`, `max_train_steps=1000 to 2000`, and `freeze_model=crossattn` with at least 15-20 images. 

From a2bbe6de4098da27c05af6d0e874c3c50364448b Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Tue, 18 Apr 2023 21:35:23 -0400
Subject: [PATCH 22/34] custom diffusion

---
 .../train_custom_diffusion.py                 |  51 ++++--
 examples/test_examples.py                     |  24 +++
 src/diffusers/loaders.py                      |   3 +-
 src/diffusers/models/attention_processor.py   |  47 +++---
 tests/models/test_models_unet_2d_condition.py | 148 +++++++++++++++++-
 5 files changed, 234 insertions(+), 39 deletions(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index d959f1dacac4..93989c751ec7 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -878,7 +878,7 @@ def main(args):
     custom_diffusion_attn_procs = {}
 
     st = unet.state_dict()
-    for name, attn in unet.attn_processors.items():
+    for name, _ in unet.attn_processors.items():
         cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
         if name.startswith("mid_block"):
             hidden_size = unet.config.block_out_channels[-1]
@@ -890,36 +890,61 @@ def main(args):
             hidden_size = unet.config.block_out_channels[block_id]
         layer_name = name.split(".processor")[0]
         weights = {
-            "to_k": st[layer_name + ".to_k.weight"],
-            "to_v": st[layer_name + ".to_v.weight"],
-            "to_q": st[layer_name + ".to_q.weight"],
-            "to_out.weight": st[layer_name + ".to_out.0.weight"],
-            "to_out.bias": st[layer_name + ".to_out.0.bias"],
-        }
+            "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"],
+            "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"]}
+        if train_q_out:
+            weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"]
+            weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"]
+            weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"]
         if cross_attention_dim is not None:
             custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
-                weights,
                 train_kv=train_kv,
                 train_q_out=train_q_out,
                 hidden_size=hidden_size,
                 cross_attention_dim=cross_attention_dim,
             ).to(unet.device)
+            custom_diffusion_attn_procs[name].load_state_dict(weights)
         else:
             custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
-                weights,
                 train_kv=False,
                 train_q_out=False,
                 hidden_size=hidden_size,
                 cross_attention_dim=cross_attention_dim,
-            )  # attn
+            )
     del st
     unet.set_attn_processor(custom_diffusion_attn_procs)
-    custom_diffusion_layers = AttnProcsLayers(
-        {y: x for (y, x) in unet.attn_processors.items() if isinstance(x, CustomDiffusionAttnProcessor)}
-    )
+    custom_diffusion_layers = AttnProcsLayers(unet.attn_processors)
 
     accelerator.register_for_checkpointing(custom_diffusion_layers)
 
+    # to test xformers temporary
+    # from diffusers.utils import floats_tensor
+
+    # def dummy_input():
+    #     torch_device = accelerator.device
+    #     batch_size = 4
+    #     num_channels = 4
+    #     sizes = (32, 32)
+
+    #     noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+    #     time_step = torch.tensor([10]).to(torch_device)
+    #     encoder_hidden_states = floats_tensor((batch_size, 77, 768)).to(torch_device)
+
+    #     return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    # with torch.no_grad():
+    #     inputs_dict = dummy_input()
+    #     sample = unet(**inputs_dict).sample
+
+    #     unet.enable_xformers_memory_efficient_attention()
+    #     on_sample = unet(**inputs_dict).sample
+
+    #     unet.disable_xformers_memory_efficient_attention()
+    #     off_sample = unet(**inputs_dict).sample
+    #     print((sample - off_sample).abs().max(), (sample - on_sample).abs().max() )
+    # assert (sample - on_sample).abs().max() < 1e-4
+    # assert (sample - off_sample).abs().max() < 1e-4
+
     if args.enable_xformers_memory_efficient_attention:
         if is_xformers_available():
             import xformers
diff --git a/examples/test_examples.py b/examples/test_examples.py
index d9a1f86e53aa..12dc76ed594e 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -221,6 +221,30 @@ def test_dreambooth_checkpointing(self):
             self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
             self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6")))
 
+    def test_custom_diffusion(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            test_args = f"""
+                examples/custom_diffusion/train_custom_diffusion.py
+                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
+                --instance_data_dir docs/source/en/imgs
+                --instance_prompt "photo of a <new1>"
+                --resolution 64
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --modifier_token "<new1>"
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_custom_diffusion_weights.bin")))
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "<new1>.bin")))
+
     def test_text_to_image(self):
         with tempfile.TemporaryDirectory() as tmpdir:
             test_args = f"""
diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py
index 49889031da1d..7f41ddaba498 100644
--- a/src/diffusers/loaders.py
+++ b/src/diffusers/loaders.py
@@ -252,14 +252,13 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
             for key, value_dict in custom_diffusion_grouped_dict.items():
                 if len(value_dict) == 0:
                     attn_processors[key] = CustomDiffusionAttnProcessor(
-                        weights=None, train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None
+                        train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None
                     )
                 else:
                     cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1]
                     hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
                     train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
                     attn_processors[key] = CustomDiffusionAttnProcessor(
-                        weights=None,
                         train_kv=True,
                         train_q_out=train_q_out,
                         hidden_size=hidden_size,
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 6b6f35069239..6040b2b1446e 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -218,7 +218,15 @@ def set_use_memory_efficient_attention_xformers(
                 processor.load_state_dict(self.processor.state_dict())
                 processor.to(self.processor.to_q_lora.up.weight.device)
             elif is_custom_diffusion:
-                processor = self.processor
+                processor = CustomDiffusionAttnProcessor(
+                    train_kv=self.processor.train_kv,
+                    train_q_out=self.processor.train_q_out,
+                    hidden_size=self.processor.hidden_size,
+                    cross_attention_dim=self.processor.cross_attention_dim,
+                )
+                processor.load_state_dict(self.processor.state_dict())
+                if hasattr(self.processor, "to_k_custom_diffusion"):
+                    processor.to(self.processor.to_k_custom_diffusion.weight.device)
             else:
                 processor = AttnProcessor()
 
@@ -478,7 +486,6 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
 class CustomDiffusionAttnProcessor(nn.Module):
     def __init__(
         self,
-        weights=None,
         train_kv=True,
         train_q_out=True,
         hidden_size=None,
@@ -495,22 +502,13 @@ def __init__(
 
         # `_custom_diffusion` id for easy serialization and loading.
         if self.train_kv:
-            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
-            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
-            if weights is not None:
-                with torch.no_grad():
-                    self.to_k_custom_diffusion.weight.copy_(weights["to_k"])
-                    self.to_v_custom_diffusion.weight.copy_(weights["to_v"])
+            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
         if self.train_q_out:
-            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size)
+            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
             self.to_out_custom_diffusion = nn.ModuleList([])
             self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
             self.to_out_custom_diffusion.append(nn.Dropout(dropout))
-            if weights is not None:
-                with torch.no_grad():
-                    self.to_q_custom_diffusion.weight.copy_(weights["to_q"])
-                    self.to_out_custom_diffusion[0].weight.copy_(weights["to_out.weight"])
-                    self.to_out_custom_diffusion[0].bias.copy_(weights["to_out.bias"])
 
     def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
         batch_size, sequence_length, _ = hidden_states.shape
@@ -525,8 +523,8 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
             encoder_hidden_states = hidden_states
         else:
             crossattn = True
-            if attn.cross_attention_norm:
-                encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
 
         if self.train_kv:
             key = self.to_k_custom_diffusion(encoder_hidden_states)
@@ -824,17 +822,21 @@ def __init__(
 
         # `_custom_diffusion` id for easy serialization and loading.
         if self.train_kv:
-            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
-            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size)
+            self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
+            self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
         if self.train_q_out:
-            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size)
+            self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
             self.to_out_custom_diffusion = nn.ModuleList([])
             self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
             self.to_out_custom_diffusion.append(nn.Dropout(dropout))
 
     def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
-        batch_size, sequence_length, _ = hidden_states.shape
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
         attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
         if self.train_q_out:
             query = self.to_q_custom_diffusion(hidden_states)
         else:
@@ -845,8 +847,8 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
             encoder_hidden_states = hidden_states
         else:
             crossattn = True
-            if attn.cross_attention_norm:
-                encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
+            if attn.norm_cross:
+                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
 
         if self.train_kv:
             key = self.to_k_custom_diffusion(encoder_hidden_states)
@@ -881,7 +883,6 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
             hidden_states = attn.to_out[0](hidden_states)
             # dropout
             hidden_states = attn.to_out[1](hidden_states)
-
         return hidden_states
 
 
diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index 15f77fb8c106..580e817c2bbf 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -22,7 +22,7 @@
 from parameterized import parameterized
 
 from diffusers import UNet2DConditionModel
-from diffusers.models.attention_processor import LoRAAttnProcessor
+from diffusers.models.attention_processor import LoRAAttnProcessor, CustomDiffusionAttnProcessor
 from diffusers.utils import (
     floats_tensor,
     load_hf_numpy,
@@ -68,6 +68,55 @@ def create_lora_layers(model, mock_weights: bool = True):
     return lora_attn_procs
 
 
+def create_custom_diffusion_layers(model, mock_weights: bool = True):
+    train_kv = True
+    train_q_out = True
+    custom_diffusion_attn_procs = {}
+
+    st = model.state_dict()
+    for name, attn in model.attn_processors.items():
+        cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = model.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = model.config.block_out_channels[block_id]
+        layer_name = name.split(".processor")[0]
+        weights = {
+            "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"],
+            "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"]}
+        if train_q_out:
+            weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"]
+            weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"]
+            weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"]
+        if cross_attention_dim is not None:
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
+                train_kv=train_kv,
+                train_q_out=train_q_out,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            ).to(model.device)
+            custom_diffusion_attn_procs[name].load_state_dict(weights)
+            if mock_weights:
+                # add 1 to weights to mock trained weights
+                with torch.no_grad():
+                    custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight += 1
+                    custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight += 1
+        else:
+            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
+                weights,
+                train_kv=False,
+                train_q_out=False,
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+            )
+    del st
+    return custom_diffusion_attn_procs
+
+
 class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase):
     model_class = UNet2DConditionModel
 
@@ -569,6 +618,103 @@ def test_lora_xformers_on_off(self):
         assert (sample - on_sample).abs().max() < 1e-4
         assert (sample - off_sample).abs().max() < 1e-4
 
+    def test_custom_diffusion_processors(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        with torch.no_grad():
+            sample1 = model(**inputs_dict).sample
+
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model)
+
+        # make sure we can set a list of attention processors
+        model.set_attn_processor(custom_diffusion_attn_procs)
+        model.to(torch_device)
+
+        # test that attn processors can be set to itself
+        model.set_attn_processor(model.attn_processors)
+
+        with torch.no_grad():
+            sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+            sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+            sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        assert (sample1 - sample2).abs().max() < 1e-4
+        assert (sample3 - sample4).abs().max() < 1e-4
+
+        # sample 2 and sample 3 should be different
+        assert (sample2 - sample3).abs().max() > 1e-4
+
+    def test_custom_diffusion_save_load(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = (8, 16)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        with torch.no_grad():
+            old_sample = model(**inputs_dict).sample
+
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model)
+        model.set_attn_processor(custom_diffusion_attn_procs)
+
+        with torch.no_grad():
+            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_attn_procs(tmpdirname)
+            self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_custom_diffusion_weights.bin")))
+            torch.manual_seed(0)
+            new_model = self.model_class(**init_dict)
+            new_model.to(torch_device)
+            new_model.load_attn_procs(tmpdirname)
+
+        with torch.no_grad():
+            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+        assert (sample - new_sample).abs().max() < 1e-4
+
+        # custom diffusion and no custom diffusion should be the same
+        assert (sample - old_sample).abs().max() < 1e-4
+
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_custom_diffusion_xformers_on_off(self):
+        # enable deterministic behavior for gradient checkpointing
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = (8, 16)
+
+        torch.manual_seed(0)
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model)
+        model.set_attn_processor(custom_diffusion_attn_procs)
+
+        # default
+        with torch.no_grad():
+            sample = model(**inputs_dict).sample
+
+            model.enable_xformers_memory_efficient_attention()
+            on_sample = model(**inputs_dict).sample
+
+            model.disable_xformers_memory_efficient_attention()
+            off_sample = model(**inputs_dict).sample
+
+        assert (sample - on_sample).abs().max() < 1e-4
+        assert (sample - off_sample).abs().max() < 1e-4
+
+
 
 @slow
 class UNet2DConditionModelIntegrationTests(unittest.TestCase):

From 08a9bde0bbd27051b8b8844858ba77a7c2530186 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Tue, 18 Apr 2023 21:37:15 -0400
Subject: [PATCH 23/34] custom diffusion: remove commented-out xformers check from training script

---
 .../train_custom_diffusion.py                 | 28 -------------------
 1 file changed, 28 deletions(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 93989c751ec7..3004ace012f6 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -917,34 +917,6 @@ def main(args):
 
     accelerator.register_for_checkpointing(custom_diffusion_layers)
 
-    # to test xformers temporary
-    # from diffusers.utils import floats_tensor
-
-    # def dummy_input():
-    #     torch_device = accelerator.device
-    #     batch_size = 4
-    #     num_channels = 4
-    #     sizes = (32, 32)
-
-    #     noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
-    #     time_step = torch.tensor([10]).to(torch_device)
-    #     encoder_hidden_states = floats_tensor((batch_size, 77, 768)).to(torch_device)
-
-    #     return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
-
-    # with torch.no_grad():
-    #     inputs_dict = dummy_input()
-    #     sample = unet(**inputs_dict).sample
-
-    #     unet.enable_xformers_memory_efficient_attention()
-    #     on_sample = unet(**inputs_dict).sample
-
-    #     unet.disable_xformers_memory_efficient_attention()
-    #     off_sample = unet(**inputs_dict).sample
-    #     print((sample - off_sample).abs().max(), (sample - on_sample).abs().max() )
-    # assert (sample - on_sample).abs().max() < 1e-4
-    # assert (sample - off_sample).abs().max() < 1e-4
-
     if args.enable_xformers_memory_efficient_attention:
         if is_xformers_available():
             import xformers

From b14f31830353a42895426dfa65c4ffd432686d5d Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Tue, 18 Apr 2023 22:31:00 -0400
Subject: [PATCH 24/34] custom diffusion: tidy unused loop variable and blank line in unet tests

---
 tests/models/test_models_unet_2d_condition.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index 580e817c2bbf..3057a2e6503c 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -74,7 +74,7 @@ def create_custom_diffusion_layers(model, mock_weights: bool = True):
     custom_diffusion_attn_procs = {}
 
     st = model.state_dict()
-    for name, attn in model.attn_processors.items():
+    for name, _ in model.attn_processors.items():
         cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
         if name.startswith("mid_block"):
             hidden_size = model.config.block_out_channels[-1]
@@ -715,7 +715,6 @@ def test_custom_diffusion_xformers_on_off(self):
         assert (sample - off_sample).abs().max() < 1e-4
 
 
-
 @slow
 class UNet2DConditionModelIntegrationTests(unittest.TestCase):
     def get_file_format(self, seed, shape):

From 9153f07dd3432d0b505eca95c0824a08d482e3ef Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 19 Apr 2023 09:40:41 +0530
Subject: [PATCH 25/34] apply style.

---
 examples/custom_diffusion/train_custom_diffusion.py | 3 ++-
 tests/models/test_models_unet_2d_condition.py       | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 3004ace012f6..90251214d7c6 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -891,7 +891,8 @@ def main(args):
         layer_name = name.split(".processor")[0]
         weights = {
             "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"],
-            "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"]}
+            "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"],
+        }
         if train_q_out:
             weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"]
             weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"]
diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index 3057a2e6503c..53a056c1d9bc 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -22,7 +22,7 @@
 from parameterized import parameterized
 
 from diffusers import UNet2DConditionModel
-from diffusers.models.attention_processor import LoRAAttnProcessor, CustomDiffusionAttnProcessor
+from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, LoRAAttnProcessor
 from diffusers.utils import (
     floats_tensor,
     load_hf_numpy,
@@ -87,7 +87,8 @@ def create_custom_diffusion_layers(model, mock_weights: bool = True):
         layer_name = name.split(".processor")[0]
         weights = {
             "to_k_custom_diffusion.weight": st[layer_name + ".to_k.weight"],
-            "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"]}
+            "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"],
+        }
         if train_q_out:
             weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"]
             weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"]

From 3960e407ac81aa8b4b8776f9c83c85dd69791c3a Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 19 Apr 2023 09:43:00 +0530
Subject: [PATCH 26/34] remove trailing whitespace.

---
 examples/custom_diffusion/train_custom_diffusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 90251214d7c6..299e1abd8323 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -92,7 +92,7 @@ def save_model_card(repo_id: str, images=None, base_model=str, prompt=str, repo_
 These are Custom Diffusion adaption weights for {base_model}. The weights were trained on {prompt} using [Custom Diffusion](https://www.cs.cmu.edu/~custom-diffusion). You can find some example images in the following. \n
 {img_str}
 
-\nFor more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion). 
+\nFor more details on the training, please follow [this link](https://github.com/huggingface/diffusers/blob/main/examples/custom_diffusion).
 """
     with open(os.path.join(repo_folder, "README.md"), "w") as f:
         f.write(yaml + model_card)

From d74070fa4699578fd8c4773a1588bb395ee2d93b Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 19 Apr 2023 09:43:57 +0530
Subject: [PATCH 27/34] fix: toctree entry.

---
 docs/source/en/_toctree.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 05edb649122e..de33ba616d0a 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -74,7 +74,7 @@
       title: ControlNet
     - local: training/instructpix2pix
       title: InstructPix2Pix Training
-    - local: Custom Diffusion
+    - local: training/custom_diffusion
       title: Custom Diffusion
     title: Training
   - sections:

From e947c194e241fb9f2c6c443b0350e4f117f73b46 Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Wed, 19 Apr 2023 10:25:26 +0530
Subject: [PATCH 28/34] remove unnecessary print.

---
 examples/custom_diffusion/train_custom_diffusion.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 299e1abd8323..62467f9ed14c 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -667,7 +667,6 @@ def main(args):
     # We need to initialize the trackers we use, and also store our configuration.
     # The trackers initializes automatically on the main process.
     if accelerator.is_main_process:
-        print(vars(args))
         accelerator.init_trackers("custom-diffusion", config=vars(args))
 
     # If passed along, set the training seed now.

From df2649f30588494b69aab2282663300a2c3c839f Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Wed, 19 Apr 2023 02:01:41 -0400
Subject: [PATCH 29/34] custom diffusion: drop stale weights argument in unet test helper

---
 tests/models/test_models_unet_2d_condition.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index 3057a2e6503c..f09883e45729 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -107,7 +107,6 @@ def create_custom_diffusion_layers(model, mock_weights: bool = True):
                     custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight += 1
         else:
             custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
-                weights,
                 train_kv=False,
                 train_q_out=False,
                 hidden_size=hidden_size,

From 4f97f3f950b6ae43c149f2e780406a50ad5c39f2 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Wed, 19 Apr 2023 02:12:21 -0400
Subject: [PATCH 30/34] custom diffusion: drop scale kwargs from unet processor tests

---
 tests/models/test_models_unet_2d_condition.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index 0009130f1b3e..2ec7026998df 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -640,15 +640,9 @@ def test_custom_diffusion_processors(self):
         model.set_attn_processor(model.attn_processors)
 
         with torch.no_grad():
-            sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
-            sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-            sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+            sample2 = model(**inputs_dict).sample
 
         assert (sample1 - sample2).abs().max() < 1e-4
-        assert (sample3 - sample4).abs().max() < 1e-4
-
-        # sample 2 and sample 3 should be different
-        assert (sample2 - sample3).abs().max() > 1e-4
 
     def test_custom_diffusion_save_load(self):
         # enable deterministic behavior for gradient checkpointing
@@ -667,7 +661,7 @@ def test_custom_diffusion_save_load(self):
         model.set_attn_processor(custom_diffusion_attn_procs)
 
         with torch.no_grad():
-            sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+            sample = model(**inputs_dict).sample
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save_attn_procs(tmpdirname)
@@ -678,7 +672,7 @@ def test_custom_diffusion_save_load(self):
             new_model.load_attn_procs(tmpdirname)
 
         with torch.no_grad():
-            new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+            new_sample = new_model(**inputs_dict).sample
 
         assert (sample - new_sample).abs().max() < 1e-4
 

From 9189ecd8d814701ede68276fbbf84a70c9df913e Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@Nupurs-MacBook-Pro.local>
Date: Wed, 19 Apr 2023 02:33:11 -0400
Subject: [PATCH 31/34] custom diffusion test

---
 examples/test_examples.py                     | 2 +-
 tests/models/test_models_unet_2d_condition.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/test_examples.py b/examples/test_examples.py
index 12dc76ed594e..5e38ef27a4f3 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -227,7 +227,7 @@ def test_custom_diffusion(self):
                 examples/custom_diffusion/train_custom_diffusion.py
                 --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
                 --instance_data_dir docs/source/en/imgs
-                --instance_prompt "photo of a <new1>"
+                --instance_prompt "<new1>"
                 --resolution 64
                 --train_batch_size 1
                 --gradient_accumulation_steps 1
diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py
index 2ec7026998df..2576297762a8 100644
--- a/tests/models/test_models_unet_2d_condition.py
+++ b/tests/models/test_models_unet_2d_condition.py
@@ -630,7 +630,7 @@ def test_custom_diffusion_processors(self):
         with torch.no_grad():
             sample1 = model(**inputs_dict).sample
 
-        custom_diffusion_attn_procs = create_custom_diffusion_layers(model)
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False)
 
         # make sure we can set a list of attention processors
         model.set_attn_processor(custom_diffusion_attn_procs)
@@ -657,7 +657,7 @@ def test_custom_diffusion_save_load(self):
         with torch.no_grad():
             old_sample = model(**inputs_dict).sample
 
-        custom_diffusion_attn_procs = create_custom_diffusion_layers(model)
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False)
         model.set_attn_processor(custom_diffusion_attn_procs)
 
         with torch.no_grad():
@@ -669,7 +669,7 @@ def test_custom_diffusion_save_load(self):
             torch.manual_seed(0)
             new_model = self.model_class(**init_dict)
             new_model.to(torch_device)
-            new_model.load_attn_procs(tmpdirname)
+            new_model.load_attn_procs(tmpdirname, weight_name="pytorch_custom_diffusion_weights.bin")
 
         with torch.no_grad():
             new_sample = new_model(**inputs_dict).sample
@@ -692,7 +692,7 @@ def test_custom_diffusion_xformers_on_off(self):
         torch.manual_seed(0)
         model = self.model_class(**init_dict)
         model.to(torch_device)
-        custom_diffusion_attn_procs = create_custom_diffusion_layers(model)
+        custom_diffusion_attn_procs = create_custom_diffusion_layers(model, mock_weights=False)
         model.set_attn_processor(custom_diffusion_attn_procs)
 
         # default

From 388b2cd609badc294fb0c3b5f8626db06313355d Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@nupurs-mbp.wifi.local.cmu.edu>
Date: Wed, 19 Apr 2023 15:30:25 -0400
Subject: [PATCH 32/34] custom diffusion xformer update

---
 .../train_custom_diffusion.py                 | 43 +++++++++++--------
 examples/test_examples.py                     |  2 +-
 src/diffusers/models/attention_processor.py   |  2 +-
 3 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index 62467f9ed14c..b42561f3cebb 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -49,7 +49,7 @@
     UNet2DConditionModel,
 )
 from diffusers.loaders import AttnProcsLayers
-from diffusers.models.attention_processor import CustomDiffusionAttnProcessor
+from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version, is_wandb_available
 from diffusers.utils.import_utils import is_xformers_available
@@ -167,6 +167,7 @@ def __init__(
         concepts_list,
         tokenizer,
         size=512,
+        mask_size=64,
         center_crop=False,
         with_prior_preservation=False,
         num_class_images=200,
@@ -174,6 +175,7 @@ def __init__(
         aug=True,
     ):
         self.size = size
+        self.mask_size = mask_size
         self.center_crop = center_crop
         self.tokenizer = tokenizer
         self.interpolation = Image.BILINEAR
@@ -223,6 +225,7 @@ def __len__(self):
 
     def preprocess(self, image, scale, resample):
         outer, inner = self.size, scale
+        factor = self.size // self.mask_size
         if scale > self.size:
             outer, inner = scale, self.size
         top, left = np.random.randint(0, outer - inner + 1), np.random.randint(0, outer - inner + 1)
@@ -230,13 +233,13 @@ def preprocess(self, image, scale, resample):
         image = np.array(image).astype(np.uint8)
         image = (image / 127.5 - 1.0).astype(np.float32)
         instance_image = np.zeros((self.size, self.size, 3), dtype=np.float32)
-        mask = np.zeros((self.size // 8, self.size // 8))
+        mask = np.zeros((self.size // factor, self.size // factor))
         if scale > self.size:
             instance_image = image[top : top + inner, left : left + inner, :]
-            mask = np.ones((self.size // 8, self.size // 8))
+            mask = np.ones((self.size // factor, self.size // factor))
         else:
             instance_image[top : top + inner, left : left + inner, :] = image
-            mask[top // 8 + 1 : (top + scale) // 8 - 1, left // 8 + 1 : (left + scale) // 8 - 1] = 1.0
+            mask[top // factor + 1 : (top + scale) // factor - 1, left // factor + 1 : (left + scale) // factor - 1] = 1.0
         return instance_image, mask
 
     def __getitem__(self, index):
@@ -858,6 +861,20 @@ def main(args):
     unet.to(accelerator.device, dtype=weight_dtype)
     vae.to(accelerator.device, dtype=weight_dtype)
 
+    attention_class = CustomDiffusionAttnProcessor
+    if args.enable_xformers_memory_efficient_attention:
+        if is_xformers_available():
+            import xformers
+
+            xformers_version = version.parse(xformers.__version__)
+            if xformers_version == version.parse("0.0.16"):
+                logger.warn(
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                )
+            attention_class = CustomDiffusionXFormersAttnProcessor
+        else:
+            raise ValueError("xformers is not available. Make sure it is installed correctly")
+
     # now we will add new Custom Diffusion weights to the attention layers
     # It's important to realize here how many attention weights will be added and of which sizes
     # The sizes of the attention layers consist only of two different variables:
@@ -897,7 +914,7 @@ def main(args):
             weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"]
             weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"]
         if cross_attention_dim is not None:
-            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
+            custom_diffusion_attn_procs[name] = attention_class(
                 train_kv=train_kv,
                 train_q_out=train_q_out,
                 hidden_size=hidden_size,
@@ -905,7 +922,7 @@ def main(args):
             ).to(unet.device)
             custom_diffusion_attn_procs[name].load_state_dict(weights)
         else:
-            custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor(
+            custom_diffusion_attn_procs[name] = attention_class(
                 train_kv=False,
                 train_q_out=False,
                 hidden_size=hidden_size,
@@ -917,19 +934,6 @@ def main(args):
 
     accelerator.register_for_checkpointing(custom_diffusion_layers)
 
-    if args.enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
-            import xformers
-
-            xformers_version = version.parse(xformers.__version__)
-            if xformers_version == version.parse("0.0.16"):
-                logger.warn(
-                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                )
-            unet.enable_xformers_memory_efficient_attention()
-        else:
-            raise ValueError("xformers is not available. Make sure it is installed correctly")
-
     if args.gradient_checkpointing:
         unet.enable_gradient_checkpointing()
         if args.modifier_token is not None:
@@ -976,6 +980,7 @@ def main(args):
         tokenizer=tokenizer,
         with_prior_preservation=args.with_prior_preservation,
         size=args.resolution,
+        mask_size=vae.encode(torch.randn(1, 3, args.resolution, args.resolution).to(dtype=weight_dtype).to(accelerator.device)).latent_dist.sample().size()[-1],
         center_crop=args.center_crop,
         num_class_images=args.num_class_images,
         hflip=args.hflip,
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 5e38ef27a4f3..53e84538206f 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -232,7 +232,7 @@ def test_custom_diffusion(self):
                 --train_batch_size 1
                 --gradient_accumulation_steps 1
                 --max_train_steps 2
-                --learning_rate 5.0e-04
+                --learning_rate 1.0e-05
                 --scale_lr
                 --lr_scheduler constant
                 --lr_warmup_steps 0
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 6040b2b1446e..b8787aed91f2 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -816,9 +816,9 @@ def __init__(
         self.train_kv = train_kv
         self.train_q_out = train_q_out
 
-        self.attention_op = attention_op
         self.hidden_size = hidden_size
         self.cross_attention_dim = cross_attention_dim
+        self.attention_op = attention_op
 
         # `_custom_diffusion` id for easy serialization and loading.
         if self.train_kv:

From 097f5bca6f70234ba4e733538e8ea4c3ff0d87b6 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@nupurs-mbp.wifi.local.cmu.edu>
Date: Wed, 19 Apr 2023 15:33:47 -0400
Subject: [PATCH 33/34] custom diffusion: rewrap long lines in train_custom_diffusion.py

---
 examples/custom_diffusion/train_custom_diffusion.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py
index b42561f3cebb..49b05e6b5db3 100644
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -239,7 +239,9 @@ def preprocess(self, image, scale, resample):
             mask = np.ones((self.size // factor, self.size // factor))
         else:
             instance_image[top : top + inner, left : left + inner, :] = image
-            mask[top // factor + 1 : (top + scale) // factor - 1, left // factor + 1 : (left + scale) // factor - 1] = 1.0
+            mask[
+                top // factor + 1 : (top + scale) // factor - 1, left // factor + 1 : (left + scale) // factor - 1
+            ] = 1.0
         return instance_image, mask
 
     def __getitem__(self, index):
@@ -980,7 +982,11 @@ def main(args):
         tokenizer=tokenizer,
         with_prior_preservation=args.with_prior_preservation,
         size=args.resolution,
-        mask_size=vae.encode(torch.randn(1, 3, args.resolution, args.resolution).to(dtype=weight_dtype).to(accelerator.device)).latent_dist.sample().size()[-1],
+        mask_size=vae.encode(
+            torch.randn(1, 3, args.resolution, args.resolution).to(dtype=weight_dtype).to(accelerator.device)
+        )
+        .latent_dist.sample()
+        .size()[-1],
         center_crop=args.center_crop,
         num_class_images=args.num_class_images,
         hflip=args.hflip,

From 350414bc2cc8894d1481e258a5454f6c093a2287 Mon Sep 17 00:00:00 2001
From: Nupur Kumari <nupurkumari@nupurs-mbp.wifi.local.cmu.edu>
Date: Wed, 19 Apr 2023 15:46:54 -0400
Subject: [PATCH 34/34] custom diffusion: unquote <new1> in example test arguments

---
 examples/test_examples.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/test_examples.py b/examples/test_examples.py
index 53e84538206f..a77fa4c7da23 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -227,7 +227,7 @@ def test_custom_diffusion(self):
                 examples/custom_diffusion/train_custom_diffusion.py
                 --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
                 --instance_data_dir docs/source/en/imgs
-                --instance_prompt "<new1>"
+                --instance_prompt <new1>
                 --resolution 64
                 --train_batch_size 1
                 --gradient_accumulation_steps 1
@@ -236,7 +236,7 @@ def test_custom_diffusion(self):
                 --scale_lr
                 --lr_scheduler constant
                 --lr_warmup_steps 0
-                --modifier_token "<new1>"
+                --modifier_token <new1>
                 --output_dir {tmpdir}
                 """.split()