unCLIP variant #2297

Merged · 16 commits · Feb 14, 2023
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -154,6 +154,8 @@
title: Stable Diffusion
- local: api/pipelines/stable_diffusion_2
title: Stable Diffusion 2
- local: api/pipelines/stable_unclip
title: Stable unCLIP
- local: api/pipelines/stochastic_karras_ve
title: Stochastic Karras VE
- local: api/pipelines/unclip
2 changes: 2 additions & 0 deletions docs/source/en/api/pipelines/overview.mdx
@@ -64,6 +64,8 @@ available a colab notebook to directly try them out.
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
| [stable_diffusion_safe](./stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
| [unclip](./unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
2 changes: 1 addition & 1 deletion docs/source/en/api/pipelines/stable_diffusion/text2img.mdx
@@ -17,7 +17,7 @@ specific language governing permissions and limitations under the License.
The Stable Diffusion model was created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [runway](https://github.com/runwayml), and [LAION](https://laion.ai/). The [`StableDiffusionPipeline`] is capable of generating photo-realistic images given any text input using Stable Diffusion.

The original codebase can be found here:
- *Stable Diffusion V1*: [CampVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
- *Stable Diffusion V1*: [CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion)
**Contributor** commented: Thanks!

- *Stable Diffusion v2*: [Stability-AI/stablediffusion](https://github.com/Stability-AI/stablediffusion)

Available Checkpoints are:
97 changes: 97 additions & 0 deletions docs/source/en/api/pipelines/stable_unclip.mdx
@@ -0,0 +1,97 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Stable unCLIP

Stable unCLIP checkpoints are fine-tuned from [Stable Diffusion 2.1](./stable_diffusion_2) checkpoints to condition on CLIP image embeddings, while still conditioning on text embeddings. Given these two separate conditionings, stable unCLIP can be used
for text-guided image variation. When combined with an unCLIP prior, it can also be used for full text-to-image generation.

## Tips

Stable unCLIP takes a `noise_level` argument during inference, which determines how much noise is added
to the image embeddings. A higher `noise_level` increases variation in the final un-noised images. By default,
no additional noise is added to the image embeddings (`noise_level = 0`).
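
For example, a non-default `noise_level` can be passed at call time. This is a minimal sketch, assuming `__call__` accepts `noise_level` directly and using the same placeholder checkpoint path as the examples below:

```python
import torch
from diffusers import StableUnCLIPPipeline

pipe = StableUnCLIPPipeline.from_pretrained(
    "fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
)  # TODO update model path
pipe = pipe.to("cuda")

# noise_level=0 (the default) adds no extra noise to the image embeddings;
# higher values trade fidelity to the conditioning for sample diversity.
images = pipe("a photo of an astronaut riding a horse on mars", noise_level=100).images
```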

### Available checkpoints:

TODO

### Text-to-Image Generation

```python
import torch
from diffusers import StableUnCLIPPipeline

pipe = StableUnCLIPPipeline.from_pretrained(
"fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
) # TODO update model path
pipe = pipe.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
images = pipe(prompt).images
images[0].save("astronaut_horse.png")
```


### Text-Guided Image-to-Image Variation

```python
import requests
import torch
from PIL import Image
from io import BytesIO

from diffusers import StableUnCLIPImg2ImgPipeline

pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
"fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16
) # TODO update model path
pipe = pipe.to("cuda")

url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"

response = requests.get(url)
init_image = Image.open(BytesIO(response.content)).convert("RGB")
init_image = init_image.resize((768, 512))

prompt = "A fantasy landscape, trending on artstation"

images = pipe(prompt, init_image).images
images[0].save("fantasy_landscape.png")
```

### StableUnCLIPPipeline

[[autodoc]] StableUnCLIPPipeline
- all
- __call__
- enable_attention_slicing
- disable_attention_slicing
- enable_vae_slicing
- disable_vae_slicing
- enable_xformers_memory_efficient_attention
- disable_xformers_memory_efficient_attention


### StableUnCLIPImg2ImgPipeline

[[autodoc]] StableUnCLIPImg2ImgPipeline
- all
- __call__
- enable_attention_slicing
- disable_attention_slicing
- enable_vae_slicing
- disable_vae_slicing
- enable_xformers_memory_efficient_attention
- disable_xformers_memory_efficient_attention

2 changes: 2 additions & 0 deletions docs/source/en/index.mdx
@@ -54,6 +54,8 @@ available a colab notebook to directly try them out.
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting |
| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
| [stable_unclip](./api/pipelines/stable_unclip) | **Stable unCLIP** | Text-to-Image Generation |
| [stable_unclip](./api/pipelines/stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation |
| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation |
| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation |
23 changes: 23 additions & 0 deletions scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -100,6 +100,26 @@
)
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
parser.add_argument(
"--stable_unclip",
type=str,
default=None,
required=False,
help="Set if this is a stable unCLIP model. One of 'txt2img' or 'img2img'.",
)
parser.add_argument(
"--stable_unclip_prior",
type=str,
default=None,
required=False,
help="Set if this is a stable unCLIP txt2img model. Selects which prior to use. If `--stable_unclip` is set to `txt2img`, the karlo prior (https://huggingface.co/kakaobrain/karlo-v1-alpha/tree/main/prior) is selected by default.",
)
parser.add_argument(
"--clip_stats_path",
type=str,
help="Path to the clip stats file. Only required if the stable unclip model's config specifies `model.params.noise_aug_config.params.clip_stats_path`.",
required=False,
)
args = parser.parse_args()

pipe = load_pipeline_from_original_stable_diffusion_ckpt(
@@ -114,5 +134,8 @@
upcast_attention=args.upcast_attention,
from_safetensors=args.from_safetensors,
device=args.device,
stable_unclip=args.stable_unclip,
stable_unclip_prior=args.stable_unclip_prior,
clip_stats_path=args.clip_stats_path,
)
pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
2 changes: 2 additions & 0 deletions src/diffusers/__init__.py
@@ -119,6 +119,8 @@
StableDiffusionPipeline,
StableDiffusionPipelineSafe,
StableDiffusionUpscalePipeline,
StableUnCLIPImg2ImgPipeline,
StableUnCLIPPipeline,
UnCLIPImageVariationPipeline,
UnCLIPPipeline,
VersatileDiffusionDualGuidedPipeline,
20 changes: 18 additions & 2 deletions src/diffusers/models/unet_2d_condition.py
@@ -91,7 +91,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`.
class_embed_type (`str`, *optional*, defaults to None): The type of class embedding to use which is ultimately
summed with the time embeddings. Choose from `None`, `"timestep"`, or `"identity"`.
summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, or `"projection"`.
num_class_embeds (`int`, *optional*, defaults to None):
Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
class conditioning with `class_embed_type` equal to `None`.
@@ -102,7 +102,9 @@ class conditioning with `class_embed_type` equal to `None`.
time_cond_proj_dim (`int`, *optional*, default to `None`):
The dimension of `cond_proj` layer in timestep embedding.
conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
conv_out_kernel (`int`, *optional*, default to `3`): the Kernel size of `conv_out` layer.
conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`.
"""

_supports_gradient_checkpointing = True
@@ -145,6 +147,7 @@ def __init__(
time_cond_proj_dim: Optional[int] = None,
conv_in_kernel: int = 3,
conv_out_kernel: int = 3,
projection_class_embeddings_input_dim: Optional[int] = None,
):
super().__init__()

@@ -211,6 +214,19 @@ def __init__(
self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
elif class_embed_type == "identity":
self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
elif class_embed_type == "projection":
if projection_class_embeddings_input_dim is None:
raise ValueError(
"`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
)
# The projection `class_embed_type` is the same as the timestep `class_embed_type` except
# 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
# 2. it projects from an arbitrary input dimension.
#
# Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
# When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
# As a result, `TimestepEmbedding` can be passed arbitrary vectors.
self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
else:
self.class_embedding = None

2 changes: 2 additions & 0 deletions src/diffusers/pipelines/__init__.py
@@ -55,6 +55,8 @@
StableDiffusionLatentUpscalePipeline,
StableDiffusionPipeline,
StableDiffusionUpscalePipeline,
StableUnCLIPImg2ImgPipeline,
StableUnCLIPPipeline,
)
from .stable_diffusion_safe import StableDiffusionPipelineSafe
from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
3 changes: 3 additions & 0 deletions src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -45,7 +45,10 @@ class StableDiffusionPipelineOutput(BaseOutput):
from .pipeline_stable_diffusion_instruct_pix2pix import StableDiffusionInstructPix2PixPipeline
from .pipeline_stable_diffusion_latent_upscale import StableDiffusionLatentUpscalePipeline
from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline
from .pipeline_stable_unclip import StableUnCLIPPipeline
from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline
from .safety_checker import StableDiffusionSafetyChecker
from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer

try:
if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")):