From 7ce3745a7657f34ebb067c3a9ddd24690ffe748c Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 5 Oct 2023 17:36:35 +0200 Subject: [PATCH 01/88] Check in 23-10-05 --- .../models/unet_2d_condition_control.py | 322 ++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 src/diffusers/models/unet_2d_condition_control.py diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/unet_2d_condition_control.py new file mode 100644 index 000000000000..fe92ec4adbe7 --- /dev/null +++ b/src/diffusers/models/unet_2d_condition_control.py @@ -0,0 +1,322 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from ..configuration_utils import ConfigMixin, register_to_config +from ..loaders import UNet2DConditionLoadersMixin +from ..utils import BaseOutput, logging +from .activations import get_activation +from .attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from .embeddings import ( + GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, + PositionNet, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) +from .modeling_utils import ModelMixin +from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, + CrossAttnUpBlock2D, + UpBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UNetMidBlock2DCrossAttn, + get_down_block, + get_up_block, +) +from .unet_2d_condition import UNet2DConditionModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# # # Notes Umer +# To integrate controlnet-xs, I need to +# 1. Create an ControlNet-xs class +# 2. Enable it to load from hub (via .from_pretrained) +# 3. Make sure it runs with all controlnet pipelines +# +# Notes & Questions +# I: Controlnet-xs has a slightly different architecture than controlnet, +# as the encoders of the base and the controller are connected. +# Q: Do I have to adjust all pipelines? +# +# Q: There are controlnet-xs models for sd-xl and sd-2.1. Does that mean I need to have multiple pipelines? +# A: Yes. 
For the original controlnet, there are 8 pipelines: {sd-xl, sd-2.1} x {normal, img2img, inpainting} + flax + multicontrolnet +# # # + + +@dataclass +class UNet2DConditionOutput(BaseOutput): + sample: torch.FloatTensor = None + + +class ControlledUNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + + def __init__( + self, + in_channels, + model_channels, + out_channels, + hint_channels, + num_res_blocks, + attention_resolutions, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + act_fn: str = "silu", + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + ): + super().__init__() + + # 1 - Save parameters + # TODO make variables + self.control_mode = "canny" + self.learn_embedding = False + self.infusion2control = "cat" + self.infusion2base = "add" + self.in_ch_factor = 1 if "cat" == 'add' else 2 + self.guiding = "encoder" + self.two_stream_mode = "cross" + self.control_model_ratio = 1.0 + self.out_channels = out_channels + self.dims = 2 + self.model_channels = model_channels + self.no_control = False + self.control_scale = 1.0 + + self.hint_model = None + + # Time embedding + if time_embedding_type == "fourier": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos + ) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." + ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + # 2 - Create base and control model + # TODO 1. create base model, or 2. 
pass it + self.base_model = base_model = UNet2DConditionModel() + # TODO create control model + self.control_model = ctrl_model = UNet2DConditionModel() + + + # 3 - Gather Channel Sizes + ch_inout_ctrl = {'enc': [], 'mid': [], 'dec': []} + ch_inout_base = {'enc': [], 'mid': [], 'dec': []} + + # 3.1 - input convolution + ch_inout_ctrl['enc'].append((ctrl_model.conv_in.in_channels, ctrl_model.conv_in.out_channels)) + ch_inout_base['enc'].append((base_model.conv_in.in_channels, base_model.conv_in.out_channels)) + + # 3.2 - encoder blocks + for module in ctrl_model.down_blocks: + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + for r in module.resnets: + ch_inout_ctrl['enc'].append((r.in_channels, r.out_channels)) + if module.downsamplers: + ch_inout_ctrl['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) + else: + raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + + for module in base_model.down_blocks: + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + for r in module.resnets: + ch_inout_base['enc'].append((r.in_channels, r.out_channels)) + if module.downsamplers: + ch_inout_base['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) + else: + raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + + # 3.3 - middle block + ch_inout_ctrl['mid'].append((ctrl_model.mid_block.resnets[0].in_channels, ctrl_model.mid_block.resnets[0].in_channels)) + ch_inout_base['mid'].append((base_model.mid_block.resnets[0].in_channels, base_model.mid_block.resnets[0].in_channels)) + + # 3.4 - decoder blocks + for module in base_model.up_blocks: + if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)): + for r in module.resnets: + ch_inout_base['dec'].append((r.in_channels, r.out_channels)) + else: + raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + + self.ch_inout_ctrl = ch_inout_ctrl + self.ch_inout_base = ch_inout_base + + # 4 - Build connections between base and control model + self.enc_zero_convs_out = nn.ModuleList([]) + self.enc_zero_convs_in = nn.ModuleList([]) + + self.middle_block_out = nn.ModuleList([]) + self.middle_block_in = nn.ModuleList([]) + + self.dec_zero_convs_out = nn.ModuleList([]) + self.dec_zero_convs_in = nn.ModuleList([]) + + for ch_io_base in ch_inout_base['enc']: + self.enc_zero_convs_in.append(self.make_zero_conv( + in_channels=ch_io_base[1], out_channels=ch_io_base[1]) + ) + + self.middle_block_out = self.make_zero_conv(ch_inout_ctrl['mid'][-1][1], ch_inout_base['mid'][-1][1]) + + self.dec_zero_convs_out.append( + self.make_zero_conv(ch_inout_ctrl['enc'][-1][1], ch_inout_base['mid'][-1][1]) + ) + for i in range(1, len(ch_inout_ctrl['enc'])): + self.dec_zero_convs_out.append( + self.make_zero_conv(ch_inout_ctrl['enc'][-(i + 1)][1], ch_inout_base['dec'][i - 1][1]) + ) + + # 5 - Input hint block TODO: Understand + self.input_hint_block = nn.Sequential( + nn.Conv2d(hint_channels, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 32, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(32, 32, 3, padding=1), + nn.SiLU(), + nn.Conv2d(32, 96, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(96, 96, 3, padding=1), + nn.SiLU(), + nn.Conv2d(96, 256, 3, padding=1, stride=2), + nn.SiLU(), + zero_module(nn.Conv2d(256, int(model_channels * self.control_model_ratio), 3, padding=1)) + ) + + self.scale_list = [1.] 
* len(self.enc_zero_convs_out) + [1.] + [1.] * len(self.dec_zero_convs_out) + self.register_buffer('scale_list', torch.tensor(self.scale_list)) + + + def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, hint: torch.Tensor, no_control=False, **kwargs): + # # # Params from unet_2d_condition.UNet2DConditionModel.forward: + # self, + # sample: torch.FloatTensor, + # timestep: Union[torch.Tensor, float, int], + # encoder_hidden_states: torch.Tensor, + # class_labels: Optional[torch.Tensor] = None, + # timestep_cond: Optional[torch.Tensor] = None, + # attention_mask: Optional[torch.Tensor] = None, + # cross_attention_kwargs: Optional[Dict[str, Any]] = None, + # added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + # down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + # mid_block_additional_residual: Optional[torch.Tensor] = None, + # encoder_attention_mask: Optional[torch.Tensor] = None, + # return_dict: bool = True, + # + + x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1) + if x.size(0) // 2 == hint.size(0): hint = torch.cat([hint, hint], dim=0) # for classifier free guidance + + timesteps=t + context=c.get("crossattn", None) + y=c.get("vector", None) + + if no_control: return self.base_model(x=x, timesteps=timesteps, context=context, y=y, **kwargs) + + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) + if self.learn_embedding: emb = self.control_model.time_embed(t_emb) * self.control_scale ** 0.3 + self.base_model.time_embed(t_emb) * (1 - control_scale ** 0.3) + else: emb = self.base_model.time_embed(t_emb) + + if y is not None: emb = emb + self.base_model.label_emb(y) + + if precomputed_hint: guided_hint = hint + else: guided_hint = self.input_hint_block(hint, emb, context) + + h_ctr = h_base = x + hs_base, hs_ctr = [], [] + it_enc_convs_in, it_enc_convs_out, it_dec_convs_in, it_dec_convs_out = map(iter, (self.enc_zero_convs_in, self.enc_zero_convs_out, self.dec_zero_convs_in, self.dec_zero_convs_out)) + scales = iter(self.scale_list) + + # Cross Control + # 1 - input blocks (encoder) + for module_base, module_ctr in zip(self.base_model.down_blocks, self.control_model.down_blocks): + h_base = module_base(h_base, emb, context) + h_ctr = module_ctr(h_ctr, emb, context) + if guided_hint is not None: + h_ctr = h_ctr + guided_hint + guided_hint = None + hs_base.append(h_base) + hs_ctr.append(h_ctr) + h_ctr = torch.cat([h_ctr, next(it_enc_convs_in)(h_base, emb)], dim=1) + # 2 - mid blocks (bottleneck) + h_base = self.base_model.mid_block(h_base, emb, context) + h_ctr = self.control_model.mid_block(h_ctr, emb, context) + h_base = h_base + self.middle_block_out(h_ctr, emb) * next(scales) + # 3 - output blocks (decoder) + for module_base in self.base_model.output_blocks: + h_base = h_base + next(it_dec_convs_out)(hs_ctr.pop(), emb) * next(scales) + h_base = torch.cat([h_base, hs_base.pop()], dim=1) + h_base = module_base(h_base, emb, context) + + return self.base_model.out(h_base) + + + + def make_zero_conv(self, in_channels, out_channels=None): + # keep running track # todo: better comment + self.in_channels = in_channels + self.out_channels = out_channels or in_channels + return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0)) + + +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module From 70d58d905afb9d230724ec6a6a7e848858dcf764 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 6 Oct 2023 19:45:11 +0200 Subject: 
[PATCH 02/88] check-in 23-10-06 --- .../models/unet_2d_condition_control.py | 65 ++++++++++--------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/unet_2d_condition_control.py index fe92ec4adbe7..54529a16a886 100644 --- a/src/diffusers/models/unet_2d_condition_control.py +++ b/src/diffusers/models/unet_2d_condition_control.py @@ -18,28 +18,15 @@ import torch.nn as nn import torch.utils.checkpoint -from ..configuration_utils import ConfigMixin, register_to_config +from ..configuration_utils import ConfigMixin from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging -from .activations import get_activation -from .attention_processor import ( - ADDED_KV_ATTENTION_PROCESSORS, - CROSS_ATTENTION_PROCESSORS, - AttentionProcessor, - AttnAddedKVProcessor, - AttnProcessor, -) + from .embeddings import ( GaussianFourierProjection, - ImageHintTimeEmbedding, - ImageProjection, - ImageTimeEmbedding, - PositionNet, - TextImageProjection, - TextImageTimeEmbedding, - TextTimeEmbedding, TimestepEmbedding, Timesteps, + get_timestep_embedding ) from .modeling_utils import ModelMixin from .unet_2d_blocks import ( @@ -47,11 +34,6 @@ DownBlock2D, CrossAttnUpBlock2D, UpBlock2D, - UNetMidBlock2DCrossAttn, - UNetMidBlock2DSimpleCrossAttn, - UNetMidBlock2DCrossAttn, - get_down_block, - get_up_block, ) from .unet_2d_condition import UNet2DConditionModel @@ -120,6 +102,9 @@ def __init__( self.hint_model = None + self.flip_sin_to_cos = flip_sin_to_cos + self.freq_shift = freq_shift + # Time embedding if time_embedding_type == "fourier": time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 @@ -240,12 +225,12 @@ def __init__( zero_module(nn.Conv2d(256, int(model_channels * self.control_model_ratio), 3, padding=1)) ) - self.scale_list = [1.] * len(self.enc_zero_convs_out) + [1.] + [1.] * len(self.dec_zero_convs_out) - self.register_buffer('scale_list', torch.tensor(self.scale_list)) + scale_list = [1.] * len(self.enc_zero_convs_out) + [1.] + [1.] 
* len(self.dec_zero_convs_out) + self.register_buffer('scale_list', torch.tensor(scale_list)) def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, hint: torch.Tensor, no_control=False, **kwargs): - # # # Params from unet_2d_condition.UNet2DConditionModel.forward: + """ Params from unet_2d_condition.UNet2DConditionModel.forward: # self, # sample: torch.FloatTensor, # timestep: Union[torch.Tensor, float, int], @@ -259,25 +244,40 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, hint: torch.Tensor, # mid_block_additional_residual: Optional[torch.Tensor] = None, # encoder_attention_mask: Optional[torch.Tensor] = None, # return_dict: bool = True, - # + """ + # # < from forward x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1) if x.size(0) // 2 == hint.size(0): hint = torch.cat([hint, hint], dim=0) # for classifier free guidance timesteps=t context=c.get("crossattn", None) y=c.get("vector", None) + # # /> + # # < from forward_ if no_control: return self.base_model(x=x, timesteps=timesteps, context=context, y=y, **kwargs) - t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) - if self.learn_embedding: emb = self.control_model.time_embed(t_emb) * self.control_scale ** 0.3 + self.base_model.time_embed(t_emb) * (1 - control_scale ** 0.3) - else: emb = self.base_model.time_embed(t_emb) + # # Warning for Umer: What I & cnxs call 'projection', diffusers calls 'embedding'; and vice versa + # Code from cnxs: + #t_emb = self.time_embedding(timesteps, self.model_channels, repeat_only=False) + #if self.learn_embedding: emb = self.control_model.time_embed(t_emb) * self.control_scale ** 0.3 + self.base_model.time_embed(t_emb) * (1 - control_scale ** 0.3) + #else: emb = self.base_model.time_embed(t_emb) + + t_emb = get_timestep_embedding( + timesteps, + self.model_channels, + # # TODO: Undetrstand flip_sin_to_cos / (downscale_)freq_shift + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.freq_shift, + ) + + # self.learn_embedding == False + emb = self.base_model.time_embedding(t_emb) if y is not None: emb = emb + self.base_model.label_emb(y) - if precomputed_hint: guided_hint = hint - else: guided_hint = self.input_hint_block(hint, emb, context) + guided_hint = self.input_hint_block(hint, emb, context) h_ctr = h_base = x hs_base, hs_ctr = [], [] @@ -285,6 +285,9 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, hint: torch.Tensor, scales = iter(self.scale_list) # Cross Control + # 0 - conv in + h_base = self.base_model.conv_in(h_base) + h_ctrl = self.control_model.conv_in(h_ctrl) # 1 - input blocks (encoder) for module_base, module_ctr in zip(self.base_model.down_blocks, self.control_model.down_blocks): h_base = module_base(h_base, emb, context) @@ -306,7 +309,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, hint: torch.Tensor, h_base = module_base(h_base, emb, context) return self.base_model.out(h_base) - + # # /> def make_zero_conv(self, in_channels, out_channels=None): From 762bdfd8e49f5de55ac62d2adfbba7b531f28d6d Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Sat, 7 Oct 2023 14:29:05 +0200 Subject: [PATCH 03/88] check-in 23-10-07 2pm --- .../models/unet_2d_condition_control.py | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/unet_2d_condition_control.py index 54529a16a886..7a3e7b788ef0 100644 --- 
a/src/diffusers/models/unet_2d_condition_control.py +++ b/src/diffusers/models/unet_2d_condition_control.py @@ -81,6 +81,8 @@ def __init__( time_cond_proj_dim: Optional[int] = None, flip_sin_to_cos: bool = True, freq_shift: int = 0, + encoder_hid_dim: Optional[int] = 768, # Note Umer: should not be hard coded, but okay for minimal functional run - this comes from the text encoder output shape + cross_attention_dim: Union[int, Tuple[int]] = 1280, # Note Umer: should not be hard coded, but okay for minimal functional run - this from the unet shapes ): super().__init__() @@ -131,6 +133,8 @@ def __init__( post_act_fn=timestep_post_act, cond_proj_dim=time_cond_proj_dim, ) + # Text embedding + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) # 2 - Create base and control model # TODO 1. create base model, or 2. pass it @@ -229,7 +233,7 @@ def __init__( self.register_buffer('scale_list', torch.tensor(scale_list)) - def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, hint: torch.Tensor, no_control=False, **kwargs): + def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch.Tensor, c: dict, hint: torch.Tensor, no_control=False, **kwargs): """ Params from unet_2d_condition.UNet2DConditionModel.forward: # self, # sample: torch.FloatTensor, @@ -264,23 +268,27 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, hint: torch.Tensor, #if self.learn_embedding: emb = self.control_model.time_embed(t_emb) * self.control_scale ** 0.3 + self.base_model.time_embed(t_emb) * (1 - control_scale ** 0.3) #else: emb = self.base_model.time_embed(t_emb) + # time embeddings t_emb = get_timestep_embedding( timesteps, self.model_channels, # # TODO: Undetrstand flip_sin_to_cos / (downscale_)freq_shift flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.freq_shift, - ) - + ) # self.learn_embedding == False - emb = self.base_model.time_embedding(t_emb) + temb = self.base_model.time_embedding(t_emb) - if y is not None: emb = emb + self.base_model.label_emb(y) + if y is not None: emb = emb + self.base_model.label_emb(y) # ?? - sth with class-conditioning + # text embeddings + cemb = self.encoder_hid_proj(encoder_hidden_states) # Q: use the base/ctrl models' encoder_hid_proj? 
Need to make sure dims fit + + emb = temb + cemb guided_hint = self.input_hint_block(hint, emb, context) - h_ctr = h_base = x - hs_base, hs_ctr = [], [] + h_ctrl = h_base = x + hs_base, hs_ctrl = [], [] it_enc_convs_in, it_enc_convs_out, it_dec_convs_in, it_dec_convs_out = map(iter, (self.enc_zero_convs_in, self.enc_zero_convs_out, self.dec_zero_convs_in, self.dec_zero_convs_out)) scales = iter(self.scale_list) @@ -289,22 +297,22 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, c: dict, hint: torch.Tensor, h_base = self.base_model.conv_in(h_base) h_ctrl = self.control_model.conv_in(h_ctrl) # 1 - input blocks (encoder) - for module_base, module_ctr in zip(self.base_model.down_blocks, self.control_model.down_blocks): - h_base = module_base(h_base, emb, context) - h_ctr = module_ctr(h_ctr, emb, context) + for module_base, module_ctrl in zip(self.base_model.down_blocks, self.control_model.down_blocks): + h_base = module_base(h_base, temb, cemb, context)[0] # Note Umer: module_base returns hidden_states and running output list + h_ctrl = module_ctrl(h_ctrl, temb, cemb, context)[0] # see above if guided_hint is not None: - h_ctr = h_ctr + guided_hint + h_ctrl = h_ctrl + guided_hint guided_hint = None hs_base.append(h_base) - hs_ctr.append(h_ctr) - h_ctr = torch.cat([h_ctr, next(it_enc_convs_in)(h_base, emb)], dim=1) + hs_ctrl.append(h_ctrl) + h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base, emb)], dim=1) # 2 - mid blocks (bottleneck) h_base = self.base_model.mid_block(h_base, emb, context) - h_ctr = self.control_model.mid_block(h_ctr, emb, context) - h_base = h_base + self.middle_block_out(h_ctr, emb) * next(scales) + h_ctrl = self.control_model.mid_block(h_ctrl, emb, context) + h_base = h_base + self.middle_block_out(h_ctrl, emb) * next(scales) # 3 - output blocks (decoder) for module_base in self.base_model.output_blocks: - h_base = h_base + next(it_dec_convs_out)(hs_ctr.pop(), emb) * next(scales) + h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop(), emb) * next(scales) h_base = torch.cat([h_base, hs_base.pop()], dim=1) h_base = module_base(h_base, emb, context) From 267ca004867399ee9cddd9c9b78d9dff0a3a0fc0 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Sun, 8 Oct 2023 23:23:10 +0200 Subject: [PATCH 04/88] check-in 23-10-08 --- .../models/unet_2d_condition_control.py | 45 ++++++++++++++++--- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/unet_2d_condition_control.py index 7a3e7b788ef0..6de9b4a7214a 100644 --- a/src/diffusers/models/unet_2d_condition_control.py +++ b/src/diffusers/models/unet_2d_condition_control.py @@ -18,6 +18,8 @@ import torch.nn as nn import torch.utils.checkpoint +from torch.nn.modules.normalization import GroupNorm + from ..configuration_utils import ConfigMixin from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging @@ -28,6 +30,7 @@ Timesteps, get_timestep_embedding ) +from .lora import LoRACompatibleConv from .modeling_utils import ModelMixin from .unet_2d_blocks import ( CrossAttnDownBlock2D, @@ -140,7 +143,9 @@ def __init__( # TODO 1. create base model, or 2. 
pass it self.base_model = base_model = UNet2DConditionModel() # TODO create control model - self.control_model = ctrl_model = UNet2DConditionModel() + self.control_model = ctrl_model = UNet2DConditionModel(block_out_channels=[32,64,128,128]) # todo: make variable + for i, base_channels in enumerate(block_out_channels[:-1]): + increase_block_input(self.control_model, block_no=i+1, by=base_channels) # 3 - Gather Channel Sizes @@ -296,16 +301,18 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch # 0 - conv in h_base = self.base_model.conv_in(h_base) h_ctrl = self.control_model.conv_in(h_ctrl) + + if guided_hint is not None: + h_ctrl = h_ctrl + guided_hint + guided_hint = None # 1 - input blocks (encoder) for module_base, module_ctrl in zip(self.base_model.down_blocks, self.control_model.down_blocks): h_base = module_base(h_base, temb, cemb, context)[0] # Note Umer: module_base returns hidden_states and running output list h_ctrl = module_ctrl(h_ctrl, temb, cemb, context)[0] # see above - if guided_hint is not None: - h_ctrl = h_ctrl + guided_hint - guided_hint = None + hs_base.append(h_base) hs_ctrl.append(h_ctrl) - h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base, emb)], dim=1) + h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) # 2 - mid blocks (bottleneck) h_base = self.base_model.mid_block(h_base, emb, context) h_ctrl = self.control_model.mid_block(h_ctrl, emb, context) @@ -327,6 +334,34 @@ def make_zero_conv(self, in_channels, out_channels=None): return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0)) +def increase_block_input(unet, block_no, by): + """Double the channels size in a unet down block""" + assert block_no!=0, "Only after block 0 do we have info to pass from base to control, so you probably didn't mean block_no=0." + r=unet.down_blocks[block_no].resnets[0] + old_norm1, old_conv1, old_conv_shortcut = r.norm1,r.conv1,r.conv_shortcut + # norm + norm_args = 'num_groups num_channels eps affine'.split(' ') + for a in norm_args: assert hasattr(old_norm1, a) + norm_kwargs = { a: getattr(old_norm1, a) for a in norm_args } + norm_kwargs['num_channels'] += by # surgery done here + # conv1 + conv1_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') + for a in conv1_args: assert hasattr(old_conv1, a) + conv1_kwargs = { a: getattr(old_conv1, a) for a in conv1_args } + conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. + conv1_kwargs['in_channels'] += by # surgery done here + # conv_shortcut + if old_conv_shortcut is not None: + conv_shortcut_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') + for a in conv_shortcut_args: assert hasattr(old_conv_shortcut, a) + conv_shortcut_args_kwargs = { a: getattr(old_conv_shortcut, a) for a in conv_shortcut_args } + conv_shortcut_args_kwargs['bias'] = 'bias' in conv_shortcut_args_kwargs # as param, bias is a boolean, but as attr, it's a tensor. 
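# Side note on the bias handling above (editorial sketch, not part of the original patch): since 'bias' is
# itself one of the copied attribute names, `'bias' in conv_shortcut_args_kwargs` always evaluates to True.
# A check such as
#     conv_shortcut_args_kwargs['bias'] = old_conv_shortcut.bias is not None
# would also preserve a bias-free conv; with diffusers' default ResnetBlock2D settings both convs do carry
# a bias, so the simpler form works in practice.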
+ conv_shortcut_args_kwargs['in_channels'] += by # surgery done here + # swap old with new modules + unet.down_blocks[block_no].resnets[0].norm1 = GroupNorm(**norm_kwargs) + unet.down_blocks[block_no].resnets[0].conv1 = LoRACompatibleConv(**conv1_kwargs) + if old_conv_shortcut is not None: unet.down_blocks[block_no].resnets[0].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) + def zero_module(module): for p in module.parameters(): nn.init.zeros_(p) From 535647811372ff472d5b7409fc389eda69453600 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 9 Oct 2023 12:58:02 +0200 Subject: [PATCH 05/88] check-in 231009T1200 --- .../models/unet_2d_condition_control.py | 78 ++++++++++++++++--- 1 file changed, 67 insertions(+), 11 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/unet_2d_condition_control.py index 6de9b4a7214a..15d13c3a4125 100644 --- a/src/diffusers/models/unet_2d_condition_control.py +++ b/src/diffusers/models/unet_2d_condition_control.py @@ -143,10 +143,15 @@ def __init__( # TODO 1. create base model, or 2. pass it self.base_model = base_model = UNet2DConditionModel() # TODO create control model - self.control_model = ctrl_model = UNet2DConditionModel(block_out_channels=[32,64,128,128]) # todo: make variable - for i, base_channels in enumerate(block_out_channels[:-1]): - increase_block_input(self.control_model, block_no=i+1, by=base_channels) - + self.control_model = ctrl_model = UNet2DConditionModel( + block_out_channels=[32,64,128], + down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","DownBlock2D",), + up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + ) # todo: make variable + for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))[:-1]): # todo: make variable (sth like block_out_channels[:-1]) + e1,e2=extra_channels + increase_block_input_in_resnet(self.control_model, block_no=i+1, resnet_idx=0, by=e1) + increase_block_input_in_resnet(self.control_model, block_no=i+1, resnet_idx=1, by=e2) # 3 - Gather Channel Sizes ch_inout_ctrl = {'enc': [], 'mid': [], 'dec': []} @@ -301,10 +306,13 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch # 0 - conv in h_base = self.base_model.conv_in(h_base) h_ctrl = self.control_model.conv_in(h_ctrl) - if guided_hint is not None: h_ctrl = h_ctrl + guided_hint guided_hint = None + hs_base.append(h_base) + hs_ctrl.append(h_ctrl) + h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) + # 1 - input blocks (encoder) for module_base, module_ctrl in zip(self.base_model.down_blocks, self.control_model.down_blocks): h_base = module_base(h_base, temb, cemb, context)[0] # Note Umer: module_base returns hidden_states and running output list @@ -334,10 +342,10 @@ def make_zero_conv(self, in_channels, out_channels=None): return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0)) -def increase_block_input(unet, block_no, by): - """Double the channels size in a unet down block""" +def increase_block_input_in_resnet(unet, block_no, resnet_idx, by): + """Increase channels sizes to allow for additional concatted information from base model""" assert block_no!=0, "Only after block 0 do we have info to pass from base to control, so you probably didn't mean block_no=0." 
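# Why the widening is needed (sketch; channel numbers taken from the hard-coded configs above): in the
# forward pass the control stream receives the base stream's features by channel concatenation before
# every encoder sub-block, roughly
#     h_ctrl = torch.cat([h_ctrl, zero_conv_in(h_base)], dim=1)
# so a control resnet that would normally see 32 channels has to accept 32 + 320 channels once the
# corresponding base block outputs 320 channels. That extra width is what the `by` argument adds.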
- r=unet.down_blocks[block_no].resnets[0] + r=unet.down_blocks[block_no].resnets[resnet_idx] old_norm1, old_conv1, old_conv_shortcut = r.norm1,r.conv1,r.conv_shortcut # norm norm_args = 'num_groups num_channels eps affine'.split(' ') @@ -358,11 +366,59 @@ def increase_block_input(unet, block_no, by): conv_shortcut_args_kwargs['bias'] = 'bias' in conv_shortcut_args_kwargs # as param, bias is a boolean, but as attr, it's a tensor. conv_shortcut_args_kwargs['in_channels'] += by # surgery done here # swap old with new modules - unet.down_blocks[block_no].resnets[0].norm1 = GroupNorm(**norm_kwargs) - unet.down_blocks[block_no].resnets[0].conv1 = LoRACompatibleConv(**conv1_kwargs) - if old_conv_shortcut is not None: unet.down_blocks[block_no].resnets[0].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].norm1 = GroupNorm(**norm_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].conv1 = LoRACompatibleConv(**conv1_kwargs) + if old_conv_shortcut is not None: unet.down_blocks[block_no].resnets[resnet_idx].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].in_channels += by # surgery done here def zero_module(module): for p in module.parameters(): nn.init.zeros_(p) return module + + +# util functions, do delete laters +def gether_channel_sizes(m, m_type): + if m_type == 'base': + ch_inout_base = {'enc': [], 'mid': [], 'dec': []} + # 3.1 - input convolution + ch_inout_base['enc'].append((m.conv_in.in_channels, m.conv_in.out_channels)) + # 3.2 - encoder blocks + for module in m.down_blocks: + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + for r in module.resnets: + ch_inout_base['enc'].append((r.in_channels, r.out_channels)) + if module.downsamplers: + ch_inout_base['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) + else: + raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + # 3.3 - middle block + ch_inout_base['mid'].append((m.mid_block.resnets[0].in_channels, m.mid_block.resnets[0].in_channels)) + # 3.4 - decoder blocks + for module in m.up_blocks: + if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)): + for r in module.resnets: + ch_inout_base['dec'].append((r.in_channels, r.out_channels)) + else: + raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + return ch_inout_base + elif m_type == 'control': + ch_inout_ctrl = {'enc': [], 'mid': [], 'dec': []} + # 3.1 - input convolution + ch_inout_ctrl['enc'].append((m.conv_in.in_channels, m.conv_in.out_channels)) + # 3.2 - encoder blocks + for module in m.down_blocks: + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + for r in module.resnets: + ch_inout_ctrl['enc'].append((r.in_channels, r.out_channels)) + if module.downsamplers: + ch_inout_ctrl['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) + else: + raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + # 3.3 - middle block + ch_inout_ctrl['mid'].append((m.mid_block.resnets[0].in_channels, m.mid_block.resnets[0].in_channels)) + return ch_inout_ctrl + else: raise ValueError(f'model_type must be `base` or `control`, not `{m_type}`') + +def print_channels(ch_szs): + for k,v in ch_szs.items(): print(k,v) From b30120c777b9642cfc4c9d63f28fdf64a73f5224 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> 
Date: Mon, 9 Oct 2023 18:15:15 +0200 Subject: [PATCH 06/88] check-in 230109 --- .../models/unet_2d_condition_control.py | 88 +++++++++++++++---- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/unet_2d_condition_control.py index 15d13c3a4125..1346441f2488 100644 --- a/src/diffusers/models/unet_2d_condition_control.py +++ b/src/diffusers/models/unet_2d_condition_control.py @@ -75,7 +75,7 @@ def __init__( hint_channels, num_res_blocks, attention_resolutions, - block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + block_out_channels: Tuple[int] = (320, 640, 1280, 1280),#note umer: not used everywhere by me. fix later. act_fn: str = "silu", time_embedding_type: str = "positional", time_embedding_dim: Optional[int] = None, @@ -141,17 +141,24 @@ def __init__( # 2 - Create base and control model # TODO 1. create base model, or 2. pass it - self.base_model = base_model = UNet2DConditionModel() + self.base_model = base_model = UNet2DConditionModel(#todo make variable + block_out_channels=(320, 640, 1280), + down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","DownBlock2D"), + up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + ) # TODO create control model - self.control_model = ctrl_model = UNet2DConditionModel( + self.control_model = ctrl_model = UNet2DConditionModel(#todo make variable block_out_channels=[32,64,128], - down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","DownBlock2D",), + down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","DownBlock2D"), up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + time_embedding_dim=1280 ) # todo: make variable - for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))[:-1]): # todo: make variable (sth like block_out_channels[:-1]) + for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))): # todo: make variable (sth like zip(block_out_channels[:-1],block_out_channels[1:])) e1,e2=extra_channels - increase_block_input_in_resnet(self.control_model, block_no=i+1, resnet_idx=0, by=e1) - increase_block_input_in_resnet(self.control_model, block_no=i+1, resnet_idx=1, by=e2) + increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=0, by=e1) + increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=1, by=e2) + if self.control_model.down_blocks[i].downsamplers: increase_block_input_in_encoder_downsampler(self.control_model, block_no=i, by=e2) + increase_block_input_in_mid_resnet(self.control_model, by=1280) # todo: make var # 3 - Gather Channel Sizes ch_inout_ctrl = {'enc': [], 'mid': [], 'dec': []} @@ -181,8 +188,8 @@ def __init__( raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') # 3.3 - middle block - ch_inout_ctrl['mid'].append((ctrl_model.mid_block.resnets[0].in_channels, ctrl_model.mid_block.resnets[0].in_channels)) - ch_inout_base['mid'].append((base_model.mid_block.resnets[0].in_channels, base_model.mid_block.resnets[0].in_channels)) + ch_inout_ctrl['mid'].append((ctrl_model.mid_block.resnets[0].in_channels, ctrl_model.mid_block.resnets[0].out_channels)) + ch_inout_base['mid'].append((base_model.mid_block.resnets[0].in_channels, base_model.mid_block.resnets[0].out_channels)) # 3.4 - decoder blocks for module in base_model.up_blocks: @@ -342,9 +349,8 @@ def make_zero_conv(self, in_channels, out_channels=None): return zero_module(nn.Conv2d(in_channels, 
out_channels, 1, padding=0)) -def increase_block_input_in_resnet(unet, block_no, resnet_idx, by): +def increase_block_input_in_encoder_resnet(unet, block_no, resnet_idx, by): """Increase channels sizes to allow for additional concatted information from base model""" - assert block_no!=0, "Only after block 0 do we have info to pass from base to control, so you probably didn't mean block_no=0." r=unet.down_blocks[block_no].resnets[resnet_idx] old_norm1, old_conv1, old_conv_shortcut = r.norm1,r.conv1,r.conv_shortcut # norm @@ -359,6 +365,54 @@ def increase_block_input_in_resnet(unet, block_no, resnet_idx, by): conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. conv1_kwargs['in_channels'] += by # surgery done here # conv_shortcut + # as we changed the input size of the block, the input and output sizes are likely different, + # therefore we need a conv_shortcut (simply adding won't work) + conv_shortcut_args_kwargs = { + 'in_channels': conv1_kwargs['in_channels'], + 'out_channels': conv1_kwargs['out_channels'], + # default arguments from resnet.__init__ + 'kernel_size':1, + 'stride':1, + 'padding':0, + 'bias':True + } + # swap old with new modules + unet.down_blocks[block_no].resnets[resnet_idx].norm1 = GroupNorm(**norm_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].conv1 = LoRACompatibleConv(**conv1_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].in_channels += by # surgery done here + + +def increase_block_input_in_encoder_downsampler(unet, block_no, by): + """Increase channels sizes to allow for additional concatted information from base model""" + old_down=unet.down_blocks[block_no].downsamplers[0].conv + # conv1 + args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') + for a in args: assert hasattr(old_down, a) + kwargs = { a: getattr(old_down, a) for a in args} + kwargs['bias'] = 'bias' in kwargs # as param, bias is a boolean, but as attr, it's a tensor. + kwargs['in_channels'] += by # surgery done here + # swap old with new modules + unet.down_blocks[block_no].downsamplers[0].conv = LoRACompatibleConv(**kwargs) + unet.down_blocks[block_no].downsamplers[0].channels += by # surgery done here + + +def increase_block_input_in_mid_resnet(unet, by): + """Increase channels sizes to allow for additional concatted information from base model""" + m=unet.mid_block.resnets[0] + old_norm1, old_conv1, old_conv_shortcut = m.norm1,m.conv1,m.conv_shortcut + # norm + norm_args = 'num_groups num_channels eps affine'.split(' ') + for a in norm_args: assert hasattr(old_norm1, a) + norm_kwargs = { a: getattr(old_norm1, a) for a in norm_args } + norm_kwargs['num_channels'] += by # surgery done here + # conv1 + conv1_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') + for a in conv1_args: assert hasattr(old_conv1, a) + conv1_kwargs = { a: getattr(old_conv1, a) for a in conv1_args } + conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. 
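# Background for the conv_shortcut handling below (rough sketch of diffusers' ResnetBlock2D):
#     out = shortcut(x) + conv2(act(norm2(conv1(act(norm1(x))))))
# Once in_channels has been widened, the input and output channel counts differ, so the identity
# shortcut can no longer be added element-wise; a 1x1 projection with matching channels is required,
# which is why the encoder version above now always builds a conv_shortcut.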
+ conv1_kwargs['in_channels'] += by # surgery done here + # conv_shortcut if old_conv_shortcut is not None: conv_shortcut_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') for a in conv_shortcut_args: assert hasattr(old_conv_shortcut, a) @@ -366,10 +420,10 @@ def increase_block_input_in_resnet(unet, block_no, resnet_idx, by): conv_shortcut_args_kwargs['bias'] = 'bias' in conv_shortcut_args_kwargs # as param, bias is a boolean, but as attr, it's a tensor. conv_shortcut_args_kwargs['in_channels'] += by # surgery done here # swap old with new modules - unet.down_blocks[block_no].resnets[resnet_idx].norm1 = GroupNorm(**norm_kwargs) - unet.down_blocks[block_no].resnets[resnet_idx].conv1 = LoRACompatibleConv(**conv1_kwargs) - if old_conv_shortcut is not None: unet.down_blocks[block_no].resnets[resnet_idx].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) - unet.down_blocks[block_no].resnets[resnet_idx].in_channels += by # surgery done here + unet.mid_block.resnets[0].norm1 = GroupNorm(**norm_kwargs) + unet.mid_block.resnets[0].conv1 = LoRACompatibleConv(**conv1_kwargs) + unet.mid_block.resnets[0].in_channels += by # surgery done here + def zero_module(module): for p in module.parameters(): @@ -393,7 +447,7 @@ def gether_channel_sizes(m, m_type): else: raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') # 3.3 - middle block - ch_inout_base['mid'].append((m.mid_block.resnets[0].in_channels, m.mid_block.resnets[0].in_channels)) + ch_inout_base['mid'].append((m.mid_block.resnets[0].in_channels, m.mid_block.resnets[0].out_channels)) # 3.4 - decoder blocks for module in m.up_blocks: if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)): @@ -416,7 +470,7 @@ def gether_channel_sizes(m, m_type): else: raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') # 3.3 - middle block - ch_inout_ctrl['mid'].append((m.mid_block.resnets[0].in_channels, m.mid_block.resnets[0].in_channels)) + ch_inout_ctrl['mid'].append((m.mid_block.resnets[0].in_channels, m.mid_block.resnets[0].out_channels)) return ch_inout_ctrl else: raise ValueError(f'model_type must be `base` or `control`, not `{m_type}`') From 7b67ceb985e7c4a0577a6e26e160b7307d012f64 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 10 Oct 2023 16:51:38 +0200 Subject: [PATCH 07/88] checkin 231010 --- .../models/unet_2d_condition_control.py | 121 ++++++++---------- 1 file changed, 56 insertions(+), 65 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/unet_2d_condition_control.py index 1346441f2488..4c6c9ee6f73b 100644 --- a/src/diffusers/models/unet_2d_condition_control.py +++ b/src/diffusers/models/unet_2d_condition_control.py @@ -13,6 +13,7 @@ # limitations under the License. from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union +from itertools import chain, zip_longest import torch import torch.nn as nn @@ -37,6 +38,10 @@ DownBlock2D, CrossAttnUpBlock2D, UpBlock2D, + ResnetBlock2D, + Transformer2DModel, + Downsample2D, + Upsample2D ) from .unet_2d_condition import UNet2DConditionModel @@ -140,17 +145,15 @@ def __init__( self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) # 2 - Create base and control model - # TODO 1. create base model, or 2. 
pass it self.base_model = base_model = UNet2DConditionModel(#todo make variable block_out_channels=(320, 640, 1280), - down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D","UpBlock2D"), ) - # TODO create control model self.control_model = ctrl_model = UNet2DConditionModel(#todo make variable block_out_channels=[32,64,128], - down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D","UpBlock2D"), time_embedding_dim=1280 ) # todo: make variable for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))): # todo: make variable (sth like zip(block_out_channels[:-1],block_out_channels[1:])) @@ -309,6 +312,11 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch it_enc_convs_in, it_enc_convs_out, it_dec_convs_in, it_dec_convs_out = map(iter, (self.enc_zero_convs_in, self.enc_zero_convs_out, self.dec_zero_convs_in, self.dec_zero_convs_out)) scales = iter(self.scale_list) + base_down_block_parts = to_block_parts(self.base_model.down_blocks) + ctrl_down_block_parts = to_block_parts(self.control_model.down_blocks) + base_mid_block_parts = to_block_parts([self.base_model.mid_block]) + ctrl_mid_block_parts = to_block_parts([self.control_model.mid_block]) + # Cross Control # 0 - conv in h_base = self.base_model.conv_in(h_base) @@ -318,20 +326,23 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch guided_hint = None hs_base.append(h_base) hs_ctrl.append(h_ctrl) - h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) # 1 - input blocks (encoder) - for module_base, module_ctrl in zip(self.base_model.down_blocks, self.control_model.down_blocks): - h_base = module_base(h_base, temb, cemb, context)[0] # Note Umer: module_base returns hidden_states and running output list - h_ctrl = module_ctrl(h_ctrl, temb, cemb, context)[0] # see above - + for i, (m_base, m_ctrl) in enumerate(zip(base_down_block_parts, ctrl_down_block_parts)): + if isinstance(m_ctrl, (ResnetBlock2D, Downsample2D)): # only infuse info from base when passing tru a ResBlock or Downsample (not a Transformer) + conv_base2ctrl = next(it_enc_convs_in) + inp_base2ctrl = conv_base2ctrl(h_base) + h_ctrl = torch.cat([h_ctrl, inp_base2ctrl], dim=1) + h_base = apply_forward(m_base, h_base, temb, cemb, context) + h_ctrl = apply_forward(m_ctrl, h_ctrl, temb, cemb, context) hs_base.append(h_base) hs_ctrl.append(h_ctrl) - h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) # 2 - mid blocks (bottleneck) - h_base = self.base_model.mid_block(h_base, emb, context) - h_ctrl = self.control_model.mid_block(h_ctrl, emb, context) - h_base = h_base + self.middle_block_out(h_ctrl, emb) * next(scales) + h_ctrl = torch.concat([h_ctrl, h_base], dim=1) + for i, (m_base, m_ctrl) in enumerate(zip(base_mid_block_parts, ctrl_mid_block_parts)): + h_base = apply_forward(m_base, h_base, temb, cemb, context) + h_ctrl = apply_forward(m_ctrl, h_ctrl, temb, cemb, context) + h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # 3 - output blocks (decoder) for module_base in 
self.base_model.output_blocks: h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop(), emb) * next(scales) @@ -413,15 +424,21 @@ def increase_block_input_in_mid_resnet(unet, by): conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. conv1_kwargs['in_channels'] += by # surgery done here # conv_shortcut - if old_conv_shortcut is not None: - conv_shortcut_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') - for a in conv_shortcut_args: assert hasattr(old_conv_shortcut, a) - conv_shortcut_args_kwargs = { a: getattr(old_conv_shortcut, a) for a in conv_shortcut_args } - conv_shortcut_args_kwargs['bias'] = 'bias' in conv_shortcut_args_kwargs # as param, bias is a boolean, but as attr, it's a tensor. - conv_shortcut_args_kwargs['in_channels'] += by # surgery done here + # as we changed the input size of the block, the input and output sizes are likely different, + # therefore we need a conv_shortcut (simply adding won't work) + conv_shortcut_args_kwargs = { + 'in_channels': conv1_kwargs['in_channels'], + 'out_channels': conv1_kwargs['out_channels'], + # default arguments from resnet.__init__ + 'kernel_size':1, + 'stride':1, + 'padding':0, + 'bias':True + } # swap old with new modules unet.mid_block.resnets[0].norm1 = GroupNorm(**norm_kwargs) unet.mid_block.resnets[0].conv1 = LoRACompatibleConv(**conv1_kwargs) + unet.mid_block.resnets[0].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) unet.mid_block.resnets[0].in_channels += by # surgery done here @@ -431,48 +448,22 @@ def zero_module(module): return module -# util functions, do delete laters -def gether_channel_sizes(m, m_type): - if m_type == 'base': - ch_inout_base = {'enc': [], 'mid': [], 'dec': []} - # 3.1 - input convolution - ch_inout_base['enc'].append((m.conv_in.in_channels, m.conv_in.out_channels)) - # 3.2 - encoder blocks - for module in m.down_blocks: - if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): - for r in module.resnets: - ch_inout_base['enc'].append((r.in_channels, r.out_channels)) - if module.downsamplers: - ch_inout_base['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) - else: - raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') - # 3.3 - middle block - ch_inout_base['mid'].append((m.mid_block.resnets[0].in_channels, m.mid_block.resnets[0].out_channels)) - # 3.4 - decoder blocks - for module in m.up_blocks: - if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)): - for r in module.resnets: - ch_inout_base['dec'].append((r.in_channels, r.out_channels)) - else: - raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') - return ch_inout_base - elif m_type == 'control': - ch_inout_ctrl = {'enc': [], 'mid': [], 'dec': []} - # 3.1 - input convolution - ch_inout_ctrl['enc'].append((m.conv_in.in_channels, m.conv_in.out_channels)) - # 3.2 - encoder blocks - for module in m.down_blocks: - if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): - for r in module.resnets: - ch_inout_ctrl['enc'].append((r.in_channels, r.out_channels)) - if module.downsamplers: - ch_inout_ctrl['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) - else: - raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') - # 3.3 - middle block - ch_inout_ctrl['mid'].append((m.mid_block.resnets[0].in_channels, 
m.mid_block.resnets[0].out_channels)) - return ch_inout_ctrl - else: raise ValueError(f'model_type must be `base` or `control`, not `{m_type}`') +def block_parts(block): + modules = list(block.resnets) + if hasattr(block, 'attentions') and block.attentions is not None: modules = list(o for o in chain.from_iterable(zip_longest(modules, block.attentions, fillvalue=None)) if o is not None) + if hasattr(block, 'downsamplers') and block.downsamplers is not None: modules.extend(block.downsamplers) + if hasattr(block, 'upsamplers') and block.upsamplers is not None: modules.extend(block.upsamplers) + return modules + + +def to_block_parts(blocks): + '''eg: Down(Res, Res, Conv), CrossAttnDown(Res, Attn, Res, Attn, Conv) -> (Res, Res, Conv, Res, Attn, Res, Attn, Conv)''' + parts = [block_parts(b) for b in blocks] + return list(chain.from_iterable(parts)) -def print_channels(ch_szs): - for k,v in ch_szs.items(): print(k,v) +def apply_forward(m, x, temb, cemb, context): + if isinstance(m,ResnetBlock2D): return m(x, temb) + if isinstance(m,Transformer2DModel): return m(x, cemb).sample # Q: Include temp also? + if isinstance(m,Downsample2D): return m(x) + if isinstance(m,Upsample2D): return m(x) + raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D`, `Upsample2D`') From f4d8d629f75a105ce6d8928f90139092a10a3714 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Wed, 11 Oct 2023 15:13:22 +0200 Subject: [PATCH 08/88] init + forward run --- .../models/unet_2d_condition_control.py | 137 +++++++++--------- 1 file changed, 69 insertions(+), 68 deletions(-) diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/unet_2d_condition_control.py index 4c6c9ee6f73b..5280754e77af 100644 --- a/src/diffusers/models/unet_2d_condition_control.py +++ b/src/diffusers/models/unet_2d_condition_control.py @@ -49,27 +49,12 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# # # Notes Umer -# To integrate controlnet-xs, I need to -# 1. Create an ControlNet-xs class -# 2. Enable it to load from hub (via .from_pretrained) -# 3. Make sure it runs with all controlnet pipelines -# -# Notes & Questions -# I: Controlnet-xs has a slightly different architecture than controlnet, -# as the encoders of the base and the controller are connected. -# Q: Do I have to adjust all pipelines? -# -# Q: There are controlnet-xs models for sd-xl and sd-2.1. Does that mean I need to have multiple pipelines? -# A: Yes. For the original controlnet, there are 8 pipelines: {sd-xl, sd-2.1} x {normal, img2img, inpainting} + flax + multicontrolnet -# # # - - @dataclass class UNet2DConditionOutput(BaseOutput): sample: torch.FloatTensor = None +# Q: better name? 
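# Rough picture of what this class does, as implemented in the forward pass below (sketch):
# it wraps a full-size base UNet2DConditionModel together with a much smaller control copy and
# couples the two streams:
#   encoder:      base hidden states are passed through zero-initialized 1x1 convs and concatenated
#                 onto the control stream before every sub-block; the processed hint (e.g. canny
#                 edges) is added to the first of these connections.
#   mid/decoder:  control hidden states flow back into the base stream through zero convolutions
#                 (middle_block_out, dec_zero_convs_out) and are added, scaled by `scale_list`.
# Because those connecting convs start at zero, the combined model initially behaves like the plain
# base UNet.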
class ControlledUNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): def __init__( @@ -282,13 +267,8 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch # # < from forward_ if no_control: return self.base_model(x=x, timesteps=timesteps, context=context, y=y, **kwargs) - # # Warning for Umer: What I & cnxs call 'projection', diffusers calls 'embedding'; and vice versa - # Code from cnxs: - #t_emb = self.time_embedding(timesteps, self.model_channels, repeat_only=False) - #if self.learn_embedding: emb = self.control_model.time_embed(t_emb) * self.control_scale ** 0.3 + self.base_model.time_embed(t_emb) * (1 - control_scale ** 0.3) - #else: emb = self.base_model.time_embed(t_emb) - # time embeddings + timesteps = timesteps[None] t_emb = get_timestep_embedding( timesteps, self.model_channels, @@ -305,52 +285,47 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch emb = temb + cemb - guided_hint = self.input_hint_block(hint, emb, context) + guided_hint = self.input_hint_block(hint) h_ctrl = h_base = x hs_base, hs_ctrl = [], [] it_enc_convs_in, it_enc_convs_out, it_dec_convs_in, it_dec_convs_out = map(iter, (self.enc_zero_convs_in, self.enc_zero_convs_out, self.dec_zero_convs_in, self.dec_zero_convs_out)) scales = iter(self.scale_list) - base_down_block_parts = to_block_parts(self.base_model.down_blocks) - ctrl_down_block_parts = to_block_parts(self.control_model.down_blocks) - base_mid_block_parts = to_block_parts([self.base_model.mid_block]) - ctrl_mid_block_parts = to_block_parts([self.control_model.mid_block]) + base_down_subblocks = to_sub_blocks(self.base_model.down_blocks) + ctrl_down_subblocks = to_sub_blocks(self.control_model.down_blocks) + base_mid_subblocks = to_sub_blocks([self.base_model.mid_block]) + ctrl_mid_subblocks = to_sub_blocks([self.control_model.mid_block]) + base_up_subblocks = to_sub_blocks(self.base_model.up_blocks) # Cross Control # 0 - conv in h_base = self.base_model.conv_in(h_base) h_ctrl = self.control_model.conv_in(h_ctrl) - if guided_hint is not None: - h_ctrl = h_ctrl + guided_hint - guided_hint = None hs_base.append(h_base) hs_ctrl.append(h_ctrl) - # 1 - input blocks (encoder) - for i, (m_base, m_ctrl) in enumerate(zip(base_down_block_parts, ctrl_down_block_parts)): - if isinstance(m_ctrl, (ResnetBlock2D, Downsample2D)): # only infuse info from base when passing tru a ResBlock or Downsample (not a Transformer) - conv_base2ctrl = next(it_enc_convs_in) - inp_base2ctrl = conv_base2ctrl(h_base) - h_ctrl = torch.cat([h_ctrl, inp_base2ctrl], dim=1) - h_base = apply_forward(m_base, h_base, temb, cemb, context) - h_ctrl = apply_forward(m_ctrl, h_ctrl, temb, cemb, context) + for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): + inp_base2ctrl = next(it_enc_convs_in)(h_base) # get info from base encoder + if guided_hint is not None: # in first, add hint info if it exists + inp_base2ctrl += guided_hint + guided_hint = None + h_ctrl = torch.cat([h_ctrl, inp_base2ctrl], dim=1) + h_base = m_base(h_base, temb, cemb, context) + h_ctrl = m_ctrl(h_ctrl, temb, cemb, context) hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 2 - mid blocks (bottleneck) h_ctrl = torch.concat([h_ctrl, h_base], dim=1) - for i, (m_base, m_ctrl) in enumerate(zip(base_mid_block_parts, ctrl_mid_block_parts)): - h_base = apply_forward(m_base, h_base, temb, cemb, context) - h_ctrl = apply_forward(m_ctrl, h_ctrl, temb, cemb, context) - h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) + for 
m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): + h_base = m_base(h_base, temb, cemb, context) + h_ctrl = m_ctrl(h_ctrl, temb, cemb, context) # 3 - output blocks (decoder) - for module_base in self.base_model.output_blocks: - h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop(), emb) * next(scales) - h_base = torch.cat([h_base, hs_base.pop()], dim=1) - h_base = module_base(h_base, emb, context) - - return self.base_model.out(h_base) - # # /> + for m_base in base_up_subblocks: + h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder + h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder + h_base = m_base(h_base, temb, cemb, context) + return self.base_model.conv_out(h_base) def make_zero_conv(self, in_channels, out_channels=None): @@ -448,22 +423,48 @@ def zero_module(module): return module -def block_parts(block): - modules = list(block.resnets) - if hasattr(block, 'attentions') and block.attentions is not None: modules = list(o for o in chain.from_iterable(zip_longest(modules, block.attentions, fillvalue=None)) if o is not None) - if hasattr(block, 'downsamplers') and block.downsamplers is not None: modules.extend(block.downsamplers) - if hasattr(block, 'upsamplers') and block.upsamplers is not None: modules.extend(block.upsamplers) - return modules - - -def to_block_parts(blocks): - '''eg: Down(Res, Res, Conv), CrossAttnDown(Res, Attn, Res, Attn, Conv) -> (Res, Res, Conv, Res, Attn, Res, Attn, Conv)''' - parts = [block_parts(b) for b in blocks] - return list(chain.from_iterable(parts)) - -def apply_forward(m, x, temb, cemb, context): - if isinstance(m,ResnetBlock2D): return m(x, temb) - if isinstance(m,Transformer2DModel): return m(x, cemb).sample # Q: Include temp also? - if isinstance(m,Downsample2D): return m(x) - if isinstance(m,Upsample2D): return m(x) - raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D`, `Upsample2D`') +from diffusers.models.unet_2d_blocks import ResnetBlock2D, Transformer2DModel, Downsample2D, Upsample2D +class EmbedSequential(nn.ModuleList): + """Sequential module passing embeddings (time and conditioning) to children if they support it.""" + def __init__(self,ms,*args,**kwargs): + if not is_iterable(ms): ms = [ms] + super().__init__(ms,*args,**kwargs) + + def forward(self,x,temb,cemb,context): + for m in self: + if isinstance(m,ResnetBlock2D): x=m(x,temb) + elif isinstance(m,Transformer2DModel): x=m(x,cemb).sample # Q: Include temp also? + elif isinstance(m,Downsample2D): x=m(x) + elif isinstance(m,Upsample2D): x=m(x) + else: raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D`, `Upsample2D`') + return x + + +def is_iterable(o): + if isinstance(o, str): return False + try: + iter(o) + return True + except TypeError: + return False + + +def to_sub_blocks(blocks): + if not is_iterable(blocks): blocks = [blocks] + sub_blocks = [] + for b in blocks: + current_subblocks = [] + if hasattr(b, 'resnets'): + if hasattr(b, 'attentions') and b.attentions is not None: + current_subblocks = list(zip_longest(b.resnets, b.attentions)) + # if we have 1 more resnets than attentions, let the last subblock only be the resnet, not (resnet, None) + if current_subblocks[-1][1] is None: + current_subblocks[-1] = current_subblocks[-1][0] + else: + current_subblocks = list(b.resnets) + # upsamplers are part of the same block # q: what if we have multiple upsamplers? 
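        # (added illustration) e.g.:
        #   CrossAttnDownBlock2D(resnets=[R0, R1], attentions=[A0, A1], downsamplers=[D0])
        #       -> sub-blocks [(R0, A0), (R1, A1), [D0]]
        #   CrossAttnUpBlock2D(resnets=[R0, R1, R2], attentions=[A0, A1, A2], upsamplers=[U0])
        #       -> sub-blocks [(R0, A0), (R1, A1), [R2, A2, U0]]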
+ if hasattr(b, 'upsamplers') and b.upsamplers is not None: current_subblocks[-1] = list(current_subblocks[-1]) + list(b.upsamplers) + # downsamplers are own block + if hasattr(b, 'downsamplers') and b.downsamplers is not None: current_subblocks.append(list(b.downsamplers)) + sub_blocks += current_subblocks + return list(map(EmbedSequential, sub_blocks)) From 9c17549184ee9fc478a11b601b4db3b8dad130ad Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 16 Oct 2023 21:10:23 +0200 Subject: [PATCH 09/88] checkin --- ...d_condition_control.py => controlnetxs.py} | 173 ++++++++---------- src/diffusers/models/unet_2d_blocks.py | 13 +- src/diffusers/models/unet_2d_condition.py | 5 + 3 files changed, 92 insertions(+), 99 deletions(-) rename src/diffusers/models/{unet_2d_condition_control.py => controlnetxs.py} (83%) diff --git a/src/diffusers/models/unet_2d_condition_control.py b/src/diffusers/models/controlnetxs.py similarity index 83% rename from src/diffusers/models/unet_2d_condition_control.py rename to src/diffusers/models/controlnetxs.py index 5280754e77af..c9f7a5a7309c 100644 --- a/src/diffusers/models/unet_2d_condition_control.py +++ b/src/diffusers/models/controlnetxs.py @@ -12,27 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union -from itertools import chain, zip_longest +from typing import Optional, Union, Tuple -import torch -import torch.nn as nn -import torch.utils.checkpoint +from itertools import zip_longest +import torch +from torch import nn +from torch.nn import functional as F from torch.nn.modules.normalization import GroupNorm +import torch.utils.checkpoint from ..configuration_utils import ConfigMixin from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging - -from .embeddings import ( - GaussianFourierProjection, - TimestepEmbedding, - Timesteps, - get_timestep_embedding -) -from .lora import LoRACompatibleConv +from .embeddings import get_timestep_embedding from .modeling_utils import ModelMixin +from .lora import LoRACompatibleConv from .unet_2d_blocks import ( CrossAttnDownBlock2D, DownBlock2D, @@ -41,107 +36,91 @@ ResnetBlock2D, Transformer2DModel, Downsample2D, - Upsample2D + Upsample2D, ) from .unet_2d_condition import UNet2DConditionModel logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# todo Umer later: add attention_bias to relevant docs @dataclass class UNet2DConditionOutput(BaseOutput): sample: torch.FloatTensor = None -# Q: better name? 
-class ControlledUNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): +class ControlNetXSModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + """A ControlNet-XS model.""" + # to delete later + @classmethod + def create_as_in_paper(cls): + # todo: load sdxl instead + base_model = UNet2DConditionModel( + block_out_channels=(320, 640, 1280), + down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D","UpBlock2D"), + transformer_layers_per_block=(0,2,10), + cross_attention_dim=2048, + ) + return cls( + base_model, + model_channels=320, + out_channels=4, + hint_channels=3, + block_out_channels=(32,64,128), + transformer_layers_per_block=(0,2,10), + attention_bias=True, + cross_attention_dim=2048, + ) + def __init__( self, - in_channels, + base_model: UNet2DConditionModel, model_channels, out_channels, hint_channels, - num_res_blocks, - attention_resolutions, - block_out_channels: Tuple[int] = (320, 640, 1280, 1280),#note umer: not used everywhere by me. fix later. - act_fn: str = "silu", - time_embedding_type: str = "positional", - time_embedding_dim: Optional[int] = None, - time_embedding_act_fn: Optional[str] = None, - timestep_post_act: Optional[str] = None, - time_cond_proj_dim: Optional[int] = None, - flip_sin_to_cos: bool = True, - freq_shift: int = 0, + block_out_channels, + transformer_layers_per_block, + attention_bias=False, encoder_hid_dim: Optional[int] = 768, # Note Umer: should not be hard coded, but okay for minimal functional run - this comes from the text encoder output shape - cross_attention_dim: Union[int, Tuple[int]] = 1280, # Note Umer: should not be hard coded, but okay for minimal functional run - this from the unet shapes + cross_attention_dim: Union[int, Tuple[int]] = 1280, ): super().__init__() + self.base_model = base_model + # 1 - Save parameters # TODO make variables - self.control_mode = "canny" - self.learn_embedding = False - self.infusion2control = "cat" - self.infusion2base = "add" self.in_ch_factor = 1 if "cat" == 'add' else 2 - self.guiding = "encoder" - self.two_stream_mode = "cross" self.control_model_ratio = 1.0 self.out_channels = out_channels self.dims = 2 self.model_channels = model_channels - self.no_control = False self.control_scale = 1.0 - self.hint_model = None - - self.flip_sin_to_cos = flip_sin_to_cos - self.freq_shift = freq_shift - # Time embedding - if time_embedding_type == "fourier": - time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 - if time_embed_dim % 2 != 0: - raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") - self.time_proj = GaussianFourierProjection( - time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos - ) - timestep_input_dim = time_embed_dim - elif time_embedding_type == "positional": - time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 - - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) - timestep_input_dim = block_out_channels[0] - else: - raise ValueError( - f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." 
- ) - - self.time_embedding = TimestepEmbedding( - timestep_input_dim, - time_embed_dim, - act_fn=act_fn, - post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, + # 1 - Create controller + def class_names(modules): + return [m.__class__.__name__ for m in modules] + + def get_time_emd_dim(unet: UNet2DConditionModel): + return unet.time_embedding.linear_2.out_features + + self.control_model = ctrl_model = UNet2DConditionModel( + block_out_channels=block_out_channels, + down_block_types=class_names(base_model.down_blocks), + up_block_types=class_names(base_model.up_blocks), + time_embedding_dim=get_time_emd_dim(base_model), + transformer_layers_per_block=transformer_layers_per_block, + attention_bias=attention_bias, + cross_attention_dim=cross_attention_dim, ) - # Text embedding - self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) - # 2 - Create base and control model - self.base_model = base_model = UNet2DConditionModel(#todo make variable - block_out_channels=(320, 640, 1280), - down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D","UpBlock2D"), - ) - self.control_model = ctrl_model = UNet2DConditionModel(#todo make variable - block_out_channels=[32,64,128], - down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D","UpBlock2D"), - time_embedding_dim=1280 - ) # todo: make variable - for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))): # todo: make variable (sth like zip(block_out_channels[:-1],block_out_channels[1:])) + # 2 - Adapt controller to allow for information infusion from base model + # todo: make variable (sth like zip(block_out_channels[:-1],block_out_channels[1:])) + for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))): e1,e2=extra_channels increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=0, by=e1) increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=1, by=e2) @@ -215,7 +194,7 @@ def __init__( self.make_zero_conv(ch_inout_ctrl['enc'][-(i + 1)][1], ch_inout_base['dec'][i - 1][1]) ) - # 5 - Input hint block TODO: Understand + # 5 - Create conditioning hint embedding self.input_hint_block = nn.Sequential( nn.Conv2d(hint_channels, 16, 3, padding=1), nn.SiLU(), @@ -234,10 +213,18 @@ def __init__( zero_module(nn.Conv2d(256, int(model_channels * self.control_model_ratio), 3, padding=1)) ) + # 6 - Create time embedding + pass + self.flip_sin_to_cos = True # default params + self.freq_shift = 0 + # Todo: Only when `learn_embedding = False` can we just use the base model's time embedding, otherwise we need to create our own + # Text embedding + # todo: I thinks we might not need this, because we can use the base model's encoder_hid_proj. todo: verify + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + scale_list = [1.] * len(self.enc_zero_convs_out) + [1.] + [1.] 
* len(self.dec_zero_convs_out) self.register_buffer('scale_list', torch.tensor(scale_list)) - def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch.Tensor, c: dict, hint: torch.Tensor, no_control=False, **kwargs): """ Params from unet_2d_condition.UNet2DConditionModel.forward: # self, @@ -255,16 +242,13 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch # return_dict: bool = True, """ - # # < from forward x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1) if x.size(0) // 2 == hint.size(0): hint = torch.cat([hint, hint], dim=0) # for classifier free guidance timesteps=t context=c.get("crossattn", None) y=c.get("vector", None) - # # /> - # # < from forward_ if no_control: return self.base_model(x=x, timesteps=timesteps, context=context, y=y, **kwargs) # time embeddings @@ -276,15 +260,10 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.freq_shift, ) - # self.learn_embedding == False temb = self.base_model.time_embedding(t_emb) - - if y is not None: emb = emb + self.base_model.label_emb(y) # ?? - sth with class-conditioning # text embeddings cemb = self.encoder_hid_proj(encoder_hidden_states) # Q: use the base/ctrl models' encoder_hid_proj? Need to make sure dims fit - emb = temb + cemb - guided_hint = self.input_hint_block(hint) h_ctrl = h_base = x @@ -327,7 +306,6 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch h_base = m_base(h_base, temb, cemb, context) return self.base_model.conv_out(h_base) - def make_zero_conv(self, in_channels, out_channels=None): # keep running track # todo: better comment self.in_channels = in_channels @@ -417,13 +395,6 @@ def increase_block_input_in_mid_resnet(unet, by): unet.mid_block.resnets[0].in_channels += by # surgery done here -def zero_module(module): - for p in module.parameters(): - nn.init.zeros_(p) - return module - - -from diffusers.models.unet_2d_blocks import ResnetBlock2D, Transformer2DModel, Downsample2D, Upsample2D class EmbedSequential(nn.ModuleList): """Sequential module passing embeddings (time and conditioning) to children if they support it.""" def __init__(self,ms,*args,**kwargs): @@ -468,3 +439,9 @@ def to_sub_blocks(blocks): if hasattr(b, 'downsamplers') and b.downsamplers is not None: current_subblocks.append(list(b.downsamplers)) sub_blocks += current_subblocks return list(map(EmbedSequential, sub_blocks)) + + +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index d6066e92b7ef..a6c5214b9c96 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -29,7 +29,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name - +# ToDo Umer: check if attention_bias should be passed to other block types def get_down_block( down_block_type, num_layers, @@ -50,6 +50,7 @@ def get_down_block( upcast_attention=False, resnet_time_scale_shift="default", attention_type="default", + attention_bias=False, resnet_skip_time_act=False, resnet_out_scale_factor=1.0, cross_attention_norm=None, @@ -136,6 +137,7 @@ def get_down_block( upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, attention_type=attention_type, + attention_bias=attention_bias, ) elif down_block_type == "SimpleCrossAttnDownBlock2D": if cross_attention_dim is 
None: @@ -259,6 +261,7 @@ def get_up_block( upcast_attention=False, resnet_time_scale_shift="default", attention_type="default", + attention_bias=False, resnet_skip_time_act=False, resnet_out_scale_factor=1.0, cross_attention_norm=None, @@ -305,6 +308,7 @@ def get_up_block( output_scale_factor=resnet_out_scale_factor, ) elif up_block_type == "CrossAttnUpBlock2D": + # todo umer: check if attention_bias required for typey other than CrossAttnUpBlock2D if cross_attention_dim is None: raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") return CrossAttnUpBlock2D( @@ -327,6 +331,7 @@ def get_up_block( upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, attention_type=attention_type, + attention_bias=attention_bias, ) elif up_block_type == "SimpleCrossAttnUpBlock2D": if cross_attention_dim is None: @@ -585,6 +590,7 @@ def __init__( use_linear_projection=False, upcast_attention=False, attention_type="default", + attention_bias=False, ): super().__init__() @@ -622,6 +628,7 @@ def __init__( use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, attention_type=attention_type, + attention_bias=attention_bias, ) ) else: @@ -972,6 +979,7 @@ def __init__( only_cross_attention=False, upcast_attention=False, attention_type="default", + attention_bias=False, ): super().__init__() resnets = [] @@ -1009,6 +1017,7 @@ def __init__( only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, attention_type=attention_type, + attention_bias=attention_bias, ) ) else: @@ -2116,6 +2125,7 @@ def __init__( only_cross_attention=False, upcast_attention=False, attention_type="default", + attention_bias=False, ): super().__init__() resnets = [] @@ -2155,6 +2165,7 @@ def __init__( only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, attention_type=attention_type, + attention_bias=attention_bias, ) ) else: diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index d695d182fa37..27c83ddb31a0 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -208,6 +208,7 @@ def __init__( conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, attention_type: str = "default", + attention_bias: bool = False, class_embeddings_concat: bool = False, mid_block_only_cross_attention: Optional[bool] = None, cross_attention_norm: Optional[str] = None, @@ -457,6 +458,7 @@ def __init__( upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, attention_type=attention_type, + attention_bias=attention_bias, resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, @@ -466,6 +468,7 @@ def __init__( self.down_blocks.append(down_block) # mid + # todo umer: check if attention_bias also needed for types other than UNetMidBlock2DCrossAttn if mid_block_type == "UNetMidBlock2DCrossAttn": self.mid_block = UNetMidBlock2DCrossAttn( transformer_layers_per_block=transformer_layers_per_block[-1], @@ -483,6 +486,7 @@ def __init__( use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, attention_type=attention_type, + attention_bias=attention_bias, ) elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": self.mid_block = UNetMidBlock2DSimpleCrossAttn( @@ -551,6 +555,7 @@ def __init__( upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, 
attention_type=attention_type, + attention_bias=attention_bias, resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, From cc40b55be2a209d203705b61cf523fc6d49ca189 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Wed, 18 Oct 2023 16:02:51 +0200 Subject: [PATCH 10/88] checkin --- src/diffusers/models/controlnetxs.py | 207 ++++++++++++---------- src/diffusers/models/unet_2d_blocks.py | 12 -- src/diffusers/models/unet_2d_condition.py | 7 +- 3 files changed, 116 insertions(+), 110 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index c9f7a5a7309c..7cd878ebbbba 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -71,8 +71,9 @@ def create_as_in_paper(cls): hint_channels=3, block_out_channels=(32,64,128), transformer_layers_per_block=(0,2,10), - attention_bias=True, cross_attention_dim=2048, + learn_embedding=True, + control_model_ratio=0.1, ) def __init__( @@ -83,9 +84,9 @@ def __init__( hint_channels, block_out_channels, transformer_layers_per_block, - attention_bias=False, - encoder_hid_dim: Optional[int] = 768, # Note Umer: should not be hard coded, but okay for minimal functional run - this comes from the text encoder output shape cross_attention_dim: Union[int, Tuple[int]] = 1280, + learn_embedding=False, + control_model_ratio=1.0, ): super().__init__() @@ -94,13 +95,15 @@ def __init__( # 1 - Save parameters # TODO make variables self.in_ch_factor = 1 if "cat" == 'add' else 2 - self.control_model_ratio = 1.0 + self.control_model_ratio = control_model_ratio self.out_channels = out_channels self.dims = 2 self.model_channels = model_channels self.control_scale = 1.0 self.hint_model = None + self.learn_embedding = learn_embedding + # 1 - Create controller def class_names(modules): return [m.__class__.__name__ for m in modules] @@ -114,11 +117,15 @@ def get_time_emd_dim(unet: UNet2DConditionModel): up_block_types=class_names(base_model.up_blocks), time_embedding_dim=get_time_emd_dim(base_model), transformer_layers_per_block=transformer_layers_per_block, - attention_bias=attention_bias, cross_attention_dim=cross_attention_dim, ) - # 2 - Adapt controller to allow for information infusion from base model + # 2 - Do model surgery on control model + # 2.1 - Allow to use the same time information as the base model + def get_time_emd_input_dim(unet: UNet2DConditionModel): + return unet.time_embedding.linear_1.in_features + adjust_time_input_dim(self.control_model, get_time_emd_input_dim(base_model)) + # 2.2 - Allow for information infusion from base model # todo: make variable (sth like zip(block_out_channels[:-1],block_out_channels[1:])) for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))): e1,e2=extra_channels @@ -183,7 +190,11 @@ def get_time_emd_dim(unet: UNet2DConditionModel): self.enc_zero_convs_in.append(self.make_zero_conv( in_channels=ch_io_base[1], out_channels=ch_io_base[1]) ) - + for i in range(len(ch_inout_ctrl['enc'])): + self.enc_zero_convs_out.append( + self.make_zero_conv(ch_inout_ctrl['enc'][i][1], ch_inout_base['enc'][i][1]) + ) + self.middle_block_out = self.make_zero_conv(ch_inout_ctrl['mid'][-1][1], ch_inout_base['mid'][-1][1]) self.dec_zero_convs_out.append( @@ -193,7 +204,8 @@ def get_time_emd_dim(unet: UNet2DConditionModel): self.dec_zero_convs_out.append( self.make_zero_conv(ch_inout_ctrl['enc'][-(i + 1)][1], ch_inout_base['dec'][i - 1][1]) ) - 
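        # (added summary) all connections built above are zero-initialized 1x1 convs (`make_zero_conv`):
        #   enc_zero_convs_in : base encoder -> control encoder, channels unchanged, output concatenated onto h_ctrl
        #   enc_zero_convs_out: control encoder -> base encoder channels, output added to h_base (scaled)
        #   middle_block_out / dec_zero_convs_out: control stream -> base mid/decoder, output added to h_base (scaled)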
+ + # 5 - Create conditioning hint embedding self.input_hint_block = nn.Sequential( nn.Conv2d(hint_channels, 16, 3, padding=1), @@ -218,9 +230,9 @@ def get_time_emd_dim(unet: UNet2DConditionModel): self.flip_sin_to_cos = True # default params self.freq_shift = 0 # Todo: Only when `learn_embedding = False` can we just use the base model's time embedding, otherwise we need to create our own + # Text embedding - # todo: I thinks we might not need this, because we can use the base model's encoder_hid_proj. todo: verify - self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + # info: I deleted the encoder_hid_proj as it's not given by the Heidelberg CVL weights scale_list = [1.] * len(self.enc_zero_convs_out) + [1.] + [1.] * len(self.dec_zero_convs_out) self.register_buffer('scale_list', torch.tensor(scale_list)) @@ -259,10 +271,14 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch # # TODO: Undetrstand flip_sin_to_cos / (downscale_)freq_shift flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.freq_shift, - ) - temb = self.base_model.time_embedding(t_emb) + ) + if self.learn_embedding: + temb = self.control_model.time_embedding(t_emb) * self.control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.control_scale ** 0.3) + else: + temb = self.base_model.time_embedding(t_emb) + # text embeddings - cemb = self.encoder_hid_proj(encoder_hidden_states) # Q: use the base/ctrl models' encoder_hid_proj? Need to make sure dims fit + cemb = encoder_hidden_states guided_hint = self.input_hint_block(hint) @@ -292,6 +308,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch h_ctrl = torch.cat([h_ctrl, inp_base2ctrl], dim=1) h_base = m_base(h_base, temb, cemb, context) h_ctrl = m_ctrl(h_ctrl, temb, cemb, context) + h_base = h_base + next(it_enc_convs_out)(h_ctrl, temb, cemb) * next(scales) hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 2 - mid blocks (bottleneck) @@ -313,86 +330,90 @@ def make_zero_conv(self, in_channels, out_channels=None): return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0)) -def increase_block_input_in_encoder_resnet(unet, block_no, resnet_idx, by): - """Increase channels sizes to allow for additional concatted information from base model""" - r=unet.down_blocks[block_no].resnets[resnet_idx] - old_norm1, old_conv1, old_conv_shortcut = r.norm1,r.conv1,r.conv_shortcut - # norm - norm_args = 'num_groups num_channels eps affine'.split(' ') - for a in norm_args: assert hasattr(old_norm1, a) - norm_kwargs = { a: getattr(old_norm1, a) for a in norm_args } - norm_kwargs['num_channels'] += by # surgery done here - # conv1 - conv1_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') - for a in conv1_args: assert hasattr(old_conv1, a) - conv1_kwargs = { a: getattr(old_conv1, a) for a in conv1_args } - conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. 
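    # (added review note) `'bias' in conv1_kwargs` is always True here, since 'bias' is one of the gathered
    # attributes; `old_conv1.bias is not None` is probably what is meant. The same pattern appears in the
    # downsampler and mid-block helpers below.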
- conv1_kwargs['in_channels'] += by # surgery done here - # conv_shortcut - # as we changed the input size of the block, the input and output sizes are likely different, - # therefore we need a conv_shortcut (simply adding won't work) - conv_shortcut_args_kwargs = { - 'in_channels': conv1_kwargs['in_channels'], - 'out_channels': conv1_kwargs['out_channels'], - # default arguments from resnet.__init__ - 'kernel_size':1, - 'stride':1, - 'padding':0, - 'bias':True - } - # swap old with new modules - unet.down_blocks[block_no].resnets[resnet_idx].norm1 = GroupNorm(**norm_kwargs) - unet.down_blocks[block_no].resnets[resnet_idx].conv1 = LoRACompatibleConv(**conv1_kwargs) - unet.down_blocks[block_no].resnets[resnet_idx].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) - unet.down_blocks[block_no].resnets[resnet_idx].in_channels += by # surgery done here - - -def increase_block_input_in_encoder_downsampler(unet, block_no, by): - """Increase channels sizes to allow for additional concatted information from base model""" - old_down=unet.down_blocks[block_no].downsamplers[0].conv - # conv1 - args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') - for a in args: assert hasattr(old_down, a) - kwargs = { a: getattr(old_down, a) for a in args} - kwargs['bias'] = 'bias' in kwargs # as param, bias is a boolean, but as attr, it's a tensor. - kwargs['in_channels'] += by # surgery done here - # swap old with new modules - unet.down_blocks[block_no].downsamplers[0].conv = LoRACompatibleConv(**kwargs) - unet.down_blocks[block_no].downsamplers[0].channels += by # surgery done here - - -def increase_block_input_in_mid_resnet(unet, by): - """Increase channels sizes to allow for additional concatted information from base model""" - m=unet.mid_block.resnets[0] - old_norm1, old_conv1, old_conv_shortcut = m.norm1,m.conv1,m.conv_shortcut - # norm - norm_args = 'num_groups num_channels eps affine'.split(' ') - for a in norm_args: assert hasattr(old_norm1, a) - norm_kwargs = { a: getattr(old_norm1, a) for a in norm_args } - norm_kwargs['num_channels'] += by # surgery done here - # conv1 - conv1_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') - for a in conv1_args: assert hasattr(old_conv1, a) - conv1_kwargs = { a: getattr(old_conv1, a) for a in conv1_args } - conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. 
- conv1_kwargs['in_channels'] += by # surgery done here - # conv_shortcut - # as we changed the input size of the block, the input and output sizes are likely different, - # therefore we need a conv_shortcut (simply adding won't work) - conv_shortcut_args_kwargs = { - 'in_channels': conv1_kwargs['in_channels'], - 'out_channels': conv1_kwargs['out_channels'], - # default arguments from resnet.__init__ - 'kernel_size':1, - 'stride':1, - 'padding':0, - 'bias':True - } - # swap old with new modules - unet.mid_block.resnets[0].norm1 = GroupNorm(**norm_kwargs) - unet.mid_block.resnets[0].conv1 = LoRACompatibleConv(**conv1_kwargs) - unet.mid_block.resnets[0].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) - unet.mid_block.resnets[0].in_channels += by # surgery done here +def adjust_time_input_dim(unet: UNet2DConditionModel, dim: int): + time_emb = unet.time_embedding + time_emb.linear_1 = nn.Linear(dim, time_emb.linear_1.out_features) + +def increase_block_input_in_encoder_resnet(unet:UNet2DConditionModel, block_no, resnet_idx, by): + """Increase channels sizes to allow for additional concatted information from base model""" + r=unet.down_blocks[block_no].resnets[resnet_idx] + old_norm1, old_conv1, old_conv_shortcut = r.norm1,r.conv1,r.conv_shortcut + # norm + norm_args = 'num_groups num_channels eps affine'.split(' ') + for a in norm_args: assert hasattr(old_norm1, a) + norm_kwargs = { a: getattr(old_norm1, a) for a in norm_args } + norm_kwargs['num_channels'] += by # surgery done here + # conv1 + conv1_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') + for a in conv1_args: assert hasattr(old_conv1, a) + conv1_kwargs = { a: getattr(old_conv1, a) for a in conv1_args } + conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. + conv1_kwargs['in_channels'] += by # surgery done here + # conv_shortcut + # as we changed the input size of the block, the input and output sizes are likely different, + # therefore we need a conv_shortcut (simply adding won't work) + conv_shortcut_args_kwargs = { + 'in_channels': conv1_kwargs['in_channels'], + 'out_channels': conv1_kwargs['out_channels'], + # default arguments from resnet.__init__ + 'kernel_size':1, + 'stride':1, + 'padding':0, + 'bias':True + } + # swap old with new modules + unet.down_blocks[block_no].resnets[resnet_idx].norm1 = GroupNorm(**norm_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].conv1 = LoRACompatibleConv(**conv1_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) + unet.down_blocks[block_no].resnets[resnet_idx].in_channels += by # surgery done here + + +def increase_block_input_in_encoder_downsampler(unet:UNet2DConditionModel, block_no, by): + """Increase channels sizes to allow for additional concatted information from base model""" + old_down=unet.down_blocks[block_no].downsamplers[0].conv + # conv1 + args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') + for a in args: assert hasattr(old_down, a) + kwargs = { a: getattr(old_down, a) for a in args} + kwargs['bias'] = 'bias' in kwargs # as param, bias is a boolean, but as attr, it's a tensor. 
+ kwargs['in_channels'] += by # surgery done here + # swap old with new modules + unet.down_blocks[block_no].downsamplers[0].conv = LoRACompatibleConv(**kwargs) + unet.down_blocks[block_no].downsamplers[0].channels += by # surgery done here + + +def increase_block_input_in_mid_resnet(unet:UNet2DConditionModel, by): + """Increase channels sizes to allow for additional concatted information from base model""" + m=unet.mid_block.resnets[0] + old_norm1, old_conv1, old_conv_shortcut = m.norm1,m.conv1,m.conv_shortcut + # norm + norm_args = 'num_groups num_channels eps affine'.split(' ') + for a in norm_args: assert hasattr(old_norm1, a) + norm_kwargs = { a: getattr(old_norm1, a) for a in norm_args } + norm_kwargs['num_channels'] += by # surgery done here + # conv1 + conv1_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') + for a in conv1_args: assert hasattr(old_conv1, a) + conv1_kwargs = { a: getattr(old_conv1, a) for a in conv1_args } + conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. + conv1_kwargs['in_channels'] += by # surgery done here + # conv_shortcut + # as we changed the input size of the block, the input and output sizes are likely different, + # therefore we need a conv_shortcut (simply adding won't work) + conv_shortcut_args_kwargs = { + 'in_channels': conv1_kwargs['in_channels'], + 'out_channels': conv1_kwargs['out_channels'], + # default arguments from resnet.__init__ + 'kernel_size':1, + 'stride':1, + 'padding':0, + 'bias':True + } + # swap old with new modules + unet.mid_block.resnets[0].norm1 = GroupNorm(**norm_kwargs) + unet.mid_block.resnets[0].conv1 = LoRACompatibleConv(**conv1_kwargs) + unet.mid_block.resnets[0].conv_shortcut = LoRACompatibleConv(**conv_shortcut_args_kwargs) + unet.mid_block.resnets[0].in_channels += by # surgery done here class EmbedSequential(nn.ModuleList): diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index a6c5214b9c96..08f8fad98585 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -29,7 +29,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# ToDo Umer: check if attention_bias should be passed to other block types def get_down_block( down_block_type, num_layers, @@ -50,7 +49,6 @@ def get_down_block( upcast_attention=False, resnet_time_scale_shift="default", attention_type="default", - attention_bias=False, resnet_skip_time_act=False, resnet_out_scale_factor=1.0, cross_attention_norm=None, @@ -137,7 +135,6 @@ def get_down_block( upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, attention_type=attention_type, - attention_bias=attention_bias, ) elif down_block_type == "SimpleCrossAttnDownBlock2D": if cross_attention_dim is None: @@ -261,7 +258,6 @@ def get_up_block( upcast_attention=False, resnet_time_scale_shift="default", attention_type="default", - attention_bias=False, resnet_skip_time_act=False, resnet_out_scale_factor=1.0, cross_attention_norm=None, @@ -308,7 +304,6 @@ def get_up_block( output_scale_factor=resnet_out_scale_factor, ) elif up_block_type == "CrossAttnUpBlock2D": - # todo umer: check if attention_bias required for typey other than CrossAttnUpBlock2D if cross_attention_dim is None: raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") return CrossAttnUpBlock2D( @@ -331,7 +326,6 @@ def get_up_block( upcast_attention=upcast_attention, 
resnet_time_scale_shift=resnet_time_scale_shift, attention_type=attention_type, - attention_bias=attention_bias, ) elif up_block_type == "SimpleCrossAttnUpBlock2D": if cross_attention_dim is None: @@ -590,7 +584,6 @@ def __init__( use_linear_projection=False, upcast_attention=False, attention_type="default", - attention_bias=False, ): super().__init__() @@ -628,7 +621,6 @@ def __init__( use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, attention_type=attention_type, - attention_bias=attention_bias, ) ) else: @@ -979,7 +971,6 @@ def __init__( only_cross_attention=False, upcast_attention=False, attention_type="default", - attention_bias=False, ): super().__init__() resnets = [] @@ -1017,7 +1008,6 @@ def __init__( only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, attention_type=attention_type, - attention_bias=attention_bias, ) ) else: @@ -2125,7 +2115,6 @@ def __init__( only_cross_attention=False, upcast_attention=False, attention_type="default", - attention_bias=False, ): super().__init__() resnets = [] @@ -2165,7 +2154,6 @@ def __init__( only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, attention_type=attention_type, - attention_bias=attention_bias, ) ) else: diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 27c83ddb31a0..16a535745683 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -53,6 +53,8 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +#TODO Umer: Remove attention_bias again + @dataclass class UNet2DConditionOutput(BaseOutput): """ @@ -208,7 +210,6 @@ def __init__( conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, attention_type: str = "default", - attention_bias: bool = False, class_embeddings_concat: bool = False, mid_block_only_cross_attention: Optional[bool] = None, cross_attention_norm: Optional[str] = None, @@ -458,7 +459,6 @@ def __init__( upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, attention_type=attention_type, - attention_bias=attention_bias, resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, @@ -468,7 +468,6 @@ def __init__( self.down_blocks.append(down_block) # mid - # todo umer: check if attention_bias also needed for types other than UNetMidBlock2DCrossAttn if mid_block_type == "UNetMidBlock2DCrossAttn": self.mid_block = UNetMidBlock2DCrossAttn( transformer_layers_per_block=transformer_layers_per_block[-1], @@ -486,7 +485,6 @@ def __init__( use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, attention_type=attention_type, - attention_bias=attention_bias, ) elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": self.mid_block = UNetMidBlock2DSimpleCrossAttn( @@ -555,7 +553,6 @@ def __init__( upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, attention_type=attention_type, - attention_bias=attention_bias, resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, From 95b7425d3711ef12a48c505f12cd79162ce47333 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 19 Oct 2023 16:48:10 +0200 Subject: [PATCH 11/88] ControlNetXSModel is now saveable+loadable --- .gitignore | 3 +- src/diffusers/models/controlnetxs.py | 184 
+++++++++++++++------------ 2 files changed, 102 insertions(+), 85 deletions(-) diff --git a/.gitignore b/.gitignore index 45602a1f547e..4f6974aa0a82 100644 --- a/.gitignore +++ b/.gitignore @@ -173,4 +173,5 @@ tags # ruff .ruff_cache -wandb \ No newline at end of file +wandb +.cursorignore diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 7cd878ebbbba..763e27d0a791 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -18,11 +18,10 @@ import torch from torch import nn -from torch.nn import functional as F from torch.nn.modules.normalization import GroupNorm import torch.utils.checkpoint -from ..configuration_utils import ConfigMixin +from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging from .embeddings import get_timestep_embedding @@ -55,43 +54,98 @@ class ControlNetXSModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): # to delete later @classmethod - def create_as_in_paper(cls): - # todo: load sdxl instead - base_model = UNet2DConditionModel( - block_out_channels=(320, 640, 1280), - down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D","UpBlock2D"), - transformer_layers_per_block=(0,2,10), - cross_attention_dim=2048, - ) - return cls( - base_model, + def create_as_in_paper(cls, base_model=None): + if base_model is None: + # todo: load sdxl instead + base_model = UNet2DConditionModel( + block_out_channels=(320, 640, 1280), + down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + up_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + transformer_layers_per_block=(0,2,10), + cross_attention_dim=2048, + ) + + def class_names(modules): return [m.__class__.__name__ for m in modules] + def get_time_emb_dim(unet: UNet2DConditionModel): return unet.time_embedding.linear_2.out_features + def get_time_emb_input_dim(unet: UNet2DConditionModel):return unet.time_embedding.linear_1.in_features + + base_model_channel_sizes = ControlNetXSModel.gather_base_model_sizes(base_model, base_or_control='base') + + cnxs_model = cls( model_channels=320, out_channels=4, hint_channels=3, block_out_channels=(32,64,128), + down_block_types=class_names(base_model.down_blocks), + up_block_types=class_names(base_model.up_blocks), + time_embedding_dim=get_time_emb_dim(base_model), + time_embedding_input_dim=get_time_emb_input_dim(base_model), transformer_layers_per_block=(0,2,10), cross_attention_dim=2048, learn_embedding=True, control_model_ratio=0.1, + base_model_channel_sizes=base_model_channel_sizes, ) - + cnxs_model.base_model = base_model + return cnxs_model + + @classmethod + def gather_base_model_sizes(cls, unet: UNet2DConditionModel, base_or_control): + if base_or_control not in ['base', 'control']: + raise ValueError(f"`base_or_control` needs to be either `base` or `control`") + + channel_sizes = {'enc': [], 'mid': [], 'dec': []} + + # input convolution + channel_sizes['enc'].append((unet.conv_in.in_channels, unet.conv_in.out_channels)) + + # encoder blocks + for module in unet.down_blocks: + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + for r in module.resnets: + channel_sizes['enc'].append((r.in_channels, r.out_channels)) + if module.downsamplers: + channel_sizes['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) + else: + 
raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + + # middle block + channel_sizes['mid'].append((unet.mid_block.resnets[0].in_channels, unet.mid_block.resnets[0].out_channels)) + + # decoder blocks + if base_or_control == 'base': + for module in unet.up_blocks: + if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)): + for r in module.resnets: + channel_sizes['dec'].append((r.in_channels, r.out_channels)) + else: + raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + + return channel_sizes + + @register_to_config def __init__( self, - base_model: UNet2DConditionModel, - model_channels, - out_channels, - hint_channels, - block_out_channels, - transformer_layers_per_block, - cross_attention_dim: Union[int, Tuple[int]] = 1280, + model_channels=320, + out_channels=4, + hint_channels=3, + block_out_channels=(32,64,128), + down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + up_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + time_embedding_dim=1280, + time_embedding_input_dim=320, + transformer_layers_per_block=(0,2,10), + cross_attention_dim: Union[int, Tuple[int]] = 2048,#1280, learn_embedding=False, control_model_ratio=1.0, + base_model_channel_sizes={ + 'enc': [(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], + 'mid': [(1280, 1280)], + 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] + }, ): super().__init__() - self.base_model = base_model - # 1 - Save parameters # TODO make variables self.in_ch_factor = 1 if "cat" == 'add' else 2 @@ -105,26 +159,18 @@ def __init__( self.learn_embedding = learn_embedding # 1 - Create controller - def class_names(modules): - return [m.__class__.__name__ for m in modules] - - def get_time_emd_dim(unet: UNet2DConditionModel): - return unet.time_embedding.linear_2.out_features - self.control_model = ctrl_model = UNet2DConditionModel( block_out_channels=block_out_channels, - down_block_types=class_names(base_model.down_blocks), - up_block_types=class_names(base_model.up_blocks), - time_embedding_dim=get_time_emd_dim(base_model), + down_block_types=down_block_types, + up_block_types=up_block_types, + time_embedding_dim=time_embedding_dim, transformer_layers_per_block=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, ) # 2 - Do model surgery on control model # 2.1 - Allow to use the same time information as the base model - def get_time_emd_input_dim(unet: UNet2DConditionModel): - return unet.time_embedding.linear_1.in_features - adjust_time_input_dim(self.control_model, get_time_emd_input_dim(base_model)) + adjust_time_input_dim(self.control_model, time_embedding_input_dim) # 2.2 - Allow for information infusion from base model # todo: make variable (sth like zip(block_out_channels[:-1],block_out_channels[1:])) for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))): @@ -135,46 +181,8 @@ def get_time_emd_input_dim(unet: UNet2DConditionModel): increase_block_input_in_mid_resnet(self.control_model, by=1280) # todo: make var # 3 - Gather Channel Sizes - ch_inout_ctrl = {'enc': [], 'mid': [], 'dec': []} - ch_inout_base = {'enc': [], 'mid': [], 'dec': []} - - # 3.1 - input convolution - ch_inout_ctrl['enc'].append((ctrl_model.conv_in.in_channels, ctrl_model.conv_in.out_channels)) - 
ch_inout_base['enc'].append((base_model.conv_in.in_channels, base_model.conv_in.out_channels)) - - # 3.2 - encoder blocks - for module in ctrl_model.down_blocks: - if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): - for r in module.resnets: - ch_inout_ctrl['enc'].append((r.in_channels, r.out_channels)) - if module.downsamplers: - ch_inout_ctrl['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) - else: - raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') - - for module in base_model.down_blocks: - if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): - for r in module.resnets: - ch_inout_base['enc'].append((r.in_channels, r.out_channels)) - if module.downsamplers: - ch_inout_base['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) - else: - raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') - - # 3.3 - middle block - ch_inout_ctrl['mid'].append((ctrl_model.mid_block.resnets[0].in_channels, ctrl_model.mid_block.resnets[0].out_channels)) - ch_inout_base['mid'].append((base_model.mid_block.resnets[0].in_channels, base_model.mid_block.resnets[0].out_channels)) - - # 3.4 - decoder blocks - for module in base_model.up_blocks: - if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)): - for r in module.resnets: - ch_inout_base['dec'].append((r.in_channels, r.out_channels)) - else: - raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') - - self.ch_inout_ctrl = ch_inout_ctrl - self.ch_inout_base = ch_inout_base + self.ch_inout_ctrl = ControlNetXSModel.gather_base_model_sizes(self.control_model, base_or_control='control') + self.ch_inout_base = base_model_channel_sizes # 4 - Build connections between base and control model self.enc_zero_convs_out = nn.ModuleList([]) @@ -186,26 +194,25 @@ def get_time_emd_input_dim(unet: UNet2DConditionModel): self.dec_zero_convs_out = nn.ModuleList([]) self.dec_zero_convs_in = nn.ModuleList([]) - for ch_io_base in ch_inout_base['enc']: + for ch_io_base in self.ch_inout_base['enc']: self.enc_zero_convs_in.append(self.make_zero_conv( in_channels=ch_io_base[1], out_channels=ch_io_base[1]) ) - for i in range(len(ch_inout_ctrl['enc'])): + for i in range(len(self.ch_inout_ctrl['enc'])): self.enc_zero_convs_out.append( - self.make_zero_conv(ch_inout_ctrl['enc'][i][1], ch_inout_base['enc'][i][1]) + self.make_zero_conv(self.ch_inout_ctrl['enc'][i][1], self.ch_inout_base['enc'][i][1]) ) - self.middle_block_out = self.make_zero_conv(ch_inout_ctrl['mid'][-1][1], ch_inout_base['mid'][-1][1]) + self.middle_block_out = self.make_zero_conv(self.ch_inout_ctrl['mid'][-1][1], self.ch_inout_base['mid'][-1][1]) self.dec_zero_convs_out.append( - self.make_zero_conv(ch_inout_ctrl['enc'][-1][1], ch_inout_base['mid'][-1][1]) + self.make_zero_conv(self.ch_inout_ctrl['enc'][-1][1], self.ch_inout_base['mid'][-1][1]) ) - for i in range(1, len(ch_inout_ctrl['enc'])): + for i in range(1, len(self.ch_inout_ctrl['enc'])): self.dec_zero_convs_out.append( - self.make_zero_conv(ch_inout_ctrl['enc'][-(i + 1)][1], ch_inout_base['dec'][i - 1][1]) + self.make_zero_conv(self.ch_inout_ctrl['enc'][-(i + 1)][1], self.ch_inout_base['dec'][i - 1][1]) ) - # 5 - Create conditioning hint embedding self.input_hint_block = nn.Sequential( nn.Conv2d(hint_channels, 16, 3, padding=1), @@ -237,7 +244,16 @@ def get_time_emd_input_dim(unet: UNet2DConditionModel): scale_list = [1.] 
* len(self.enc_zero_convs_out) + [1.] + [1.] * len(self.dec_zero_convs_out) self.register_buffer('scale_list', torch.tensor(scale_list)) + # in the mininal implementation setting, we only need the control model up to the mid block + # note: these can only be deleted after has to be `gather_base_model_sizes(self.control_mode, 'control')` has been called + del self.control_model.up_blocks + del self.control_model.conv_norm_out + del self.control_model.conv_out + def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch.Tensor, c: dict, hint: torch.Tensor, no_control=False, **kwargs): + if self.base_model is None: + raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`") + """ Params from unet_2d_condition.UNet2DConditionModel.forward: # self, # sample: torch.FloatTensor, @@ -308,7 +324,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch h_ctrl = torch.cat([h_ctrl, inp_base2ctrl], dim=1) h_base = m_base(h_base, temb, cemb, context) h_ctrl = m_ctrl(h_ctrl, temb, cemb, context) - h_base = h_base + next(it_enc_convs_out)(h_ctrl, temb, cemb) * next(scales) + h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 2 - mid blocks (bottleneck) From 0e7c848e3eb5d9452f57b24cf396287f90bad350 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 19 Oct 2023 21:06:30 +0200 Subject: [PATCH 12/88] Forward works --- src/diffusers/models/controlnetxs.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 763e27d0a791..3156696e3941 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -313,18 +313,28 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch # 0 - conv in h_base = self.base_model.conv_in(h_base) h_ctrl = self.control_model.conv_in(h_ctrl) + if guided_hint is not None: + h_ctrl += guided_hint + h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 1 - input blocks (encoder) - for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): - inp_base2ctrl = next(it_enc_convs_in)(h_base) # get info from base encoder - if guided_hint is not None: # in first, add hint info if it exists - inp_base2ctrl += guided_hint - guided_hint = None - h_ctrl = torch.cat([h_ctrl, inp_base2ctrl], dim=1) + for i, (m_base, m_ctrl) in enumerate(zip(base_down_subblocks, ctrl_down_subblocks)): + self.debug_print(f'>>> >>> Start {i+1}') + self.debug_print(f'{i+1}] h_base.shape: {list(h_base.shape)}') + self.debug_print(f'{i+1}] h_ctrl.shape: {list(h_ctrl.shape)}') + h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) + self.debug_print('>>> After base->ctrl concat') + self.debug_print(f'{i+1}] h_ctrl.shape: {list(h_ctrl.shape)}') h_base = m_base(h_base, temb, cemb, context) h_ctrl = m_ctrl(h_ctrl, temb, cemb, context) + self.debug_print('>>> After block application') + self.debug_print(f'{i+1}] h_base.shape: {list(h_base.shape)}') + self.debug_print(f'{i+1}] h_ctrl.shape: {list(h_ctrl.shape)}') h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) + self.debug_print('>>> After ctrl->base add') + self.debug_print(f'{i+1}] h_base.shape: {list(h_base.shape)}') + self.debug_print(' - - - - - - - - - - - - - ') 
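            # (added note) both streams' activations are stashed: hs_base feeds the usual U-Net skip concatenation
            # in the decoder loop below, while hs_ctrl is popped there and routed into the base stream via
            # dec_zero_convs_out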
hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 2 - mid blocks (bottleneck) @@ -345,6 +355,10 @@ def make_zero_conv(self, in_channels, out_channels=None): self.out_channels = out_channels or in_channels return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0)) + def debug_print(self, s): + if hasattr(self, 'debug') and self.debug: + print(s) + def adjust_time_input_dim(unet: UNet2DConditionModel, dim: int): time_emb = unet.time_embedding From 969e7e8993fa77eae8900fcebc115d1906ed5858 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 19 Oct 2023 22:42:10 +0200 Subject: [PATCH 13/88] checkin --- src/diffusers/models/__init__.py | 2 + src/diffusers/models/controlnetxs.py | 6 +- .../pipeline_controlnet_xs_sd_xl.py | 799 ++++++++++++++++++ 3 files changed, 805 insertions(+), 2 deletions(-) create mode 100644 src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 75ddb21fb15d..fd1ac0136ef8 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -25,6 +25,7 @@ _import_structure["autoencoder_kl"] = ["AutoencoderKL"] _import_structure["autoencoder_tiny"] = ["AutoencoderTiny"] _import_structure["controlnet"] = ["ControlNetModel"] + _import_structure["controlnetxs"] = ["ControlNetXSModel"] _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"] _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["prior_transformer"] = ["PriorTransformer"] @@ -50,6 +51,7 @@ from .autoencoder_kl import AutoencoderKL from .autoencoder_tiny import AutoencoderTiny from .controlnet import ControlNetModel + from .controlnetxs import ControlNetXSModel from .dual_transformer_2d import DualTransformer2DModel from .modeling_utils import ModelMixin from .prior_transformer import PriorTransformer diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 3156696e3941..8ed140db705b 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -143,6 +143,7 @@ def __init__( 'mid': [(1280, 1280)], 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] }, + global_pool_conditions: bool = False, # Todo Umer: Needed by SDXL pipeline, but what is this? 
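        # (added note, from memory - please verify) in diffusers' ControlNetModel this config flag makes the
        # controlnet mean-pool its residuals over the spatial dimensions, and the SD controlnet pipelines treat
        # it as "always use guess mode"; it may not be needed for ControlNet-XS at all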
): super().__init__() @@ -155,7 +156,7 @@ def __init__( self.model_channels = model_channels self.control_scale = 1.0 self.hint_model = None - + self.no_control = False self.learn_embedding = learn_embedding # 1 - Create controller @@ -277,7 +278,8 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch context=c.get("crossattn", None) y=c.get("vector", None) - if no_control: return self.base_model(x=x, timesteps=timesteps, context=context, y=y, **kwargs) + if no_control or self.no_control: + return self.base_model(x, timesteps, encoder_hidden_states, added_cond_kwargs={}, **kwargs) # time embeddings timesteps = timesteps[None] diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py new file mode 100644 index 000000000000..3d78580aaa6b --- /dev/null +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -0,0 +1,799 @@ +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer + +from diffusers.utils.import_utils import is_invisible_watermark_available + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ControlNetXSModel, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import logging +from ...utils.torch_utils import is_compiled_module, randn_tensor +from ..pipeline_utils import DiffusionPipeline +from ..stable_diffusion_xl import StableDiffusionXLPipelineOutput + + +if is_invisible_watermark_available(): + from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class StableDiffusionXLControlNetXSPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin +): + model_cpu_offload_seq = ( + "text_encoder->text_encoder_2->unet->vae" # leave controlnet out on purpose because it iterates with unet + ) + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: ControlNetXSModel, + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + + self.watermark = None + + 
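        # (added note) `add_watermarker` is accepted in the signature but, as of this check-in, not used here:
        # `self.watermark` stays None, so watermarking appears to be disabled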
self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + # textual inversion: procecss multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder( + text_input_ids.to(device), + output_hidden_states=True, + ) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + prompt_embeds = prompt_embeds.hidden_states[-2] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt, negative_prompt_2] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + original_size: Tuple[int, int] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + ): + r""" + The call function to 
the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The ControlNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single ControlNet. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 5.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2` + and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, pooled text embeddings are generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt + weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be as same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned containing the output images. + """ + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = 1 # len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [ + control_guidance_end + ] + + # 1. Check inputs. 
Raise error if not correct + #self.check_inputs( + # prompt, + # prompt_2, + # image, + # callback_steps, + # negative_prompt, + # negative_prompt_2, + # prompt_embeds, + # negative_prompt_embeds, + # pooled_prompt_embeds, + # negative_pooled_prompt_embeds, + # controlnet_conditioning_scale, + # control_guidance_start, + # control_guidance_end, + # ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetXSModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt, + prompt_2, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + + # 4. Prepare image + if isinstance(controlnet, ControlNetXSModel): + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, + ) + height, width = image.shape[-2:] + else: + assert False + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetXSModel) else keeps) + + # 7.2 Prepare added time ids & embeddings + if isinstance(image, list): + original_size = original_size or image[0].shape[-2:] + else: + original_size = original_size or image.shape[-2:] + target_size = target_size or (height, width) + + add_text_embeds = pooled_prompt_embeds + add_time_ids = self._get_add_time_ids( + original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype + ) + + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + ) + else: + negative_add_time_ids = add_time_ids + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + #added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # predict the noise residual + noise_pred = self.controlnet( + x=latent_model_input, + t=t, + encoder_hidden_states=prompt_embeds, + c={}, + hint=image, # todo: better naming + #cross_attention_kwargs=cross_attention_kwargs, + #return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # manually for max memory savings + if self.vae.dtype == torch.float16 and self.vae.config.force_upcast: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + 
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) From 111fa2d9889837af25e5004591408e3a7d815a9d Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 20 Oct 2023 17:58:33 +0200 Subject: [PATCH 14/88] Pipeline works with `no_control=True` --- src/diffusers/models/controlnetxs.py | 4 ++-- .../pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 8ed140db705b..5c7164c90c15 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -279,7 +279,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch y=c.get("vector", None) if no_control or self.no_control: - return self.base_model(x, timesteps, encoder_hidden_states, added_cond_kwargs={}, **kwargs) + return self.base_model(x, timesteps, encoder_hidden_states, **kwargs) # time embeddings timesteps = timesteps[None] @@ -349,7 +349,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder h_base = m_base(h_base, temb, cemb, context) - return self.base_model.conv_out(h_base) + return UNet2DConditionOutput(sample=self.base_model.conv_out(h_base)) def make_zero_conv(self, in_channels, out_channels=None): # keep running track # todo: better comment diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 3d78580aaa6b..5be44de2a139 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -735,7 +735,7 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - #added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} # predict the noise residual noise_pred = self.controlnet( @@ -746,7 +746,8 @@ def __call__( hint=image, # todo: better naming #cross_attention_kwargs=cross_attention_kwargs, #return_dict=False, - )[0] + added_cond_kwargs=added_cond_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: From 6270bc0502e7e80d18ef5634091722b35415ea9b Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 23 Oct 2023 17:19:40 +0200 Subject: [PATCH 15/88] checkin --- src/diffusers/models/unet_2d_condition.py | 2 -- .../pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git 
a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 16a535745683..d695d182fa37 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -53,8 +53,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -#TODO Umer: Remove attention_bias again - @dataclass class UNet2DConditionOutput(BaseOutput): """ diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 5be44de2a139..862e71f200d7 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -727,6 +727,10 @@ def __call__( add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + # # DEBUG + if callback is not None: callback(-1, -1, latents) + # # + # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -751,8 +755,10 @@ def __call__( # perform guidance if do_classifier_free_guidance: + print(f'{i}] Yup, doing classifier-free guidance') noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + print('{i}] Avg predicted noise = {noise_pred.mean()}') # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] From 92673881f41c966e064d0b4cdb671d96ce23918e Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 27 Oct 2023 14:03:32 +0200 Subject: [PATCH 16/88] debug: save intermediate outputs of resnet --- src/diffusers/models/controlnetxs.py | 76 +++++++++++++------ src/diffusers/models/resnet.py | 30 +++++++- .../pipeline_controlnet_xs_sd_xl.py | 20 +++-- .../schedulers/scheduling_euler_discrete.py | 15 ++++ 4 files changed, 110 insertions(+), 31 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 5c7164c90c15..085bad368e7b 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -143,7 +143,8 @@ def __init__( 'mid': [(1280, 1280)], 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] }, - global_pool_conditions: bool = False, # Todo Umer: Needed by SDXL pipeline, but what is this? + global_pool_conditions: bool = False, # Todo Umer: Needed by SDXL pipeline, but what is this?, + control_scale=0.95, # 1 in Heidelberg code, but 0.95 in usage script ): super().__init__() @@ -154,7 +155,6 @@ def __init__( self.out_channels = out_channels self.dims = 2 self.model_channels = model_channels - self.control_scale = 1.0 self.hint_model = None self.no_control = False self.learn_embedding = learn_embedding @@ -243,7 +243,7 @@ def __init__( # info: I deleted the encoder_hid_proj as it's not given by the Heidelberg CVL weights scale_list = [1.] * len(self.enc_zero_convs_out) + [1.] + [1.] 
* len(self.dec_zero_convs_out)
- self.register_buffer('scale_list', torch.tensor(scale_list))
+ self.register_buffer('scale_list', torch.tensor(scale_list) * control_scale)
# in the minimal implementation setting, we only need the control model up to the mid block
# note: these can only be deleted after `gather_base_model_sizes(self.control_mode, 'control')` has been called
@@ -251,6 +251,8 @@ def __init__(
del self.control_model.conv_norm_out
del self.control_model.conv_out
+ DEBUG_LOG_by_Umer = False
+ DEBUG_LOG_by_Umer_file = 'debug_log.pkl'
def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch.Tensor, c: dict, hint: torch.Tensor, no_control=False, **kwargs):
if self.base_model is None: raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`")
@@ -275,7 +277,6 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch
if x.size(0) // 2 == hint.size(0): hint = torch.cat([hint, hint], dim=0) # for classifier free guidance
timesteps=t
- context=c.get("crossattn", None)
y=c.get("vector", None)
if no_control or self.no_control:
@@ -311,44 +312,75 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch
ctrl_mid_subblocks = to_sub_blocks([self.control_model.mid_block])
base_up_subblocks = to_sub_blocks(self.base_model.up_blocks)
+ # Debug Umer -- to delete later on
+ debug_log = []
+ def debug_by_umer(stage, msg, obj):
+ if not self.DEBUG_LOG_by_Umer: return
+ i = len(debug_log)
+ if isinstance(obj, torch.Tensor): obj = obj.cpu()
+ debug_log.append((i, stage, msg, obj))
+ def debug_save():
+ if not self.DEBUG_LOG_by_Umer: return
+ import pickle
+ pickle.dump(debug_log, open(self.DEBUG_LOG_by_Umer_file, "wb"))
+ raise RuntimeError("Debug Log saved successfully")
+
+ debug_by_umer('prep', 'x', x)
+ debug_by_umer('prep', 'temb', temb)
+ debug_by_umer('prep', 'context', cemb)
+ debug_by_umer('prep', 'raw hint', hint)
+ debug_by_umer('prep', 'guided_hint', guided_hint)
+
# Cross Control
# 0 - conv in
h_base = self.base_model.conv_in(h_base)
+ debug_by_umer('enc', 'h_base', h_base)
h_ctrl = self.control_model.conv_in(h_ctrl)
+ debug_by_umer('enc', 'h_ctrl', h_ctrl)
if guided_hint is not None: h_ctrl += guided_hint
+ debug_by_umer('enc', 'h_ctrl', h_ctrl)
h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales)
+ debug_by_umer('enc', 'h_base', h_base)
hs_base.append(h_base)
hs_ctrl.append(h_ctrl)
# 1 - input blocks (encoder)
for i, (m_base, m_ctrl) in enumerate(zip(base_down_subblocks, ctrl_down_subblocks)):
- self.debug_print(f'>>> >>> Start {i+1}')
- self.debug_print(f'{i+1}] h_base.shape: {list(h_base.shape)}')
- self.debug_print(f'{i+1}] h_ctrl.shape: {list(h_ctrl.shape)}')
h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1)
- self.debug_print('>>> After base->ctrl concat')
- self.debug_print(f'{i+1}] h_ctrl.shape: {list(h_ctrl.shape)}')
- h_base = m_base(h_base, temb, cemb, context)
- h_ctrl = m_ctrl(h_ctrl, temb, cemb, context)
- self.debug_print('>>> After block application')
- self.debug_print(f'{i+1}] h_base.shape: {list(h_base.shape)}')
- self.debug_print(f'{i+1}] h_ctrl.shape: {list(h_ctrl.shape)}')
+ debug_by_umer('enc', 'h_ctrl', h_ctrl)
+ h_base = m_base(h_base, temb, cemb)
+ debug_by_umer('enc', 'h_base', h_base)
+ h_ctrl = m_ctrl(h_ctrl, temb, cemb)
+ debug_by_umer('enc', 'h_ctrl', h_ctrl)
h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales)
- self.debug_print('>>> After ctrl->base add')
- self.debug_print(f'{i+1}] h_base.shape: {list(h_base.shape)}')
- self.debug_print(' - - - - - - - - - - - - - ')
+ debug_by_umer('enc', 'h_base', h_base)
hs_base.append(h_base)
hs_ctrl.append(h_ctrl)
- # 2 - mid blocks (bottleneck)
h_ctrl = torch.concat([h_ctrl, h_base], dim=1)
+ debug_by_umer('enc', 'h_ctrl', h_ctrl)
+ # 2 - mid blocks (bottleneck)
for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks):
- h_base = m_base(h_base, temb, cemb, context)
- h_ctrl = m_ctrl(h_ctrl, temb, cemb, context)
+ h_base = m_base(h_base, temb, cemb)
+ h_ctrl = m_ctrl(h_ctrl, temb, cemb)
+ # Heidelberg treats the R/A/R as one block, while I treat it as 2 subblocks
+ # Let's therefore only log after the mid section
+ debug_by_umer('mid', 'h_base', h_base)
+ debug_by_umer('mid', 'h_ctrl', h_ctrl)
+
+ h_base = h_base + self.middle_block_out(h_ctrl) * next(scales)
+ debug_by_umer('mid', 'h_base', h_base)
+
# 3 - output blocks (decoder)
for m_base in base_up_subblocks:
h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder
+ debug_by_umer('dec', 'h_base', h_base)
h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder
+ debug_by_umer('dec', 'h_base', h_base)
- h_base = m_base(h_base, temb, cemb, context)
+ h_base = m_base(h_base, temb, cemb)
+ debug_by_umer('dec', 'h_base', h_base)
+
+ debug_save()
+
return UNet2DConditionOutput(sample=self.base_model.conv_out(h_base))
def make_zero_conv(self, in_channels, out_channels=None):
@@ -454,7 +486,7 @@ def __init__(self,ms,*args,**kwargs):
if not is_iterable(ms): ms = [ms]
super().__init__(ms,*args,**kwargs)
- def forward(self,x,temb,cemb,context):
+ def forward(self,x,temb,cemb):
for m in self:
if isinstance(m,ResnetBlock2D): x=m(x,temb)
elif isinstance(m,Transformer2DModel): x=m(x,cemb).sample # Q: Include temp also?
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index 80bf269fc4e3..62411f3d0d1b 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -683,14 +683,22 @@ def __init__(
)
def forward(self, input_tensor, temb, scale: float = 1.0):
+ UMER_DEBUG_CACHE = []
+
hidden_states = input_tensor
+ UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'start'))
+ UMER_DEBUG_CACHE.append(('temb', temb, 'start'))
+ UMER_DEBUG_CACHE.append(('scale', scale, 'start'))
+
if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
hidden_states = self.norm1(hidden_states, temb)
else:
hidden_states = self.norm1(hidden_states)
+ UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after norm1'))
hidden_states = self.nonlinearity(hidden_states)
+ UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after silu'))
if self.upsample is not None:
# upsample_nearest_nhwc fails with large batch sizes.
see https://github.com/huggingface/diffusers/issues/984 @@ -720,32 +728,42 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states) + UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after conv1')) if self.time_emb_proj is not None: if not self.skip_time_act: temb = self.nonlinearity(temb) + UMER_DEBUG_CACHE.append(('temb', temb, 'after silu')) temb = ( self.time_emb_proj(temb, scale)[:, :, None, None] if not USE_PEFT_BACKEND else self.time_emb_proj(temb)[:, :, None, None] - ) + ) + UMER_DEBUG_CACHE.append(('temb', temb, 'after linear')) + if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb + UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after time add')) + if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) + UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after norm2')) if temb is not None and self.time_embedding_norm == "scale_shift": scale, shift = torch.chunk(temb, 2, dim=1) hidden_states = hidden_states * (1 + scale) + shift hidden_states = self.nonlinearity(hidden_states) + UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after silu')) hidden_states = self.dropout(hidden_states) + UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after dropout')) hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) + UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after conv2')) if self.conv_shortcut is not None: input_tensor = ( @@ -753,6 +771,16 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + UMER_DEBUG_CACHE.append(('hidden_states', output_tensor, 'after skip + scale')) + + import pickle + with open('intermediate_output/local_resnet.pkl','wb') as f: + pickle.dump(UMER_DEBUG_CACHE, f) + + msg = "End of 1st ResNet reached." + msg += "\nLet's analyze the intermediate steps, my man. Don't be intimidated, you can do it. Believe in the you that believes in yourself." + msg += "\n\nBtw, results are saved to file 'intermediate_output/local_resnet.pkl'." + raise ValueError(msg) return output_tensor diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 862e71f200d7..6a0745fd8667 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -374,8 +374,9 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents.to(device) # scale the initial noise by the standard deviation required by the scheduler + initial_unscaled_latents = latents # Umer: remove here & from return latents = latents * self.scheduler.init_noise_sigma - return latents + return latents, initial_unscaled_latents # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): @@ -672,8 +673,11 @@ def __call__( timesteps = self.scheduler.timesteps # 6. 
Prepare latent variables + if latents is not None: print("Passed in latents: ", latents.flatten()[:5]) + else: print("No latents passed in") + num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( + latents, initial_unscaled_latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, height, @@ -683,6 +687,12 @@ def __call__( generator, latents, ) + print("initial_unscaled_latents: ", initial_unscaled_latents.flatten()[:5]) + print("latents: ", latents.flatten()[:5]) + # # DEBUG + if callback is not None: callback(-1, -1, initial_unscaled_latents) + # # + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -727,10 +737,6 @@ def __call__( add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) - # # DEBUG - if callback is not None: callback(-1, -1, latents) - # # - # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -755,10 +761,8 @@ def __call__( # perform guidance if do_classifier_free_guidance: - print(f'{i}] Yup, doing classifier-free guidance') noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - print('{i}] Avg predicted noise = {noise_pred.mean()}') # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 0875e1af3325..d4183a44ad74 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -153,6 +153,9 @@ def __init__( self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. 
+ + print(f'beta_schedule = "scaled_linear" and beta_start={beta_start}, beta_end={beta_end}') + self.betas = ( torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 ) @@ -169,6 +172,8 @@ def __init__( sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = torch.from_numpy(sigmas) + print(f'At the end of __init__, the sigmas are {self.sigmas[:5]} ...') + # setable values self.num_inference_steps = None timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() @@ -242,6 +247,9 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic # casting to int to avoid issues when num_inference_step is power of 3 timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) timesteps += self.config.steps_offset + + print(f'timestep_spacing = "leading" and timesteps={timesteps[:5]} ...') + elif self.config.timestep_spacing == "trailing": step_ratio = self.config.num_train_timesteps / self.num_inference_steps # creates integer timesteps by multiplying by ratio @@ -254,10 +262,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + print(f'sigmas before interpolation: {sigmas[:5]} ...') + log_sigmas = np.log(sigmas) if self.config.interpolation_type == "linear": sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + print(f'sigmas after (linear) interpolation: {sigmas[:5]} ...') elif self.config.interpolation_type == "log_linear": sigmas = torch.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp() else: @@ -276,6 +287,10 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device=device) self._step_index = None + print(f'At end of `set_timesteps`:') + print(f'sigmas = {self.sigmas[:5]} ...') + print(f'timesteps = {self.timesteps[:5]} ...') + def _sigma_to_t(self, sigma, log_sigmas): # get log sigma log_sigma = np.log(sigma) From 8bcb3d04d637d3a3369cc1849ee97f8d26159fd7 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 27 Oct 2023 23:58:47 +0200 Subject: [PATCH 17/88] checkin --- src/diffusers/models/controlnetxs.py | 9 +++- src/diffusers/models/resnet.py | 54 +++++++++++-------- .../schedulers/scheduling_euler_discrete.py | 7 +-- 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 085bad368e7b..8c577e2c4399 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -85,6 +85,7 @@ def get_time_emb_input_dim(unet: UNet2DConditionModel):return unet.time_embeddin learn_embedding=True, control_model_ratio=0.1, base_model_channel_sizes=base_model_channel_sizes, + control_scale=0.95, ) cnxs_model.base_model = base_model return cnxs_model @@ -144,7 +145,7 @@ def __init__( 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] }, global_pool_conditions: bool = False, # Todo Umer: Needed by SDXL pipeline, but what is this?, - control_scale=0.95, # 1 in Heidelberg code, but 0.95 in usage script + control_scale=1, ): super().__init__() @@ -237,6 +238,7 @@ def __init__( pass self.flip_sin_to_cos = True # default params self.freq_shift = 0 + # !! TODO !! 
: learn_embedding is True, so we need our own embedding
# Todo: Only when `learn_embedding = False` can we just use the base model's time embedding, otherwise we need to create our own
# Text embedding
@@ -283,7 +285,9 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch
return self.base_model(x, timesteps, encoder_hidden_states, **kwargs)
# time embeddings
+ print("timesteps =",timesteps)
timesteps = timesteps[None]
+ print("timesteps =",timesteps)
t_emb = get_timestep_embedding(
timesteps,
self.model_channels,
@@ -291,8 +295,11 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch
flip_sin_to_cos=self.flip_sin_to_cos,
downscale_freq_shift=self.freq_shift,
)
+ print(f't_emb.shape = {list(t_emb.shape)}')
+ print(f'learn_embedding = {self.learn_embedding}')
if self.learn_embedding:
temb = self.control_model.time_embedding(t_emb) * self.control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.control_scale ** 0.3)
+ print(f'temb.shape = {list(temb.shape)}')
else:
temb = self.base_model.time_embedding(t_emb)
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index 62411f3d0d1b..d2f935dfe926 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -683,22 +683,33 @@ def __init__(
)
def forward(self, input_tensor, temb, scale: float = 1.0):
+
+ DO_UMER_CACHE = False
+ UMER_DEBUG_CACHE = []
+ umer_cache_i = 0
+ def append_to_umer_cache(msg,obj,comment):
+ nonlocal umer_cache_i
+ if not DO_UMER_CACHE: return
+ if hasattr(obj,'cpu'): obj = obj.cpu()
+ UMER_DEBUG_CACHE.append((umer_cache_i, msg,comment,obj))
+ umer_cache_i += 1
+
hidden_states = input_tensor
- UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'start'))
- UMER_DEBUG_CACHE.append(('temb', temb, 'start'))
- UMER_DEBUG_CACHE.append(('scale', scale, 'start'))
+ append_to_umer_cache('hidden_states', hidden_states, 'start')
+ append_to_umer_cache('temb', temb, 'start')
+ append_to_umer_cache('scale', scale, 'start')
if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
hidden_states = self.norm1(hidden_states, temb)
else:
hidden_states = self.norm1(hidden_states)
- UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after norm1'))
+ append_to_umer_cache('hidden_states', hidden_states, 'after norm1')
hidden_states = self.nonlinearity(hidden_states)
- UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after silu'))
+ append_to_umer_cache('hidden_states', hidden_states, 'after silu')
if self.upsample is not None:
# upsample_nearest_nhwc fails with large batch sizes.
see https://github.com/huggingface/diffusers/issues/984 @@ -728,42 +739,42 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states) - UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after conv1')) + append_to_umer_cache('hidden_states', hidden_states, 'after conv1') if self.time_emb_proj is not None: if not self.skip_time_act: temb = self.nonlinearity(temb) - UMER_DEBUG_CACHE.append(('temb', temb, 'after silu')) + append_to_umer_cache('temb', temb, 'after silu') temb = ( self.time_emb_proj(temb, scale)[:, :, None, None] if not USE_PEFT_BACKEND else self.time_emb_proj(temb)[:, :, None, None] ) - UMER_DEBUG_CACHE.append(('temb', temb, 'after linear')) + append_to_umer_cache('temb', temb, 'after linear') if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after time add')) + append_to_umer_cache('hidden_states', hidden_states, 'after time add') if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) - UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after norm2')) + append_to_umer_cache('hidden_states', hidden_states, 'after norm2') if temb is not None and self.time_embedding_norm == "scale_shift": scale, shift = torch.chunk(temb, 2, dim=1) hidden_states = hidden_states * (1 + scale) + shift hidden_states = self.nonlinearity(hidden_states) - UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after silu')) + append_to_umer_cache('hidden_states', hidden_states, 'after silu') hidden_states = self.dropout(hidden_states) - UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after dropout')) + append_to_umer_cache('hidden_states', hidden_states, 'after dropout') hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) - UMER_DEBUG_CACHE.append(('hidden_states', hidden_states, 'after conv2')) + append_to_umer_cache('hidden_states', hidden_states, 'after conv2') if self.conv_shortcut is not None: input_tensor = ( @@ -771,16 +782,17 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) output_tensor = (input_tensor + hidden_states) / self.output_scale_factor - UMER_DEBUG_CACHE.append(('hidden_states', output_tensor, 'after skip + scale')) + append_to_umer_cache('hidden_states', output_tensor, 'after skip + scale') - import pickle - with open('intermediate_output/local_resnet.pkl','wb') as f: - pickle.dump(UMER_DEBUG_CACHE, f) + if DO_UMER_CACHE: + import pickle + with open('intermediate_output/local_resnet.pkl','wb') as f: + pickle.dump(UMER_DEBUG_CACHE, f) - msg = "End of 1st ResNet reached." - msg += "\nLet's analyze the intermediate steps, my man. Don't be intimidated, you can do it. Believe in the you that believes in yourself." - msg += "\n\nBtw, results are saved to file 'intermediate_output/local_resnet.pkl'." - raise ValueError(msg) + msg = "End of 1st ResNet reached." + msg += "\nLet's analyze the intermediate steps, my man. Don't be intimidated, you can do it. Believe in the you that believes in yourself." + msg += "\n\nBtw, results are saved to file 'intermediate_output/local_resnet.pkl'." 
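Aside (not part of the patch): once 'intermediate_output/local_resnet.pkl' has been written by the block above, its (index, name, comment, value) tuples can be diffed against a second dump from the reference implementation. The reference file name, and the assumption that it uses the same tuple layout, are hypothetical.

import pickle
import torch

def compare_debug_caches(local_path="intermediate_output/local_resnet.pkl",
                         reference_path="intermediate_output/reference_resnet.pkl",  # hypothetical file
                         atol=1e-5):
    # both caches are assumed to be lists of (index, name, comment, value) tuples
    with open(local_path, "rb") as f:
        local = pickle.load(f)
    with open(reference_path, "rb") as f:
        reference = pickle.load(f)
    for (i, name, comment, a), (_, _, _, b) in zip(local, reference):
        if torch.is_tensor(a) and torch.is_tensor(b):
            close = torch.allclose(a.float(), b.float(), atol=atol)
        else:
            close = a == b
        print(f"{i:3d}  {name:<15} {comment:<20} match={close}")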
+ raise ValueError(msg) return output_tensor diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index d4183a44ad74..91d12e26d887 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -153,9 +153,6 @@ def __init__( self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - - print(f'beta_schedule = "scaled_linear" and beta_start={beta_start}, beta_end={beta_end}') - self.betas = ( torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 ) @@ -248,8 +245,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) timesteps += self.config.steps_offset - print(f'timestep_spacing = "leading" and timesteps={timesteps[:5]} ...') - elif self.config.timestep_spacing == "trailing": step_ratio = self.config.num_train_timesteps / self.num_inference_steps # creates integer timesteps by multiplying by ratio @@ -261,6 +256,8 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." ) + print(f'timestep_spacing = "leading" and timesteps={timesteps[:5]} ...') + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) print(f'sigmas before interpolation: {sigmas[:5]} ...') From a52e605c4a0e1d29c5a35344d68ba5553e94c8a6 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 2 Nov 2023 19:46:58 +0100 Subject: [PATCH 18/88] Understood time error + fixed connection error --- src/diffusers/models/controlnetxs.py | 106 +++++++++++++++--- src/diffusers/models/resnet.py | 10 +- .../pipeline_controlnet_xs_sd_xl.py | 7 +- .../pipeline_stable_diffusion_xl.py | 14 +++ .../schedulers/scheduling_euler_discrete.py | 2 +- 5 files changed, 114 insertions(+), 25 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 8c577e2c4399..b7be225ff5fd 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
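For reference, the quantities tracked by the scheduling_euler_discrete.py debug prints above can be reproduced standalone. This sketch is not part of the patch; beta_start/beta_end are the usual Stable Diffusion settings, steps_offset is omitted, and the trailing 0.0 sigma is left out, so the numbers are illustrative only.

import numpy as np
import torch

num_train_timesteps, num_inference_steps = 1000, 50
beta_start, beta_end = 0.00085, 0.012  # example values only

# "scaled_linear" beta schedule, as in the hunk above
betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0).numpy()

# sigmas before interpolation
sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5

# "leading" timestep spacing
step_ratio = num_train_timesteps // num_inference_steps
timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32)

# "linear" interpolation of sigmas onto the inference timesteps
sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
print(sigmas[:5], timesteps[:5])

Running this makes it easy to check the printed sigmas and timesteps against the reference implementation one for one.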
from dataclasses import dataclass -from typing import Optional, Union, Tuple +from typing import Any, Dict, Optional, Union, Tuple from itertools import zip_longest @@ -86,6 +86,7 @@ def get_time_emb_input_dim(unet: UNet2DConditionModel):return unet.time_embeddin control_model_ratio=0.1, base_model_channel_sizes=base_model_channel_sizes, control_scale=0.95, + addition_embed_type='text_time', ) cnxs_model.base_model = base_model return cnxs_model @@ -146,6 +147,7 @@ def __init__( }, global_pool_conditions: bool = False, # Todo Umer: Needed by SDXL pipeline, but what is this?, control_scale=1, + addition_embed_type: Optional[str] = None, ): super().__init__() @@ -161,7 +163,7 @@ def __init__( self.learn_embedding = learn_embedding # 1 - Create controller - self.control_model = ctrl_model = UNet2DConditionModel( + self.control_model = UNet2DConditionModel( block_out_channels=block_out_channels, down_block_types=down_block_types, up_block_types=up_block_types, @@ -189,10 +191,8 @@ def __init__( # 4 - Build connections between base and control model self.enc_zero_convs_out = nn.ModuleList([]) self.enc_zero_convs_in = nn.ModuleList([]) - self.middle_block_out = nn.ModuleList([]) self.middle_block_in = nn.ModuleList([]) - self.dec_zero_convs_out = nn.ModuleList([]) self.dec_zero_convs_in = nn.ModuleList([]) @@ -239,6 +239,7 @@ def __init__( self.flip_sin_to_cos = True # default params self.freq_shift = 0 # !! TODO !! : learn_embedding is True, so we need our own embedding + # Edit: That's already part of the ctrl model, even thought it's not used # Todo: Only when `learn_embedding = False` can we just use the base model's time embedding, otherwise we need to create our own # Text embedding @@ -255,7 +256,16 @@ def __init__( DEBUG_LOG_by_Umer = False DEBUG_LOG_by_Umer_file = 'debug_log.pkl' - def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch.Tensor, c: dict, hint: torch.Tensor, no_control=False, **kwargs): + def forward( + self, + x: torch.Tensor, + t: torch.Tensor, + encoder_hidden_states: torch.Tensor, + hint: torch.Tensor, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + no_control=False, + ): if self.base_model is None: raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`") @@ -275,19 +285,16 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch # return_dict: bool = True, """ - x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1) + #x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1) if x.size(0) // 2 == hint.size(0): hint = torch.cat([hint, hint], dim=0) # for classifier free guidance timesteps=t - y=c.get("vector", None) - + if no_control or self.no_control: - return self.base_model(x, timesteps, encoder_hidden_states, **kwargs) + return self.base_model(x, timesteps, encoder_hidden_states,cross_attention_kwargs=cross_attention_kwargs,added_cond_kwargs=added_cond_kwargs) # time embeddings - print("timesteps =",timesteps) timesteps = timesteps[None] - print("timesteps =",timesteps) t_emb = get_timestep_embedding( timesteps, self.model_channels, @@ -295,17 +302,47 @@ def forward(self, x: torch.Tensor, t: torch.Tensor, encoder_hidden_states: torch flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.freq_shift, ) - print(f't_emb.shape = {list(t_emb.shape)}') - print(f'learn_embedding = {self.learn_embedding}') if 
self.learn_embedding: - temb = self.control_model.time_embedding(t_emb) * self.control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.control_scale ** 0.3) - print(f't_emb.shape = {list(temb.shape)}') + temb = self.control_model.time_embedding(t_emb) * self.config.control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.control_scale ** 0.3) else: temb = self.base_model.time_embedding(t_emb) + aug_emb = None # text embeddings cemb = encoder_hidden_states + # added time & text embeddings + if self.config.addition_embed_type == "text": + raise NotImplementedError() + elif self.config.addition_embed_type == "text_image": + raise NotImplementedError() + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.base_model.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(temb.dtype) + aug_emb = self.base_model.add_embedding(add_embeds) + + elif self.config.addition_embed_type == "image": + raise NotImplementedError() + elif self.config.addition_embed_type == "image_hint": + raise NotImplementedError() + + temb = temb + aug_emb if aug_emb is not None else temb + + + ### guided_hint = self.input_hint_block(hint) h_ctrl = h_base = x @@ -353,16 +390,51 @@ def debug_save(): hs_ctrl.append(h_ctrl) # 1 - input blocks (encoder) for i, (m_base, m_ctrl) in enumerate(zip(base_down_subblocks, ctrl_down_subblocks)): - h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) + # A - concat base -> ctrl + torch.save(h_ctrl, 'enc_A1.pt') + print('A1]',h_ctrl.flatten()[:10]) + + cat_to_ctrl = next(it_enc_convs_in)(h_base) + torch.save(cat_to_ctrl, 'enc_A2.pt') + print('A2]',cat_to_ctrl.flatten()[:10]) + + h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) + torch.save(h_ctrl, 'enc_A3.pt') + print('A3]',h_ctrl.flatten()[:10]) + debug_by_umer('enc', 'h_ctr', h_ctrl) + + # B - apply base subblock h_base = m_base(h_base, temb, cemb) + torch.save(h_base, 'enc_B1.pt') + print('B1]',h_base.flatten()[:10]) + debug_by_umer('enc', 'h_base', h_base) + + # C - apply ctrl subblock h_ctrl = m_ctrl(h_ctrl, temb, cemb) + torch.save(h_ctrl, 'enc_C1.pt') + print('C1]',h_ctrl.flatten()[:10]) + debug_by_umer('enc', 'h_ctrl', h_ctrl) - h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) + + # D - add ctrl -> base + add_to_base = next(it_enc_convs_out)(h_ctrl) + torch.save(add_to_base, 'enc_D1.pt') + print('D1]',add_to_base.flatten()[:10]) + + scale = next(scales) + torch.save(scale, 'enc_D2.pt') + print('D2]',scale.flatten()[:10]) + + h_base = h_base + add_to_base * scale + torch.save(h_base, 'enc_D3.pt') + print('D3]',h_base.flatten()[:10]) + debug_by_umer('enc', 'h_base', h_base) hs_base.append(h_base) hs_ctrl.append(h_ctrl) + raise ValueError("Alright captain, do your analysis") h_ctrl = torch.concat([h_ctrl, h_base], dim=1) 
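The enc_A*/enc_B*/enc_C*/enc_D* tensors saved above come from the per-subblock exchange between the base and control streams. Below is a minimal standalone sketch of that pattern, not the actual module: the channel sizes, the fixed control_scale, and the zero initialization of the 1x1 connection convolutions are illustrative assumptions.

import torch
import torch.nn as nn

def zero_conv(in_ch, out_ch):
    # 1x1 connection conv, zero-initialized so it contributes nothing at first
    conv = nn.Conv2d(in_ch, out_ch, kernel_size=1)
    nn.init.zeros_(conv.weight)
    nn.init.zeros_(conv.bias)
    return conv

base_ch, ctrl_ch, control_scale = 320, 32, 0.95        # example sizes only
conv_in = zero_conv(base_ch, base_ch)                  # base -> ctrl connection ("A")
conv_out = zero_conv(ctrl_ch + base_ch, base_ch)       # ctrl -> base connection ("D")

h_base = torch.randn(1, base_ch, 64, 64)
h_ctrl = torch.randn(1, ctrl_ch, 64, 64)

h_ctrl = torch.cat([h_ctrl, conv_in(h_base)], dim=1)   # A - concat base -> ctrl
# B/C - in the real model the base and ctrl subblocks run here and change
#       the channel counts of both streams
h_base = h_base + conv_out(h_ctrl) * control_scale     # D - add ctrl -> base

If the connections really are zero-initialized, the control stream leaves the base activations untouched at the start of training, which is the usual reason for this kind of wiring.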
debug_by_umer('enc', 'h_ctrl', h_ctrl) # 2 - mid blocks (bottleneck) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index d2f935dfe926..833d6794bd44 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -682,15 +682,17 @@ def __init__( in_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0, bias=conv_shortcut_bias ) - def forward(self, input_tensor, temb, scale: float = 1.0): + @classmethod + def toggle_DO_UMER_CACHE(cls, b): cls.DO_UMER_CACHE = b - DO_UMER_CACHE = False + DO_UMER_CACHE = False + def forward(self, input_tensor, temb, scale: float = 1.0): UMER_DEBUG_CACHE = [] umer_cache_i = 0 def append_to_umer_cache(msg,obj,comment): nonlocal umer_cache_i - if not DO_UMER_CACHE: return + if not self.DO_UMER_CACHE: return if hasattr(obj,'cpu'): obj = obj.cpu() UMER_DEBUG_CACHE.append((umer_cache_i, msg,comment,obj)) umer_cache_i += 1 @@ -784,7 +786,7 @@ def append_to_umer_cache(msg,obj,comment): output_tensor = (input_tensor + hidden_states) / self.output_scale_factor append_to_umer_cache('hidden_states', output_tensor, 'after skip + scale') - if DO_UMER_CACHE: + if self.DO_UMER_CACHE: import pickle with open('intermediate_output/local_resnet.pkl','wb') as f: pickle.dump(UMER_DEBUG_CACHE, f) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 6a0745fd8667..244b32c25549 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -737,6 +737,8 @@ def __call__( add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + print('add_time_ids =', add_time_ids) + # 8. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -752,11 +754,10 @@ def __call__( x=latent_model_input, t=t, encoder_hidden_states=prompt_embeds, - c={}, hint=image, # todo: better naming - #cross_attention_kwargs=cross_attention_kwargs, - #return_dict=False, + cross_attention_kwargs=cross_attention_kwargs, added_cond_kwargs=added_cond_kwargs, + #return_dict=False, ).sample # perform guidance diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 2658b58de240..c1a3bc7b9a96 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -912,6 +912,10 @@ def __call__( num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) timesteps = timesteps[:num_inference_steps] + # todo: delete + print(f"add_time_ids.shape = {add_time_ids.shape}") + print(f"add_text_embeds.shape = {add_text_embeds.shape}") + with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance @@ -921,6 +925,16 @@ def __call__( # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + print(f'latents.shape={list(latent_model_input.shape)} | ', end='') + print(f't={t} | ', end='') + print(f'enc_h.shape={list(prompt_embeds.shape)} | ', end='') + if cross_attention_kwargs is not None: + print(f'cross_attn_kw.keys={list(cross_attention_kwargs.keys())} | ', end='') + else: + print(f'cross_attn_kw is None | ', end='') + print(f'added_cond_kw.keys={list(added_cond_kwargs.keys())}') + noise_pred = self.unet( latent_model_input, t, diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 91d12e26d887..3351516a995d 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -169,7 +169,7 @@ def __init__( sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = torch.from_numpy(sigmas) - print(f'At the end of __init__, the sigmas are {self.sigmas[:5]} ...') + #print(f'At the end of __init__, the sigmas are {self.sigmas[:5]} ...') # setable values self.num_inference_steps = None From b408dbc608c685a0a117285a436cc6fd9928bdd8 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 2 Nov 2023 21:07:46 +0100 Subject: [PATCH 19/88] checkin --- src/diffusers/models/controlnetxs.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index b7be225ff5fd..444126407eb8 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -391,50 +391,22 @@ def debug_save(): # 1 - input blocks (encoder) for i, (m_base, m_ctrl) in enumerate(zip(base_down_subblocks, ctrl_down_subblocks)): # A - concat base -> ctrl - torch.save(h_ctrl, 'enc_A1.pt') - print('A1]',h_ctrl.flatten()[:10]) - cat_to_ctrl = next(it_enc_convs_in)(h_base) - torch.save(cat_to_ctrl, 'enc_A2.pt') - print('A2]',cat_to_ctrl.flatten()[:10]) - h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) - torch.save(h_ctrl, 'enc_A3.pt') - 
print('A3]',h_ctrl.flatten()[:10]) - debug_by_umer('enc', 'h_ctr', h_ctrl) - # B - apply base subblock h_base = m_base(h_base, temb, cemb) - torch.save(h_base, 'enc_B1.pt') - print('B1]',h_base.flatten()[:10]) - debug_by_umer('enc', 'h_base', h_base) - # C - apply ctrl subblock h_ctrl = m_ctrl(h_ctrl, temb, cemb) - torch.save(h_ctrl, 'enc_C1.pt') - print('C1]',h_ctrl.flatten()[:10]) - debug_by_umer('enc', 'h_ctrl', h_ctrl) - # D - add ctrl -> base add_to_base = next(it_enc_convs_out)(h_ctrl) - torch.save(add_to_base, 'enc_D1.pt') - print('D1]',add_to_base.flatten()[:10]) - scale = next(scales) - torch.save(scale, 'enc_D2.pt') - print('D2]',scale.flatten()[:10]) - h_base = h_base + add_to_base * scale - torch.save(h_base, 'enc_D3.pt') - print('D3]',h_base.flatten()[:10]) - debug_by_umer('enc', 'h_base', h_base) hs_base.append(h_base) hs_ctrl.append(h_ctrl) - raise ValueError("Alright captain, do your analysis") h_ctrl = torch.concat([h_ctrl, h_base], dim=1) debug_by_umer('enc', 'h_ctrl', h_ctrl) # 2 - mid blocks (bottleneck) From e0ad61bd4b98e626e340fcd4a1bd5aa616e6acca Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 6 Nov 2023 16:31:11 +0100 Subject: [PATCH 20/88] checkin 231106T1600 --- src/diffusers/models/attention.py | 17 +++++- src/diffusers/models/controlnetxs.py | 61 +++++++++++++++---- src/diffusers/models/resnet.py | 48 ++------------- src/diffusers/models/transformer_2d.py | 13 +++- .../pipeline_controlnet_xs_sd_xl.py | 8 --- .../schedulers/scheduling_euler_discrete.py | 8 --- 6 files changed, 82 insertions(+), 73 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 47608005d374..e7acb9020321 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -197,6 +197,9 @@ def forward( cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, ) -> torch.FloatTensor: + + UMER_DEBUG_CACHE = [] + # Notice that normalization is always applied before the real computation in the following blocks. # 0. Self-Attention if self.use_ada_layer_norm: @@ -221,9 +224,12 @@ def forward( attention_mask=attention_mask, **cross_attention_kwargs, ) + UMER_DEBUG_CACHE.append(('attn1', attn_output)) + if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output hidden_states = attn_output + hidden_states + UMER_DEBUG_CACHE.append(('add attn1', hidden_states)) # 2.5 GLIGEN Control if gligen_kwargs is not None: @@ -235,7 +241,10 @@ def forward( norm_hidden_states = ( self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) ) - + UMER_DEBUG_CACHE.append(('norm2', norm_hidden_states)) + UMER_DEBUG_CACHE.append(('context', encoder_hidden_states)) + if encoder_attention_mask is not None: print('encoder_attention_mask is not None. Shape = '+str(list(encoder_attention_mask.shape)+'\tvals = '+str(encoder_attention_mask.flatten[:10]))) + if cross_attention_kwargs is not None: print('cross_attention_kwargs is not None. Keys = '+str(cross_attention_kwargs.keys())) attn_output = self.attn2( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, @@ -243,6 +252,8 @@ def forward( **cross_attention_kwargs, ) hidden_states = attn_output + hidden_states + UMER_DEBUG_CACHE.append(('attn2', attn_output)) + UMER_DEBUG_CACHE.append(('add attn2', hidden_states)) # 4. 
Feed-forward norm_hidden_states = self.norm3(hidden_states) @@ -272,8 +283,10 @@ def forward( ff_output = gate_mlp.unsqueeze(1) * ff_output hidden_states = ff_output + hidden_states + UMER_DEBUG_CACHE.append(('ff', ff_output)) + UMER_DEBUG_CACHE.append(('add ff', hidden_states)) - return hidden_states + return hidden_states, UMER_DEBUG_CACHE class FeedForward(nn.Module): diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 444126407eb8..0b15429aa4ff 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -256,6 +256,7 @@ def __init__( DEBUG_LOG_by_Umer = False DEBUG_LOG_by_Umer_file = 'debug_log.pkl' + DETAILLED_DEBUG_LOG_by_Umer = False def forward( self, x: torch.Tensor, @@ -357,6 +358,7 @@ def forward( base_up_subblocks = to_sub_blocks(self.base_model.up_blocks) # Debug Umer -- to delete later on + # this is for a global view, ie on subblock level debug_log = [] def debug_by_umer(stage, msg, obj): if not self.DEBUG_LOG_by_Umer: return @@ -375,6 +377,10 @@ def debug_save(): debug_by_umer('prep', 'raw hint', hint) debug_by_umer('prep', 'guided_hint', guided_hint) + # Debug Umer - another one! + # this is for a detail view, ie below subblock level + more_detailled_debug_log = [] + # Cross Control # 0 - conv in h_base = self.base_model.conv_in(h_base) @@ -389,17 +395,22 @@ def debug_save(): hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 1 - input blocks (encoder) + print('------ enc ------') for i, (m_base, m_ctrl) in enumerate(zip(base_down_subblocks, ctrl_down_subblocks)): # A - concat base -> ctrl cat_to_ctrl = next(it_enc_convs_in)(h_base) h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) debug_by_umer('enc', 'h_ctr', h_ctrl) # B - apply base subblock - h_base = m_base(h_base, temb, cemb) + print('>> Applying base block\t', end='') + h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) debug_by_umer('enc', 'h_base', h_base) # C - apply ctrl subblock - h_ctrl = m_ctrl(h_ctrl, temb, cemb) + print('>> Applying ctrl block\t', end='') + h_ctrl, another_debug_cache = m_ctrl(h_ctrl, temb, cemb) debug_by_umer('enc', 'h_ctrl', h_ctrl) + more_detailled_debug_log += another_debug_cache # We only record details for the application of ctrl blocks + print() # D - add ctrl -> base add_to_base = next(it_enc_convs_out)(h_ctrl) scale = next(scales) @@ -410,9 +421,14 @@ def debug_save(): h_ctrl = torch.concat([h_ctrl, h_base], dim=1) debug_by_umer('enc', 'h_ctrl', h_ctrl) # 2 - mid blocks (bottleneck) + print('------ mid ------') for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - h_base = m_base(h_base, temb, cemb) - h_ctrl = m_ctrl(h_ctrl, temb, cemb) + print('>> Applying base block\t', end='') + h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) + print('>> Applying ctrl block\t', end='') + h_ctrl, another_debug_cache = m_ctrl(h_ctrl, temb, cemb) + more_detailled_debug_log += another_debug_cache # We only record details for the application of ctrl blocks + print() # Heidelberg treats the R/A/R as one block, while I treat is as 2 subblocks # Let's therefore only log after the mid section debug_by_umer('mid', 'h_base', h_base) @@ -422,15 +438,24 @@ def debug_save(): debug_by_umer('mid', 'h_base', h_base) # 3 - output blocks (decoder) + print('------ dec ------') for m_base in base_up_subblocks: h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder debug_by_umer('dec', 'h_base', h_base) h_base = 
torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder debug_by_umer('dec', 'h_base', h_base) - h_base = m_base(h_base, temb, cemb) + print('>> Applying base block\t', end='') + h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) debug_by_umer('dec', 'h_base', h_base) + print() debug_save() + if self.DETAILLED_DEBUG_LOG_by_Umer: + more_detailled_debug_log = [(txt, t.cpu().detach()) for txt,t in more_detailled_debug_log] + import pickle + pickle.dump(more_detailled_debug_log, open('intermediate_output/detailled_debug_log.pkl', 'wb')) + print('Alright captain. Look at all these tensors we caught. Time to do some real analysis.') + raise ValueError('stop right here') return UNet2DConditionOutput(sample=self.base_model.conv_out(h_base)) @@ -538,13 +563,27 @@ def __init__(self,ms,*args,**kwargs): super().__init__(ms,*args,**kwargs) def forward(self,x,temb,cemb): + def cls_name(x): return str(type(x)).split('.')[-1].replace("'>",'') + content = ' '.join(cls_name(m) for m in self) + print(f'EmbedSequential.forward with content {content}') + UMER_DEBUG_CACHE = [] for m in self: - if isinstance(m,ResnetBlock2D): x=m(x,temb) - elif isinstance(m,Transformer2DModel): x=m(x,cemb).sample # Q: Include temp also? - elif isinstance(m,Downsample2D): x=m(x) - elif isinstance(m,Upsample2D): x=m(x) - else: raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D`, `Upsample2D`') - return x + if isinstance(m,ResnetBlock2D): + x, debug_cache = m(x,temb) + UMER_DEBUG_CACHE += debug_cache + elif isinstance(m,Transformer2DModel): + result = m(x,cemb) + x = result.sample + UMER_DEBUG_CACHE += result.debug_cache + elif isinstance(m,Downsample2D): + x = m(x) + UMER_DEBUG_CACHE += [('conv',x)] # Downsample2D only has 1 operation, so {intermediate results} = {final result} + elif isinstance(m,Upsample2D): + x = m(x) + UMER_DEBUG_CACHE += [('conv',x)] # Upsample2D only has 1 operation, so {intermediate results} = {final result} + else: raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`') + + return x, UMER_DEBUG_CACHE def is_iterable(o): diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 833d6794bd44..df62a89b5569 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -682,36 +682,18 @@ def __init__( in_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0, bias=conv_shortcut_bias ) - @classmethod - def toggle_DO_UMER_CACHE(cls, b): cls.DO_UMER_CACHE = b - - DO_UMER_CACHE = False def forward(self, input_tensor, temb, scale: float = 1.0): UMER_DEBUG_CACHE = [] - umer_cache_i = 0 - def append_to_umer_cache(msg,obj,comment): - nonlocal umer_cache_i - if not self.DO_UMER_CACHE: return - if hasattr(obj,'cpu'): obj = obj.cpu() - UMER_DEBUG_CACHE.append((umer_cache_i, msg,comment,obj)) - umer_cache_i += 1 - hidden_states = input_tensor - append_to_umer_cache('hidden_states', hidden_states, 'start') - append_to_umer_cache('temb', temb, 'start') - append_to_umer_cache('scale', scale, 'start') - if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm1(hidden_states, temb) else: hidden_states = self.norm1(hidden_states) - append_to_umer_cache('hidden_states', hidden_states, 'after norm1') hidden_states = self.nonlinearity(hidden_states) - append_to_umer_cache('hidden_states', hidden_states, 'after silu') if 
self.upsample is not None: # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984 @@ -741,42 +723,34 @@ def append_to_umer_cache(msg,obj,comment): ) hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states) - append_to_umer_cache('hidden_states', hidden_states, 'after conv1') + UMER_DEBUG_CACHE.append(('conv1', hidden_states)) if self.time_emb_proj is not None: if not self.skip_time_act: temb = self.nonlinearity(temb) - append_to_umer_cache('temb', temb, 'after silu') temb = ( self.time_emb_proj(temb, scale)[:, :, None, None] if not USE_PEFT_BACKEND else self.time_emb_proj(temb)[:, :, None, None] - ) - append_to_umer_cache('temb', temb, 'after linear') - + ) if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - append_to_umer_cache('hidden_states', hidden_states, 'after time add') - + UMER_DEBUG_CACHE.append(('add time_emb_proj', hidden_states)) if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) - append_to_umer_cache('hidden_states', hidden_states, 'after norm2') if temb is not None and self.time_embedding_norm == "scale_shift": scale, shift = torch.chunk(temb, 2, dim=1) hidden_states = hidden_states * (1 + scale) + shift hidden_states = self.nonlinearity(hidden_states) - append_to_umer_cache('hidden_states', hidden_states, 'after silu') - hidden_states = self.dropout(hidden_states) - append_to_umer_cache('hidden_states', hidden_states, 'after dropout') hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) - append_to_umer_cache('hidden_states', hidden_states, 'after conv2') + UMER_DEBUG_CACHE.append(('conv2', hidden_states)) if self.conv_shortcut is not None: input_tensor = ( @@ -784,19 +758,9 @@ def append_to_umer_cache(msg,obj,comment): ) output_tensor = (input_tensor + hidden_states) / self.output_scale_factor - append_to_umer_cache('hidden_states', output_tensor, 'after skip + scale') - - if self.DO_UMER_CACHE: - import pickle - with open('intermediate_output/local_resnet.pkl','wb') as f: - pickle.dump(UMER_DEBUG_CACHE, f) - - msg = "End of 1st ResNet reached." - msg += "\nLet's analyze the intermediate steps, my man. Don't be intimidated, you can do it. Believe in the you that believes in yourself." - msg += "\n\nBtw, results are saved to file 'intermediate_output/local_resnet.pkl'." - raise ValueError(msg) + UMER_DEBUG_CACHE.append(('add conv_shortcut', output_tensor)) - return output_tensor + return output_tensor, UMER_DEBUG_CACHE # unet_rl.py diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 0f00932f3014..47af5963dfad 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -288,6 +288,8 @@ def forward( # Retrieve lora scale. lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + UMER_DEBUG_CACHE = [] + # 1. Input if self.is_input_continuous: batch, _, height, width = hidden_states.shape @@ -316,6 +318,8 @@ def forward( elif self.is_input_patches: hidden_states = self.pos_embed(hidden_states) + UMER_DEBUG_CACHE.append(('proj_in', hidden_states)) + # 2. 
Blocks for block in self.transformer_blocks: if self.training and self.gradient_checkpointing: @@ -331,7 +335,7 @@ def forward( use_reentrant=False, ) else: - hidden_states = block( + hidden_states, debug_cache_from_attention_block = block( hidden_states, attention_mask=attention_mask, encoder_hidden_states=encoder_hidden_states, @@ -340,6 +344,7 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, class_labels=class_labels, ) + UMER_DEBUG_CACHE += debug_cache_from_attention_block # 3. Output if self.is_input_continuous: @@ -386,7 +391,11 @@ def forward( shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) ) + UMER_DEBUG_CACHE.append(('proj_out', output)) + if not return_dict: return (output,) - return Transformer2DModelOutput(sample=output) + result = Transformer2DModelOutput(sample=output) + result.debug_cache = UMER_DEBUG_CACHE + return result#Transformer2DModelOutput(sample=output) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 244b32c25549..100f921525f1 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -673,9 +673,6 @@ def __call__( timesteps = self.scheduler.timesteps # 6. Prepare latent variables - if latents is not None: print("Passed in latents: ", latents.flatten()[:5]) - else: print("No latents passed in") - num_channels_latents = self.unet.config.in_channels latents, initial_unscaled_latents = self.prepare_latents( batch_size * num_images_per_prompt, @@ -687,13 +684,10 @@ def __call__( generator, latents, ) - print("initial_unscaled_latents: ", initial_unscaled_latents.flatten()[:5]) - print("latents: ", latents.flatten()[:5]) # # DEBUG if callback is not None: callback(-1, -1, initial_unscaled_latents) # # - # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -737,8 +731,6 @@ def __call__( add_text_embeds = add_text_embeds.to(device) add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) - print('add_time_ids =', add_time_ids) - # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 3351516a995d..e7f333bc62dc 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -256,11 +256,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
) - print(f'timestep_spacing = "leading" and timesteps={timesteps[:5]} ...') - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) - print(f'sigmas before interpolation: {sigmas[:5]} ...') - log_sigmas = np.log(sigmas) if self.config.interpolation_type == "linear": @@ -284,10 +280,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.timesteps = torch.from_numpy(timesteps).to(device=device) self._step_index = None - print(f'At end of `set_timesteps`:') - print(f'sigmas = {self.sigmas[:5]} ...') - print(f'timesteps = {self.timesteps[:5]} ...') - def _sigma_to_t(self, sigma, log_sigmas): # get log sigma log_sigma = np.log(sigma) From d76881af7ab1423e7f61e0b2faa3dd44ef5185c2 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 6 Nov 2023 16:45:01 +0100 Subject: [PATCH 21/88] turned off detailled debug prints --- src/diffusers/models/attention.py | 3 ++- src/diffusers/models/controlnetxs.py | 20 +++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index e7acb9020321..8b52aef3597f 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -244,7 +244,8 @@ def forward( UMER_DEBUG_CACHE.append(('norm2', norm_hidden_states)) UMER_DEBUG_CACHE.append(('context', encoder_hidden_states)) if encoder_attention_mask is not None: print('encoder_attention_mask is not None. Shape = '+str(list(encoder_attention_mask.shape)+'\tvals = '+str(encoder_attention_mask.flatten[:10]))) - if cross_attention_kwargs is not None: print('cross_attention_kwargs is not None. Keys = '+str(cross_attention_kwargs.keys())) + if cross_attention_kwargs is not None: + if len(cross_attention_kwargs.keys()) > 0: print('cross_attention_kwargs is not None. 
Keys = '+str(cross_attention_kwargs.keys())) attn_output = self.attn2( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 0b15429aa4ff..be9789223943 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -381,6 +381,8 @@ def debug_save(): # this is for a detail view, ie below subblock level more_detailled_debug_log = [] + any_debug = self.DEBUG_LOG_by_Umer or self.DETAILLED_DEBUG_LOG_by_Umer + # Cross Control # 0 - conv in h_base = self.base_model.conv_in(h_base) @@ -395,18 +397,18 @@ def debug_save(): hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 1 - input blocks (encoder) - print('------ enc ------') + if any_debug: print('------ enc ------') for i, (m_base, m_ctrl) in enumerate(zip(base_down_subblocks, ctrl_down_subblocks)): # A - concat base -> ctrl cat_to_ctrl = next(it_enc_convs_in)(h_base) h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) debug_by_umer('enc', 'h_ctr', h_ctrl) # B - apply base subblock - print('>> Applying base block\t', end='') + if any_debug: print('>> Applying base block\t', end='') h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) debug_by_umer('enc', 'h_base', h_base) # C - apply ctrl subblock - print('>> Applying ctrl block\t', end='') + if any_debug: print('>> Applying ctrl block\t', end='') h_ctrl, another_debug_cache = m_ctrl(h_ctrl, temb, cemb) debug_by_umer('enc', 'h_ctrl', h_ctrl) more_detailled_debug_log += another_debug_cache # We only record details for the application of ctrl blocks @@ -421,14 +423,14 @@ def debug_save(): h_ctrl = torch.concat([h_ctrl, h_base], dim=1) debug_by_umer('enc', 'h_ctrl', h_ctrl) # 2 - mid blocks (bottleneck) - print('------ mid ------') + if any_debug: print('------ mid ------') for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - print('>> Applying base block\t', end='') + if any_debug: print('>> Applying base block\t', end='') h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) - print('>> Applying ctrl block\t', end='') + if any_debug: print('>> Applying ctrl block\t', end='') h_ctrl, another_debug_cache = m_ctrl(h_ctrl, temb, cemb) more_detailled_debug_log += another_debug_cache # We only record details for the application of ctrl blocks - print() + if any_debug: print() # Heidelberg treats the R/A/R as one block, while I treat is as 2 subblocks # Let's therefore only log after the mid section debug_by_umer('mid', 'h_base', h_base) @@ -438,13 +440,13 @@ def debug_save(): debug_by_umer('mid', 'h_base', h_base) # 3 - output blocks (decoder) - print('------ dec ------') + if any_debug: print('------ dec ------') for m_base in base_up_subblocks: h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder debug_by_umer('dec', 'h_base', h_base) h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder debug_by_umer('dec', 'h_base', h_base) - print('>> Applying base block\t', end='') + if any_debug: print('>> Applying base block\t', end='') h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) debug_by_umer('dec', 'h_base', h_base) print() From 202d3def0637e944046f4aa4c52a6eda5829ba53 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 7 Nov 2023 18:57:07 +0100 Subject: [PATCH 22/88] time debug logs --- src/diffusers/models/controlnetxs.py | 23 
+++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index be9789223943..c8506a26326b 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -257,6 +257,7 @@ def __init__( DEBUG_LOG_by_Umer = False DEBUG_LOG_by_Umer_file = 'debug_log.pkl' DETAILLED_DEBUG_LOG_by_Umer = False + TIME_DEBUG_LOG_by_Umer = False def forward( self, x: torch.Tensor, @@ -267,6 +268,12 @@ def forward( added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, no_control=False, ): + def time_debug_log(txt,t): + if not hasattr(t,'shape'): t = torch.tensor(t) + t = t.cpu().detach() + print(f'{txt:<20}{t.flatten()[:10]}') + torch.save(t,'time__'+txt+'.pt') + if self.base_model is None: raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`") @@ -296,6 +303,7 @@ def forward( # time embeddings timesteps = timesteps[None] + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('timestep',timesteps) t_emb = get_timestep_embedding( timesteps, self.model_channels, @@ -303,10 +311,13 @@ def forward( flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.freq_shift, ) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_emb',t_emb) if self.learn_embedding: temb = self.control_model.time_embedding(t_emb) * self.config.control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.control_scale ** 0.3) else: temb = self.base_model.time_embedding(t_emb) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj',temb) + aug_emb = None # text embeddings @@ -324,16 +335,20 @@ def forward( f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" ) text_embeds = added_cond_kwargs.get("text_embeds") + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('text_embeds',text_embeds) if "time_ids" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" ) time_ids = added_cond_kwargs.get("time_ids") + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_input',time_ids.flatten()) time_embeds = self.base_model.add_time_proj(time_ids.flatten()) time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_emb',time_ids.flatten()) add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) add_embeds = add_embeds.to(temb.dtype) aug_emb = self.base_model.add_embedding(add_embeds) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_proj',aug_emb) elif self.config.addition_embed_type == "image": raise NotImplementedError() @@ -341,7 +356,11 @@ def forward( raise NotImplementedError() temb = temb + aug_emb if aug_emb is not None else temb - + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('final_temb',temb) + + if self.TIME_DEBUG_LOG_by_Umer: + print('Time to analyze time!') + raise ValueError('Time to analyze time!') ### guided_hint = self.input_hint_block(hint) @@ -449,7 +468,7 @@ def debug_save(): if any_debug: print('>> Applying base block\t', end='') h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) debug_by_umer('dec', 'h_base', h_base) - print() + if any_debug: print() debug_save() if self.DETAILLED_DEBUG_LOG_by_Umer: From 
8996cf4c96c4592d17264fa198001413030cee49 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 7 Nov 2023 19:02:44 +0100 Subject: [PATCH 23/88] small fix --- src/diffusers/models/controlnetxs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index c8506a26326b..ee2848a58b1e 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -344,7 +344,7 @@ def time_debug_log(txt,t): if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_input',time_ids.flatten()) time_embeds = self.base_model.add_time_proj(time_ids.flatten()) time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_emb',time_ids.flatten()) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_emb',time_embeds) add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) add_embeds = add_embeds.to(temb.dtype) aug_emb = self.base_model.add_embedding(add_embeds) From f54ac820817849f2d26d769bfa0efa29f6a7e153 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Wed, 8 Nov 2023 12:05:46 +0100 Subject: [PATCH 24/88] Separated control_scale for connections/time --- src/diffusers/models/controlnetxs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index ee2848a58b1e..89746e0c1fb8 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -147,6 +147,7 @@ def __init__( }, global_pool_conditions: bool = False, # Todo Umer: Needed by SDXL pipeline, but what is this?, control_scale=1, + time_control_scale=1, addition_embed_type: Optional[str] = None, ): super().__init__() @@ -313,7 +314,11 @@ def time_debug_log(txt,t): ) if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_emb',t_emb) if self.learn_embedding: - temb = self.control_model.time_embedding(t_emb) * self.config.control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.control_scale ** 0.3) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj_ctrl',self.control_model.time_embedding(t_emb) ) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj_ctrl_scaled',self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj_base',self.base_model.time_embedding(t_emb)) + if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj_base_scaled',self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3)) + temb = self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3) else: temb = self.base_model.time_embedding(t_emb) if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj',temb) From 7654c3268142e0cee7047a2671eb83727dfc345a Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 9 Nov 2023 21:25:21 +0100 Subject: [PATCH 25/88] simplified debug logging --- src/diffusers/models/attention.py | 22 ++-- src/diffusers/models/controlnetxs.py | 163 ++++++++++--------------- src/diffusers/models/resnet.py | 18 +-- src/diffusers/models/transformer_2d.py | 14 +-- src/diffusers/umer_debug_logger.py | 106 ++++++++++++++++ 5 files changed, 195 insertions(+), 128 deletions(-) create mode 100644 src/diffusers/umer_debug_logger.py diff 
--git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 8b52aef3597f..773a3fc38cca 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -24,6 +24,7 @@ from .embeddings import CombinedTimestepLabelEmbeddings from .lora import LoRACompatibleLinear +from ..umer_debug_logger import udl @maybe_allow_in_graph class GatedSelfAttentionDense(nn.Module): @@ -197,9 +198,6 @@ def forward( cross_attention_kwargs: Dict[str, Any] = None, class_labels: Optional[torch.LongTensor] = None, ) -> torch.FloatTensor: - - UMER_DEBUG_CACHE = [] - # Notice that normalization is always applied before the real computation in the following blocks. # 0. Self-Attention if self.use_ada_layer_norm: @@ -224,12 +222,12 @@ def forward( attention_mask=attention_mask, **cross_attention_kwargs, ) - UMER_DEBUG_CACHE.append(('attn1', attn_output)) + udl.log_if('attn1', attn_output, 'SUBBLOCK-MINUS-1') if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output hidden_states = attn_output + hidden_states - UMER_DEBUG_CACHE.append(('add attn1', hidden_states)) + udl.log_if('add attn1', hidden_states, 'SUBBLOCK-MINUS-1') # 2.5 GLIGEN Control if gligen_kwargs is not None: @@ -241,8 +239,8 @@ def forward( norm_hidden_states = ( self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) ) - UMER_DEBUG_CACHE.append(('norm2', norm_hidden_states)) - UMER_DEBUG_CACHE.append(('context', encoder_hidden_states)) + udl.log_if('norm2', norm_hidden_states, 'SUBBLOCK-MINUS-1') + udl.log_if('context', encoder_hidden_states, 'SUBBLOCK-MINUS-1') if encoder_attention_mask is not None: print('encoder_attention_mask is not None. Shape = '+str(list(encoder_attention_mask.shape)+'\tvals = '+str(encoder_attention_mask.flatten[:10]))) if cross_attention_kwargs is not None: if len(cross_attention_kwargs.keys()) > 0: print('cross_attention_kwargs is not None. Keys = '+str(cross_attention_kwargs.keys())) @@ -253,8 +251,8 @@ def forward( **cross_attention_kwargs, ) hidden_states = attn_output + hidden_states - UMER_DEBUG_CACHE.append(('attn2', attn_output)) - UMER_DEBUG_CACHE.append(('add attn2', hidden_states)) + udl.log_if('attn2', attn_output, 'SUBBLOCK-MINUS-1') + udl.log_if('add attn2', hidden_states, 'SUBBLOCK-MINUS-1') # 4. 
Feed-forward norm_hidden_states = self.norm3(hidden_states) @@ -284,10 +282,10 @@ def forward( ff_output = gate_mlp.unsqueeze(1) * ff_output hidden_states = ff_output + hidden_states - UMER_DEBUG_CACHE.append(('ff', ff_output)) - UMER_DEBUG_CACHE.append(('add ff', hidden_states)) + udl.log_if('ff', ff_output, 'SUBBLOCK-MINUS-1') + udl.log_if('add ff', hidden_states, 'SUBBLOCK-MINUS-1') - return hidden_states, UMER_DEBUG_CACHE + return hidden_states class FeedForward(nn.Module): diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 89746e0c1fb8..1201ec9b32b6 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -38,6 +38,7 @@ Upsample2D, ) from .unet_2d_condition import UNet2DConditionModel +from ..umer_debug_logger import udl logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -255,10 +256,6 @@ def __init__( del self.control_model.conv_norm_out del self.control_model.conv_out - DEBUG_LOG_by_Umer = False - DEBUG_LOG_by_Umer_file = 'debug_log.pkl' - DETAILLED_DEBUG_LOG_by_Umer = False - TIME_DEBUG_LOG_by_Umer = False def forward( self, x: torch.Tensor, @@ -269,12 +266,6 @@ def forward( added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, no_control=False, ): - def time_debug_log(txt,t): - if not hasattr(t,'shape'): t = torch.tensor(t) - t = t.cpu().detach() - print(f'{txt:<20}{t.flatten()[:10]}') - torch.save(t,'time__'+txt+'.pt') - if self.base_model is None: raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`") @@ -304,7 +295,8 @@ def time_debug_log(txt,t): # time embeddings timesteps = timesteps[None] - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('timestep',timesteps) + + udl.log_if('timestep', timesteps, condition='TIME', print_=True) t_emb = get_timestep_embedding( timesteps, self.model_channels, @@ -312,16 +304,16 @@ def time_debug_log(txt,t): flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.freq_shift, ) - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_emb',t_emb) + udl.log_if('time_emb', t_emb, condition='TIME', print_=True) if self.learn_embedding: - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj_ctrl',self.control_model.time_embedding(t_emb) ) - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj_ctrl_scaled',self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3) - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj_base',self.base_model.time_embedding(t_emb)) - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj_base_scaled',self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3)) + udl.log_if('time_proj_ctrl',self.control_model.time_embedding(t_emb), condition='TIME', print_=True) + udl.log_if('time_proj_ctrl_scaled',self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3, condition='TIME', print_=True) + udl.log_if('time_proj_base',self.base_model.time_embedding(t_emb), condition='TIME', print_=True) + udl.log_if('time_proj_base_scaled',self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3), condition='TIME', print_=True) temb = self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3) else: temb = self.base_model.time_embedding(t_emb) - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('time_proj',temb) + 
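The `udl` object used throughout this patch comes from the new src/diffusers/umer_debug_logger.py, which is created here but not shown in this part of the series. What follows is a minimal sketch of a logger consistent with how it is called (log_if / print_if / stop_if, gated by a set of active condition names); the actual module may well differ.

import torch

class UmerDebugLogger:
    def __init__(self):
        self.active_conditions = set()  # e.g. {"TIME", "SUBBLOCK", "SUBBLOCK-MINUS-1"}
        self.cache = []

    def _active(self, conditions):
        if isinstance(conditions, str):
            conditions = (conditions,)
        return any(c in self.active_conditions for c in conditions)

    def log_if(self, name, obj, condition, print_=False):
        if not self._active(condition):
            return
        if torch.is_tensor(obj):
            obj = obj.detach().cpu()
        self.cache.append((len(self.cache), name, obj))
        if print_:
            head = obj.flatten()[:10] if torch.is_tensor(obj) else obj
            print(f"{name:<20}{head}")

    def print_if(self, msg, conditions, end="\n"):
        if self._active(conditions):
            print(msg, end=end)

    def stop_if(self, condition, funny_msg):
        if self._active(condition):
            raise ValueError(funny_msg)

udl = UmerDebugLogger()

Keeping the gating inside the logger keeps every call site to a single line and avoids threading debug flags through each forward() signature, which is what the earlier DEBUG_LOG_by_Umer class attributes required.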
udl.log_if('time_proj',temb, condition='TIME', print_=True) aug_emb = None @@ -340,20 +332,20 @@ def time_debug_log(txt,t): f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" ) text_embeds = added_cond_kwargs.get("text_embeds") - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('text_embeds',text_embeds) + udl.log_if('text_embeds',text_embeds, condition='TIME', print_=True) if "time_ids" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" ) time_ids = added_cond_kwargs.get("time_ids") - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_input',time_ids.flatten()) + udl.log_if('add_input',time_ids.flatten(), condition='TIME', print_=True) time_embeds = self.base_model.add_time_proj(time_ids.flatten()) time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_emb',time_embeds) + udl.log_if('add_emb',time_embeds, condition='TIME', print_=True) add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) add_embeds = add_embeds.to(temb.dtype) aug_emb = self.base_model.add_embedding(add_embeds) - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('add_proj',aug_emb) + udl.log_if('add_proj',aug_emb, condition='TIME', print_=True) elif self.config.addition_embed_type == "image": raise NotImplementedError() @@ -361,11 +353,8 @@ def time_debug_log(txt,t): raise NotImplementedError() temb = temb + aug_emb if aug_emb is not None else temb - if self.TIME_DEBUG_LOG_by_Umer: time_debug_log('final_temb',temb) - - if self.TIME_DEBUG_LOG_by_Umer: - print('Time to analyze time!') - raise ValueError('Time to analyze time!') + udl.log_if('final_temb',temb,condition='TIME', print_=True) + udl.stop_if(condition='TIME', funny_msg='Time to analyze time!') ### guided_hint = self.input_hint_block(hint) @@ -383,105 +372,87 @@ def time_debug_log(txt,t): # Debug Umer -- to delete later on # this is for a global view, ie on subblock level - debug_log = [] - def debug_by_umer(stage, msg, obj): - if not self.DEBUG_LOG_by_Umer: return - i = len(debug_log) - if isinstance(obj, torch.Tensor): obj = obj.cpu() - debug_log.append((i, stage, msg, obj)) - def debug_save(): - if not self.DEBUG_LOG_by_Umer: return - import pickle - pickle.dump(debug_log, open(self.DEBUG_LOG_by_Umer_file, "wb")) - raise RuntimeError("Debug Log saved successfully") - - debug_by_umer('prep', 'x', x) - debug_by_umer('prep', 'temb', temb) - debug_by_umer('prep', 'context', cemb) - debug_by_umer('prep', 'raw hint', hint) - debug_by_umer('prep', 'guided_hint', guided_hint) + udl.log_if('prep.x', x, condition='SUBBLOCK') + udl.log_if('prep.temb', temb, condition='SUBBLOCK') + udl.log_if('prep.context', cemb, condition='SUBBLOCK') + udl.log_if('prep.raw_hint', hint, condition='SUBBLOCK') + udl.log_if('prep.guided_hint', guided_hint, condition='SUBBLOCK') # Debug Umer - another one! 
# this is for a detail view, ie below subblock level - more_detailled_debug_log = [] - - any_debug = self.DEBUG_LOG_by_Umer or self.DETAILLED_DEBUG_LOG_by_Umer # Cross Control # 0 - conv in h_base = self.base_model.conv_in(h_base) - debug_by_umer('enc', 'h_base', h_base) + udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') + h_ctrl = self.control_model.conv_in(h_ctrl) - debug_by_umer('enc', 'h_ctrl', h_ctrl) - if guided_hint is not None: - h_ctrl += guided_hint - debug_by_umer('enc', 'h_ctrl', h_ctrl) + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') + + if guided_hint is not None: h_ctrl += guided_hint + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') + h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) - debug_by_umer('enc', 'h_base', h_base) + udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') + hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 1 - input blocks (encoder) - if any_debug: print('------ enc ------') + RUN_ONCE = ('SUBBLOCK', 'SUBBLOCK-MINUS-1') + udl.print_if('------ enc ------', conditions=RUN_ONCE) for i, (m_base, m_ctrl) in enumerate(zip(base_down_subblocks, ctrl_down_subblocks)): # A - concat base -> ctrl cat_to_ctrl = next(it_enc_convs_in)(h_base) h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) - debug_by_umer('enc', 'h_ctr', h_ctrl) + udl.log_if('enc.h_ctr', h_ctrl, condition='SUBBLOCK') # B - apply base subblock - if any_debug: print('>> Applying base block\t', end='') - h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) - debug_by_umer('enc', 'h_base', h_base) + udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) + h_base = m_base(h_base, temb, cemb) + udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') # C - apply ctrl subblock - if any_debug: print('>> Applying ctrl block\t', end='') - h_ctrl, another_debug_cache = m_ctrl(h_ctrl, temb, cemb) - debug_by_umer('enc', 'h_ctrl', h_ctrl) - more_detailled_debug_log += another_debug_cache # We only record details for the application of ctrl blocks - print() + udl.print_if('>> Applying ctrl block\t', end='', conditions=RUN_ONCE) + h_ctrl = m_ctrl(h_ctrl, temb, cemb) + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') + udl.print_if('', conditions=RUN_ONCE) # D - add ctrl -> base add_to_base = next(it_enc_convs_out)(h_ctrl) scale = next(scales) h_base = h_base + add_to_base * scale - debug_by_umer('enc', 'h_base', h_base) + udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') hs_base.append(h_base) hs_ctrl.append(h_ctrl) h_ctrl = torch.concat([h_ctrl, h_base], dim=1) - debug_by_umer('enc', 'h_ctrl', h_ctrl) + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') # 2 - mid blocks (bottleneck) - if any_debug: print('------ mid ------') + udl.print_if('------ mid ------', conditions=RUN_ONCE) for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - if any_debug: print('>> Applying base block\t', end='') - h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) - if any_debug: print('>> Applying ctrl block\t', end='') - h_ctrl, another_debug_cache = m_ctrl(h_ctrl, temb, cemb) - more_detailled_debug_log += another_debug_cache # We only record details for the application of ctrl blocks - if any_debug: print() + udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) + h_base = m_base(h_base, temb, cemb) + udl.print_if('>> Applying ctrl block\t', end='', conditions=RUN_ONCE) + h_ctrl = m_ctrl(h_ctrl, temb, cemb) + udl.print_if('', conditions=RUN_ONCE) # Heidelberg treats the 
R/A/R as one block, while I treat is as 2 subblocks # Let's therefore only log after the mid section - debug_by_umer('mid', 'h_base', h_base) - debug_by_umer('mid', 'h_ctrl', h_ctrl) + udl.log_if('mid.h_base', h_base, condition='SUBBLOCK') + udl.log_if('mid.h_ctrl', h_ctrl, condition='SUBBLOCK') h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) - debug_by_umer('mid', 'h_base', h_base) + udl.log_if('mid.h_base', h_base, condition='SUBBLOCK') # 3 - output blocks (decoder) - if any_debug: print('------ dec ------') + udl.print_if('------ dec ------', conditions=RUN_ONCE) for m_base in base_up_subblocks: h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder - debug_by_umer('dec', 'h_base', h_base) + udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder - debug_by_umer('dec', 'h_base', h_base) - if any_debug: print('>> Applying base block\t', end='') - h_base, debug_cache_i_dont_care_about_sry_mr_debug_cache = m_base(h_base, temb, cemb) - debug_by_umer('dec', 'h_base', h_base) - if any_debug: print() - - debug_save() - if self.DETAILLED_DEBUG_LOG_by_Umer: - more_detailled_debug_log = [(txt, t.cpu().detach()) for txt,t in more_detailled_debug_log] - import pickle - pickle.dump(more_detailled_debug_log, open('intermediate_output/detailled_debug_log.pkl', 'wb')) - print('Alright captain. Look at all these tensors we caught. Time to do some real analysis.') - raise ValueError('stop right here') + udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') + udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) + h_base = m_base(h_base, temb, cemb) + udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') + udl.print_if('',conditions=RUN_ONCE) + + udl.stop_if('SUBBLOCK', 'The subblocks are cought. Let us gaze into their soul, their very essence.') + udl.stop_if('SUBBLOCK-MINUS-1', 'Alright captain. Look at all these tensors we caught. 
Time to do some real analysis.') return UNet2DConditionOutput(sample=self.base_model.conv_out(h_base)) @@ -591,25 +562,19 @@ def __init__(self,ms,*args,**kwargs): def forward(self,x,temb,cemb): def cls_name(x): return str(type(x)).split('.')[-1].replace("'>",'') content = ' '.join(cls_name(m) for m in self) - print(f'EmbedSequential.forward with content {content}') - UMER_DEBUG_CACHE = [] + udl.print_if(f'EmbedSequential.forward with content {content}', conditions='SUBBLOCK-MINUS-1') for m in self: if isinstance(m,ResnetBlock2D): - x, debug_cache = m(x,temb) - UMER_DEBUG_CACHE += debug_cache + x = m(x,temb) elif isinstance(m,Transformer2DModel): - result = m(x,cemb) - x = result.sample - UMER_DEBUG_CACHE += result.debug_cache + x = m(x,cemb).sample elif isinstance(m,Downsample2D): x = m(x) - UMER_DEBUG_CACHE += [('conv',x)] # Downsample2D only has 1 operation, so {intermediate results} = {final result} elif isinstance(m,Upsample2D): x = m(x) - UMER_DEBUG_CACHE += [('conv',x)] # Upsample2D only has 1 operation, so {intermediate results} = {final result} else: raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`') - return x, UMER_DEBUG_CACHE + return x def is_iterable(o): diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index df62a89b5569..dc3be9464876 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -26,6 +26,7 @@ from .attention_processor import SpatialNorm from .lora import LoRACompatibleConv, LoRACompatibleLinear +from ..umer_debug_logger import udl class Upsample1D(nn.Module): """A 1D upsampling layer with an optional convolution. @@ -205,6 +206,8 @@ def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None else: hidden_states = self.Conv2d_0(hidden_states) + udl.log_if('conv',hidden_states, 'SUBBLOCK-MINUS-1') + return hidden_states @@ -273,6 +276,8 @@ def forward(self, hidden_states, scale: float = 1.0): else: hidden_states = self.conv(hidden_states) + udl.log_if('conv',hidden_states, 'SUBBLOCK-MINUS-1') + return hidden_states @@ -683,9 +688,6 @@ def __init__( ) def forward(self, input_tensor, temb, scale: float = 1.0): - - UMER_DEBUG_CACHE = [] - hidden_states = input_tensor if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": @@ -723,7 +725,7 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states) - UMER_DEBUG_CACHE.append(('conv1', hidden_states)) + udl.log_if('conv1', hidden_states, condition='SUBBLOCK-MINUS-1') if self.time_emb_proj is not None: if not self.skip_time_act: @@ -736,7 +738,7 @@ def forward(self, input_tensor, temb, scale: float = 1.0): if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - UMER_DEBUG_CACHE.append(('add time_emb_proj', hidden_states)) + udl.log_if('add time_emb_proj', hidden_states, condition='SUBBLOCK-MINUS-1') if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) @@ -750,7 +752,7 @@ def forward(self, input_tensor, temb, scale: float = 1.0): hidden_states = self.nonlinearity(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) - UMER_DEBUG_CACHE.append(('conv2', hidden_states)) + udl.log_if('conv2', hidden_states, 
condition='SUBBLOCK-MINUS-1') if self.conv_shortcut is not None: input_tensor = ( @@ -758,9 +760,9 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) output_tensor = (input_tensor + hidden_states) / self.output_scale_factor - UMER_DEBUG_CACHE.append(('add conv_shortcut', output_tensor)) + udl.log_if('add conv_shortcut', output_tensor, condition='SUBBLOCK-MINUS-1') - return output_tensor, UMER_DEBUG_CACHE + return output_tensor # unet_rl.py diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 47af5963dfad..0063099fdc67 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -26,6 +26,7 @@ from .lora import LoRACompatibleConv, LoRACompatibleLinear from .modeling_utils import ModelMixin +from ..umer_debug_logger import udl @dataclass class Transformer2DModelOutput(BaseOutput): @@ -288,8 +289,6 @@ def forward( # Retrieve lora scale. lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 - UMER_DEBUG_CACHE = [] - # 1. Input if self.is_input_continuous: batch, _, height, width = hidden_states.shape @@ -318,7 +317,7 @@ def forward( elif self.is_input_patches: hidden_states = self.pos_embed(hidden_states) - UMER_DEBUG_CACHE.append(('proj_in', hidden_states)) + udl.log_if('proj_in', hidden_states, condition='SUBBLOCK-MINUS-1') # 2. Blocks for block in self.transformer_blocks: @@ -335,7 +334,7 @@ def forward( use_reentrant=False, ) else: - hidden_states, debug_cache_from_attention_block = block( + hidden_states = block( hidden_states, attention_mask=attention_mask, encoder_hidden_states=encoder_hidden_states, @@ -344,7 +343,6 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, class_labels=class_labels, ) - UMER_DEBUG_CACHE += debug_cache_from_attention_block # 3. 
Output if self.is_input_continuous: @@ -391,11 +389,9 @@ def forward( shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) ) - UMER_DEBUG_CACHE.append(('proj_out', output)) + udl.log_if('proj_out', output, condition='SUBBLOCK-MINUS-1') if not return_dict: return (output,) - result = Transformer2DModelOutput(sample=output) - result.debug_cache = UMER_DEBUG_CACHE - return result#Transformer2DModelOutput(sample=output) + return Transformer2DModelOutput(sample=output) diff --git a/src/diffusers/umer_debug_logger.py b/src/diffusers/umer_debug_logger.py new file mode 100644 index 000000000000..6aaa266e8765 --- /dev/null +++ b/src/diffusers/umer_debug_logger.py @@ -0,0 +1,106 @@ +# Logger to help me (UmerHA) debug controlnet-xs + +import os +import csv +import torch +import inspect +import logging +import shutil +from types import SimpleNamespace + +from datetime import datetime + +class UmerDebugLogger: + def __init__(self, log_dir='logs', condition=None): + self.log_dir, self.condition, self.tensor_counter = log_dir, condition, 0 + + os.makedirs(log_dir, exist_ok=True) + # Set up CSV logging + self.file = os.path.join(log_dir, 'custom_log.csv') + self.fields = ['timestamp', 'cls', 'fn', 'shape', 'msg', 'condition', 'tensor_file'] + # Write the header only once if the file does not exist + if not os.path.isfile(self.file): + with open(self.file, 'w', newline='') as f: + writer = csv.DictWriter(f, fieldnames=self.fields) + writer.writeheader() + # Configure the logger to not propagate messages to the root logger + self.logger = logging.getLogger(__name__) + self.logger.propagate = False + + self.warned_of_no_condition = False + + def clear_logs(self): + shutil.rmtree(self.log_dir) + os.makedirs(self.log_dir, exist_ok=True) + with open(self.file, 'w', newline='') as f: + writer = csv.DictWriter(f, fieldnames=self.fields) + writer.writeheader() + + def set_condition(self, condition): self.condition = condition + + def log_if(self, msg, t, condition, *, print_=False): + self.maybe_warn_of_no_condition() + + # Use inspect to get the current frame and then go back one level to find caller + frame = inspect.currentframe() + caller_frame = frame.f_back + caller_info = inspect.getframeinfo(caller_frame) + + # Extract class and function name from the caller + cls_name = caller_frame.f_locals.get('self', None).__class__.__name__ if 'self' in caller_frame.f_locals else None + function_name = caller_info.function + + if not hasattr(t, 'shape'): t = torch.tensor(t) + t = t.cpu().detach() + + if condition == self.condition: + # Save tensor to a file + tensor_filename = f"tensor_{self.tensor_counter}.pt" + torch.save(t, os.path.join(self.log_dir, tensor_filename)) + self.tensor_counter += 1 + + # Log information to CSV + log_info = { + 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'cls': cls_name, + 'fn': function_name, + 'shape': str(list(t.shape)), + 'msg': msg, + 'condition': condition, + 'tensor_file': tensor_filename + } + + with open(self.file, 'a', newline='') as f: + writer = csv.DictWriter(f, fieldnames=self.fields) + writer.writerow(log_info) + + if print_: print(f'{msg}\t{t.flatten()[:10]}') + + def print_if(self, msg, conditions, end='\n'): + self.maybe_warn_of_no_condition() + if not isinstance(conditions, list): conditions = [conditions] + if any(self.condition==c for c in conditions): print(msg, end=end) + + def stop_if(self, condition, funny_msg): + if condition == self.condition: + print(funny_msg) + raise SystemExit(funny_msg) + + def 
maybe_warn_of_no_condition(self): + if self.condition is None and not self.warned_of_no_condition : + print("Warning: No condition set for UmerDebugLogger") + self.warned_of_no_condition = True + + def get_log_objects(self): + log_objects = [] + with open(self.file, newline='') as f: + reader = csv.DictReader(f) + for row in reader: + row['tensor'] = torch.load(os.path.join(self.log_dir, row['tensor_file'])) + row['head'] = row['tensor'].flatten()[:10] + del row['tensor_file'] + log_objects.append(SimpleNamespace(**row)) + return log_objects + + +udl = UmerDebugLogger() From cc1a706518069e76728818aeb30233689e971587 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Sun, 12 Nov 2023 19:56:59 +0100 Subject: [PATCH 26/88] Full denoising works with control scale = 0 --- src/diffusers/models/controlnetxs.py | 33 +++++++++++++++-- src/diffusers/umer_debug_logger.py | 54 +++++++++++++++++++--------- 2 files changed, 68 insertions(+), 19 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 1201ec9b32b6..a58e75fafe83 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -256,6 +256,24 @@ def __init__( del self.control_model.conv_norm_out del self.control_model.conv_out + def toggle_control(self, to): + if not hasattr(self, 'do_control'): self.do_control = True + if not hasattr(self, 'scale_back_up'): self.back_up = None + if self.do_control == to: + print(f'Model already set to control mode == {to}') + return + if not to: + self.scale_back_up = self.scale_list[0].clone() + self.scale_list = self.scale_list * 0. + self.do_control = False + else: + self.scale_list = self.scale_list * 0. + self.scale_back_up + self.scale_back_up = None + self.do_control = True + assert self.do_control == to + print(f'Model set to control mode == {self.do_control}') + + def forward( self, x: torch.Tensor, @@ -269,6 +287,8 @@ def forward( if self.base_model is None: raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`") + print('control_scale:',self.scale_list) + """ Params from unet_2d_condition.UNet2DConditionModel.forward: # self, # sample: torch.FloatTensor, @@ -404,7 +424,7 @@ def forward( # A - concat base -> ctrl cat_to_ctrl = next(it_enc_convs_in)(h_base) h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) - udl.log_if('enc.h_ctr', h_ctrl, condition='SUBBLOCK') + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') # B - apply base subblock udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) h_base = m_base(h_base, temb, cemb) @@ -451,10 +471,19 @@ def forward( udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') udl.print_if('',conditions=RUN_ONCE) + h_base = self.base_model.conv_norm_out(h_base) + h_base = self.base_model.conv_act(h_base) + h_base = self.base_model.conv_out(h_base) + + result = h_base + + udl.log_if('conv_out.h_base', result, condition='SUBBLOCK') + udl.print_if('',conditions=RUN_ONCE) + udl.stop_if('SUBBLOCK', 'The subblocks are cought. Let us gaze into their soul, their very essence.') udl.stop_if('SUBBLOCK-MINUS-1', 'Alright captain. Look at all these tensors we caught. 
Time to do some real analysis.') - return UNet2DConditionOutput(sample=self.base_model.conv_out(h_base)) + return UNet2DConditionOutput(sample=result) def make_zero_conv(self, in_channels, out_channels=None): # keep running track # todo: better comment diff --git a/src/diffusers/umer_debug_logger.py b/src/diffusers/umer_debug_logger.py index 6aaa266e8765..391955756c84 100644 --- a/src/diffusers/umer_debug_logger.py +++ b/src/diffusers/umer_debug_logger.py @@ -11,30 +11,36 @@ from datetime import datetime class UmerDebugLogger: + + _FILE = 'udl.csv' + def __init__(self, log_dir='logs', condition=None): self.log_dir, self.condition, self.tensor_counter = log_dir, condition, 0 - os.makedirs(log_dir, exist_ok=True) - # Set up CSV logging - self.file = os.path.join(log_dir, 'custom_log.csv') self.fields = ['timestamp', 'cls', 'fn', 'shape', 'msg', 'condition', 'tensor_file'] - # Write the header only once if the file does not exist - if not os.path.isfile(self.file): - with open(self.file, 'w', newline='') as f: + self.create_file() + self.warned_of_no_condition = False + + @property + def full_file_path(self): return os.path.join(self.log_dir, self._FILE) + + def create_file(self): + file = self.full_file_path + if not os.path.isfile(file): + with open(file, 'w', newline='') as f: writer = csv.DictWriter(f, fieldnames=self.fields) writer.writeheader() - # Configure the logger to not propagate messages to the root logger - self.logger = logging.getLogger(__name__) - self.logger.propagate = False - self.warned_of_no_condition = False + + def set_dir(self, log_dir, clear=False): + self.log_dir = log_dir + if clear: self.clear_logs() + self.create_file() def clear_logs(self): - shutil.rmtree(self.log_dir) + shutil.rmtree(self.log_dir, ignore_errors=True) os.makedirs(self.log_dir, exist_ok=True) - with open(self.file, 'w', newline='') as f: - writer = csv.DictWriter(f, fieldnames=self.fields) - writer.writeheader() + self.create_file() def set_condition(self, condition): self.condition = condition @@ -70,7 +76,7 @@ def log_if(self, msg, t, condition, *, print_=False): 'tensor_file': tensor_filename } - with open(self.file, 'a', newline='') as f: + with open(self.full_file_path, 'a', newline='') as f: writer = csv.DictWriter(f, fieldnames=self.fields) writer.writerow(log_info) @@ -78,7 +84,7 @@ def log_if(self, msg, t, condition, *, print_=False): def print_if(self, msg, conditions, end='\n'): self.maybe_warn_of_no_condition() - if not isinstance(conditions, list): conditions = [conditions] + if not isinstance(conditions, (tuple, list)): conditions = [conditions] if any(self.condition==c for c in conditions): print(msg, end=end) def stop_if(self, condition, funny_msg): @@ -93,7 +99,8 @@ def maybe_warn_of_no_condition(self): def get_log_objects(self): log_objects = [] - with open(self.file, newline='') as f: + file = self.full_file_path + with open(file, newline='') as f: reader = csv.DictReader(f) for row in reader: row['tensor'] = torch.load(os.path.join(self.log_dir, row['tensor_file'])) @@ -102,5 +109,18 @@ def get_log_objects(self): log_objects.append(SimpleNamespace(**row)) return log_objects + @classmethod + def load_log_objects_from_dir(self, log_dir): + file = os.path.join(log_dir, self._FILE) + log_objects = [] + with open(file, newline='') as f: + reader = csv.DictReader(f) + for row in reader: + row['t'] = torch.load(os.path.join(log_dir, row['tensor_file'])) + row['head'] = row['t'].flatten()[:10] + del row['tensor_file'] + log_objects.append(SimpleNamespace(**row)) + return log_objects 
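For orientation, a hedged usage sketch of the logger as extended above; the directory, the condition name and the model call are illustrative, while set_dir, set_condition, get_log_objects and load_log_objects_from_dir are the methods defined in this file. A condition is chosen once, every log_if/print_if call gated on it fires during the forward pass, and the captured tensors can be read back later from the log directory.

from diffusers.umer_debug_logger import udl

udl.set_dir("intermediate_output", clear=True)   # illustrative path
udl.set_condition("SUBBLOCK")                    # only calls gated on this condition fire
# ... run a forward pass of the controlled UNet here ...
for entry in udl.get_log_objects():              # or UmerDebugLogger.load_log_objects_from_dir("intermediate_output")
    print(entry.msg, entry.shape, entry.head)    # each row carries cls/fn/shape/msg plus the tensor head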
+ udl = UmerDebugLogger() From 05f99269cf43a50e6420dbecb95394adfb4c0c36 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Sun, 12 Nov 2023 23:42:06 +0100 Subject: [PATCH 27/88] aligned logs --- src/diffusers/models/controlnetxs.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index a58e75fafe83..331757a79d28 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -445,12 +445,28 @@ def forward( udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') # 2 - mid blocks (bottleneck) udl.print_if('------ mid ------', conditions=RUN_ONCE) - for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): + # Because Heidelberg treats the R/A/R as one block, they first execute the full base mid block, + # then the full ctrl mid block; while I execute them interlaced. + # This doesn't change the computation, but messes up parts of the logs. + # So let's, while debugging, first execute full base mid block and then full ctrl mid block. + + #for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): + # udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) + # h_base = m_base(h_base, temb, cemb) + # udl.print_if('>> Applying ctrl block\t', end='', conditions=RUN_ONCE) + # h_ctrl = m_ctrl(h_ctrl, temb, cemb) + # udl.print_if('', conditions=RUN_ONCE) + + for m_base in base_mid_subblocks: udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) h_base = m_base(h_base, temb, cemb) + udl.print_if('', conditions=RUN_ONCE) + + for m_ctrl in ctrl_mid_subblocks: udl.print_if('>> Applying ctrl block\t', end='', conditions=RUN_ONCE) h_ctrl = m_ctrl(h_ctrl, temb, cemb) udl.print_if('', conditions=RUN_ONCE) + # Heidelberg treats the R/A/R as one block, while I treat is as 2 subblocks # Let's therefore only log after the mid section udl.log_if('mid.h_base', h_base, condition='SUBBLOCK') From 34ecd9a5d46e6b121671815f899a9d606c6f391e Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 13 Nov 2023 12:08:53 +0100 Subject: [PATCH 28/88] Added control_attention_head_dim param --- src/diffusers/models/controlnetxs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 331757a79d28..8d34910b6f8f 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -88,6 +88,7 @@ def get_time_emb_input_dim(unet: UNet2DConditionModel):return unet.time_embeddin base_model_channel_sizes=base_model_channel_sizes, control_scale=0.95, addition_embed_type='text_time', + control_attention_head_dim=64, ) cnxs_model.base_model = base_model return cnxs_model @@ -150,6 +151,7 @@ def __init__( control_scale=1, time_control_scale=1, addition_embed_type: Optional[str] = None, + control_attention_head_dim: Optional[int] = 8, ): super().__init__() @@ -172,6 +174,7 @@ def __init__( time_embedding_dim=time_embedding_dim, transformer_layers_per_block=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, + attention_head_dim=control_attention_head_dim, ) # 2 - Do model surgery on control model From c975ea85afb2fa6d0d67dc613f27fc2bd5bf876c Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 13 Nov 2023 12:34:58 +0100 Subject: [PATCH 29/88] Passing n_heads instead of dim_head into ctrl unet --- 
src/diffusers/models/controlnetxs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 8d34910b6f8f..620572bb3b6a 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -174,7 +174,9 @@ def __init__( time_embedding_dim=time_embedding_dim, transformer_layers_per_block=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, - attention_head_dim=control_attention_head_dim, + # Currently, `attention_head_dim` actually describes the numer of attention heads. See https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # TODO: How to handle this? + attention_head_dim=[c//control_attention_head_dim for c in block_out_channels], ) # 2 - Do model surgery on control model From 535149d310a308fecd360d1114c1cdc2b3cb5c9f Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 13 Nov 2023 15:03:52 +0100 Subject: [PATCH 30/88] Fixed ctrl midblock bug --- src/diffusers/models/controlnetxs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 620572bb3b6a..d043bb89b484 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -446,7 +446,8 @@ def forward( udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') hs_base.append(h_base) hs_ctrl.append(h_ctrl) - h_ctrl = torch.concat([h_ctrl, h_base], dim=1) + cat_to_ctrl = next(it_enc_convs_in)(h_base) + h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') # 2 - mid blocks (bottleneck) udl.print_if('------ mid ------', conditions=RUN_ONCE) From 1583c13df3eca8368c2f0d6eb158aacb327bccd1 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 13 Nov 2023 18:23:35 +0100 Subject: [PATCH 31/88] Cleanup --- src/diffusers/__init__.py | 4 + src/diffusers/models/controlnetxs.py | 243 +++------- src/diffusers/pipelines/__init__.py | 10 +- .../pipelines/controlnet_xs/__init__.py | 66 +++ .../pipeline_controlnet_xs_sd_xl.py | 438 ++++++++++++++++-- .../schedulers/scheduling_euler_discrete.py | 1 - 6 files changed, 534 insertions(+), 228 deletions(-) create mode 100644 src/diffusers/pipelines/controlnet_xs/__init__.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 42f352c029c8..a41d405421db 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -78,6 +78,7 @@ "AutoencoderKL", "AutoencoderTiny", "ControlNetModel", + "ControlNetXSModel", "ModelMixin", "MultiAdapter", "PriorTransformer", @@ -260,6 +261,7 @@ "StableDiffusionXLControlNetImg2ImgPipeline", "StableDiffusionXLControlNetInpaintPipeline", "StableDiffusionXLControlNetPipeline", + "StableDiffusionXLControlNetXSPipeline", "StableDiffusionXLImg2ImgPipeline", "StableDiffusionXLInpaintPipeline", "StableDiffusionXLInstructPix2PixPipeline", @@ -437,6 +439,7 @@ AutoencoderKL, AutoencoderTiny, ControlNetModel, + ControlNetXSModel, ModelMixin, MultiAdapter, PriorTransformer, @@ -598,6 +601,7 @@ StableDiffusionXLControlNetImg2ImgPipeline, StableDiffusionXLControlNetInpaintPipeline, StableDiffusionXLControlNetPipeline, + StableDiffusionXLControlNetXSPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLInstructPix2PixPipeline, diff --git a/src/diffusers/models/controlnetxs.py 
b/src/diffusers/models/controlnetxs.py index d043bb89b484..8edca311d238 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Any, Dict, Optional, Union, Tuple +from typing import Any, Dict, List, Optional, Union, Tuple from itertools import zip_longest @@ -43,10 +43,9 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# todo Umer later: add attention_bias to relevant docs - @dataclass -class UNet2DConditionOutput(BaseOutput): +class ControlNetXSOutput(BaseOutput): + # todo: docstring sample: torch.FloatTensor = None @@ -261,67 +260,40 @@ def __init__( del self.control_model.conv_norm_out del self.control_model.conv_out - def toggle_control(self, to): - if not hasattr(self, 'do_control'): self.do_control = True - if not hasattr(self, 'scale_back_up'): self.back_up = None - if self.do_control == to: - print(f'Model already set to control mode == {to}') - return - if not to: - self.scale_back_up = self.scale_list[0].clone() - self.scale_list = self.scale_list * 0. - self.do_control = False - else: - self.scale_list = self.scale_list * 0. + self.scale_back_up - self.scale_back_up = None - self.do_control = True - assert self.do_control == to - print(f'Model set to control mode == {self.do_control}') - def forward( self, - x: torch.Tensor, - t: torch.Tensor, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, - hint: torch.Tensor, + controlnet_cond: torch.Tensor, + conditioning_scale: float = 1.0, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, no_control=False, - ): + guess_mode: bool = False, # todo: understand and implement if required + return_dict: bool = True, + ) -> Union[ControlNetXSOutput, Tuple]: if self.base_model is None: raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`") - print('control_scale:',self.scale_list) - - """ Params from unet_2d_condition.UNet2DConditionModel.forward: - # self, - # sample: torch.FloatTensor, - # timestep: Union[torch.Tensor, float, int], - # encoder_hidden_states: torch.Tensor, - # class_labels: Optional[torch.Tensor] = None, - # timestep_cond: Optional[torch.Tensor] = None, - # attention_mask: Optional[torch.Tensor] = None, - # cross_attention_kwargs: Optional[Dict[str, Any]] = None, - # added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - # down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, - # mid_block_additional_residual: Optional[torch.Tensor] = None, - # encoder_attention_mask: Optional[torch.Tensor] = None, - # return_dict: bool = True, - """ #x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1) - if x.size(0) // 2 == hint.size(0): hint = torch.cat([hint, hint], dim=0) # for classifier free guidance + if sample.size(0) // 2 == controlnet_cond.size(0): controlnet_cond = torch.cat([controlnet_cond, controlnet_cond], dim=0) # for classifier free guidance - timesteps=t + # todo: Can a tensor with different timesteps be passed? if so, do I need to adapt sth? 
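On the todo above: the plain UNet2DConditionModel.forward accepts scalar, 0-d and per-sample timesteps and broadcasts them to the batch before projecting them. A hedged sketch of that normalization, which this forward could adopt (the helper name is made up for illustration):

import torch

def normalize_timesteps(timestep, sample):
    # Wrap plain Python numbers, lift 0-d tensors to 1-d, then broadcast to the batch size.
    timesteps = timestep
    if not torch.is_tensor(timesteps):
        timesteps = torch.tensor([timesteps], device=sample.device)
    elif timesteps.ndim == 0:
        timesteps = timesteps[None].to(sample.device)
    return timesteps.expand(sample.shape[0])   # no-op if a full batch of timesteps was passed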
+ timesteps=timestep if no_control or self.no_control: return self.base_model(x, timesteps, encoder_hidden_states,cross_attention_kwargs=cross_attention_kwargs,added_cond_kwargs=added_cond_kwargs) + # todo: should scale_list remain an attribute? + scale_list = self.scale_list * 0. + conditioning_scale + # time embeddings timesteps = timesteps[None] - - udl.log_if('timestep', timesteps, condition='TIME', print_=True) t_emb = get_timestep_embedding( timesteps, self.model_channels, @@ -329,7 +301,6 @@ def forward( flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.freq_shift, ) - udl.log_if('time_emb', t_emb, condition='TIME', print_=True) if self.learn_embedding: udl.log_if('time_proj_ctrl',self.control_model.time_embedding(t_emb), condition='TIME', print_=True) udl.log_if('time_proj_ctrl_scaled',self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3, condition='TIME', print_=True) @@ -338,14 +309,9 @@ def forward( temb = self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3) else: temb = self.base_model.time_embedding(t_emb) - udl.log_if('time_proj',temb, condition='TIME', print_=True) - - aug_emb = None - - # text embeddings - cemb = encoder_hidden_states # added time & text embeddings + aug_emb = None if self.config.addition_embed_type == "text": raise NotImplementedError() elif self.config.addition_embed_type == "text_image": @@ -357,20 +323,16 @@ def forward( f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" ) text_embeds = added_cond_kwargs.get("text_embeds") - udl.log_if('text_embeds',text_embeds, condition='TIME', print_=True) if "time_ids" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" ) time_ids = added_cond_kwargs.get("time_ids") - udl.log_if('add_input',time_ids.flatten(), condition='TIME', print_=True) time_embeds = self.base_model.add_time_proj(time_ids.flatten()) time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) - udl.log_if('add_emb',time_embeds, condition='TIME', print_=True) add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) add_embeds = add_embeds.to(temb.dtype) aug_emb = self.base_model.add_embedding(add_embeds) - udl.log_if('add_proj',aug_emb, condition='TIME', print_=True) elif self.config.addition_embed_type == "image": raise NotImplementedError() @@ -378,16 +340,17 @@ def forward( raise NotImplementedError() temb = temb + aug_emb if aug_emb is not None else temb - udl.log_if('final_temb',temb,condition='TIME', print_=True) - udl.stop_if(condition='TIME', funny_msg='Time to analyze time!') + + # text embeddings + cemb = encoder_hidden_states ### - guided_hint = self.input_hint_block(hint) + guided_hint = self.input_hint_block(controlnet_cond) - h_ctrl = h_base = x + h_ctrl = h_base = sample hs_base, hs_ctrl = [], [] it_enc_convs_in, it_enc_convs_out, it_dec_convs_in, it_dec_convs_out = map(iter, (self.enc_zero_convs_in, self.enc_zero_convs_out, self.dec_zero_convs_in, self.dec_zero_convs_out)) - scales = iter(self.scale_list) + scales = iter(scale_list) base_down_subblocks = to_sub_blocks(self.base_model.down_blocks) ctrl_down_subblocks = to_sub_blocks(self.control_model.down_blocks) @@ -395,117 +358,50 @@ def forward( 
ctrl_mid_subblocks = to_sub_blocks([self.control_model.mid_block]) base_up_subblocks = to_sub_blocks(self.base_model.up_blocks) - # Debug Umer -- to delete later on - # this is for a global view, ie on subblock level - udl.log_if('prep.x', x, condition='SUBBLOCK') - udl.log_if('prep.temb', temb, condition='SUBBLOCK') - udl.log_if('prep.context', cemb, condition='SUBBLOCK') - udl.log_if('prep.raw_hint', hint, condition='SUBBLOCK') - udl.log_if('prep.guided_hint', guided_hint, condition='SUBBLOCK') - - # Debug Umer - another one! - # this is for a detail view, ie below subblock level - # Cross Control # 0 - conv in h_base = self.base_model.conv_in(h_base) - udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') - h_ctrl = self.control_model.conv_in(h_ctrl) - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') - if guided_hint is not None: h_ctrl += guided_hint - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') - h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) - udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') hs_base.append(h_base) hs_ctrl.append(h_ctrl) + # 1 - input blocks (encoder) - RUN_ONCE = ('SUBBLOCK', 'SUBBLOCK-MINUS-1') - udl.print_if('------ enc ------', conditions=RUN_ONCE) - for i, (m_base, m_ctrl) in enumerate(zip(base_down_subblocks, ctrl_down_subblocks)): - # A - concat base -> ctrl - cat_to_ctrl = next(it_enc_convs_in)(h_base) - h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') - # B - apply base subblock - udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) - h_base = m_base(h_base, temb, cemb) - udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') - # C - apply ctrl subblock - udl.print_if('>> Applying ctrl block\t', end='', conditions=RUN_ONCE) - h_ctrl = m_ctrl(h_ctrl, temb, cemb) - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') - udl.print_if('', conditions=RUN_ONCE) - # D - add ctrl -> base - add_to_base = next(it_enc_convs_out)(h_ctrl) - scale = next(scales) - h_base = h_base + add_to_base * scale - udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') + for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): + h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) # A - concat base -> ctrl + h_base = m_base(h_base, temb, cemb) # B - apply base subblock + h_ctrl = m_ctrl(h_ctrl, temb, cemb) # C - apply ctrl subblock + h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base + hs_base.append(h_base) hs_ctrl.append(h_ctrl) - cat_to_ctrl = next(it_enc_convs_in)(h_base) - h_ctrl = torch.cat([h_ctrl, cat_to_ctrl], dim=1) - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') + + h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) + # 2 - mid blocks (bottleneck) - udl.print_if('------ mid ------', conditions=RUN_ONCE) - # Because Heidelberg treats the R/A/R as one block, they first execute the full base mid block, - # then the full ctrl mid block; while I execute them interlaced. - # This doesn't change the computation, but messes up parts of the logs. - # So let's, while debugging, first execute full base mid block and then full ctrl mid block. 
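Pulled out of the encoder loop above for readability, one step of the two-stream "cross" scheme looks as follows. This is a hedged restatement, not a drop-in replacement: zero_conv_in/zero_conv_out stand in for the next(...) calls on the zero-convolution iterators (1x1 convolutions initialized to zero, see make_zero_conv further down in this file), and scale for the per-connection conditioning scale.

import torch

def cross_control_step(h_base, h_ctrl, temb, cemb, m_base, m_ctrl, zero_conv_in, zero_conv_out, scale):
    h_ctrl = torch.cat([h_ctrl, zero_conv_in(h_base)], dim=1)  # A - concat base -> ctrl
    h_base = m_base(h_base, temb, cemb)                        # B - apply base subblock
    h_ctrl = m_ctrl(h_ctrl, temb, cemb)                        # C - apply ctrl subblock
    h_base = h_base + zero_conv_out(h_ctrl) * scale            # D - add ctrl -> base
    return h_base, h_ctrl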
- - #for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - # udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) - # h_base = m_base(h_base, temb, cemb) - # udl.print_if('>> Applying ctrl block\t', end='', conditions=RUN_ONCE) - # h_ctrl = m_ctrl(h_ctrl, temb, cemb) - # udl.print_if('', conditions=RUN_ONCE) - - for m_base in base_mid_subblocks: - udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) + for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): h_base = m_base(h_base, temb, cemb) - udl.print_if('', conditions=RUN_ONCE) - - for m_ctrl in ctrl_mid_subblocks: - udl.print_if('>> Applying ctrl block\t', end='', conditions=RUN_ONCE) h_ctrl = m_ctrl(h_ctrl, temb, cemb) - udl.print_if('', conditions=RUN_ONCE) - - # Heidelberg treats the R/A/R as one block, while I treat is as 2 subblocks - # Let's therefore only log after the mid section - udl.log_if('mid.h_base', h_base, condition='SUBBLOCK') - udl.log_if('mid.h_ctrl', h_ctrl, condition='SUBBLOCK') - + h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) - udl.log_if('mid.h_base', h_base, condition='SUBBLOCK') - + # 3 - output blocks (decoder) - udl.print_if('------ dec ------', conditions=RUN_ONCE) for m_base in base_up_subblocks: - h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder - udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') - h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder - udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') - udl.print_if('>> Applying base block\t', end='', conditions=RUN_ONCE) + h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder + h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder h_base = m_base(h_base, temb, cemb) - udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') - udl.print_if('',conditions=RUN_ONCE) h_base = self.base_model.conv_norm_out(h_base) h_base = self.base_model.conv_act(h_base) h_base = self.base_model.conv_out(h_base) - result = h_base - - udl.log_if('conv_out.h_base', result, condition='SUBBLOCK') - udl.print_if('',conditions=RUN_ONCE) - - udl.stop_if('SUBBLOCK', 'The subblocks are cought. Let us gaze into their soul, their very essence.') - udl.stop_if('SUBBLOCK-MINUS-1', 'Alright captain. Look at all these tensors we caught. 
Time to do some real analysis.') + if not return_dict: + return h_base + + return ControlNetXSOutput(sample=h_base) - return UNet2DConditionOutput(sample=result) def make_zero_conv(self, in_channels, out_channels=None): # keep running track # todo: better comment @@ -513,15 +409,36 @@ def make_zero_conv(self, in_channels, out_channels=None): self.out_channels = out_channels or in_channels return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0)) - def debug_print(self, s): - if hasattr(self, 'debug') and self.debug: - print(s) + +class EmbedSequential(nn.ModuleList): + """Sequential module passing embeddings (time and conditioning) to children if they support it.""" + def __init__(self,ms,*args,**kwargs): + if not is_iterable(ms): ms = [ms] + super().__init__(ms,*args,**kwargs) + + def forward(self,x,temb,cemb): + def cls_name(x): return str(type(x)).split('.')[-1].replace("'>",'') + content = ' '.join(cls_name(m) for m in self) + udl.print_if(f'EmbedSequential.forward with content {content}', conditions='SUBBLOCK-MINUS-1') + for m in self: + if isinstance(m,ResnetBlock2D): + x = m(x,temb) + elif isinstance(m,Transformer2DModel): + x = m(x,cemb).sample + elif isinstance(m,Downsample2D): + x = m(x) + elif isinstance(m,Upsample2D): + x = m(x) + else: raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`') + + return x def adjust_time_input_dim(unet: UNet2DConditionModel, dim: int): time_emb = unet.time_embedding time_emb.linear_1 = nn.Linear(dim, time_emb.linear_1.out_features) + def increase_block_input_in_encoder_resnet(unet:UNet2DConditionModel, block_no, resnet_idx, by): """Increase channels sizes to allow for additional concatted information from base model""" r=unet.down_blocks[block_no].resnets[resnet_idx] @@ -604,30 +521,6 @@ def increase_block_input_in_mid_resnet(unet:UNet2DConditionModel, by): unet.mid_block.resnets[0].in_channels += by # surgery done here -class EmbedSequential(nn.ModuleList): - """Sequential module passing embeddings (time and conditioning) to children if they support it.""" - def __init__(self,ms,*args,**kwargs): - if not is_iterable(ms): ms = [ms] - super().__init__(ms,*args,**kwargs) - - def forward(self,x,temb,cemb): - def cls_name(x): return str(type(x)).split('.')[-1].replace("'>",'') - content = ' '.join(cls_name(m) for m in self) - udl.print_if(f'EmbedSequential.forward with content {content}', conditions='SUBBLOCK-MINUS-1') - for m in self: - if isinstance(m,ResnetBlock2D): - x = m(x,temb) - elif isinstance(m,Transformer2DModel): - x = m(x,cemb).sample - elif isinstance(m,Downsample2D): - x = m(x) - elif isinstance(m,Upsample2D): - x = m(x) - else: raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`') - - return x - - def is_iterable(o): if isinstance(o, str): return False try: diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 19fe2f72d447..404c785e1b22 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -17,7 +17,7 @@ # These modules contain pipelines from multiple libraries/frameworks _dummy_objects = {} -_import_structure = {"stable_diffusion": [], "stable_diffusion_xl": [], "latent_diffusion": [], "controlnet": []} +_import_structure = {"stable_diffusion": [], "stable_diffusion_xl": [], "latent_diffusion": [], "controlnet": [], "controlnet_xs": []} try: if not is_torch_available(): @@ -80,6 +80,11 @@ 
"StableDiffusionXLControlNetPipeline", ] ) + _import_structure["controlnet_xs"].extend( + [ + "StableDiffusionXLControlNetXSPipeline", + ] + ) _import_structure["deepfloyd_if"] = [ "IFImg2ImgPipeline", "IFImg2ImgSuperResolutionPipeline", @@ -302,6 +307,9 @@ StableDiffusionXLControlNetInpaintPipeline, StableDiffusionXLControlNetPipeline, ) + from .controlnet_xs import ( + StableDiffusionXLControlNetXSPipeline, + ) from .deepfloyd_if import ( IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, diff --git a/src/diffusers/pipelines/controlnet_xs/__init__.py b/src/diffusers/pipelines/controlnet_xs/__init__.py new file mode 100644 index 000000000000..abd5fd38b2e1 --- /dev/null +++ b/src/diffusers/pipelines/controlnet_xs/__init__.py @@ -0,0 +1,66 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_flax_available, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_controlnet_xs_sd_xl"] = ["StableDiffusionXLControlNetXSPipeline"] +try: + if not (is_transformers_available() and is_flax_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_flax_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) +else: + pass # _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_controlnet_xs_sd_xl import StableDiffusionXLControlNetXSPipeline + + try: + if not (is_transformers_available() and is_flax_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_flax_and_transformers_objects import * # noqa F403 + else: + pass # from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline + + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 100f921525f1..616beaf945f6 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -1,3 +1,17 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import inspect from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -20,7 +34,7 @@ ) from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers -from ...utils import logging +from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import is_compiled_module, randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion_xl import StableDiffusionXLPipelineOutput @@ -33,12 +47,97 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# todo: Test if this runs +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # !pip install opencv-python transformers accelerate + >>> from diffusers import StableDiffusionXLControlNetXSPipeline, ControlNetXSModel, AutoencoderKL + >>> from diffusers.utils import load_image + >>> import numpy as np + >>> import torch + + >>> import cv2 + >>> from PIL import Image + + >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" + >>> negative_prompt = "low quality, bad quality, sketches" + + >>> # download an image + >>> image = load_image( + ... "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" + ... ) + + >>> # initialize the models and pipeline + >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization + >>> controlnet = ControlNetXSModel.from_pretrained( + ... "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16 + ... ) + >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) + >>> pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16 + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> # get canny image + >>> image = np.array(image) + >>> image = cv2.Canny(image, 100, 200) + >>> image = image[:, :, None] + >>> image = np.concatenate([image, image, image], axis=2) + >>> canny_image = Image.fromarray(image) + + >>> # generate image + >>> image = pipe( + ... prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image + ... ).images[0] + ``` +""" + + class StableDiffusionXLControlNetXSPipeline( DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin ): - model_cpu_offload_seq = ( - "text_encoder->text_encoder_2->unet->vae" # leave controlnet out on purpose because it iterates with unet - ) + r""" + Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
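As a complement to the end-to-end example in the docstring above, a hedged sketch of the single denoising step this pipeline issues under classifier-free guidance, built on the ControlNetXSModel.forward signature introduced earlier in this patch. All names are placeholders prepared as in that example; prompt_embeds is assumed to already stack the negative and positive embeddings, and the controlnet is assumed to have received its base model via controlnet.base_model = unet.

import torch

latent_model_input = torch.cat([latents] * 2)                        # uncond + cond batch
latent_model_input = scheduler.scale_model_input(latent_model_input, t)
noise_pred = controlnet(
    sample=latent_model_input,
    timestep=t,
    encoder_hidden_states=prompt_embeds,
    controlnet_cond=control_image,
    conditioning_scale=0.5,                                          # illustrative value
    added_cond_kwargs={"text_embeds": pooled_prompt_embeds, "time_ids": add_time_ids},
).sample
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)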
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]): + Second frozen text-encoder + ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + tokenizer_2 ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`ControlNetXSModel`]: + Provides additional conditioning to the `unet` during the denoising process. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings should always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + add_watermarker (`bool`, *optional*): + Whether to use the [invisible_watermark](https://github.com/ShieldMnt/invisible-watermark/) library to + watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no + watermarker is used. + """ + # leave controlnet out on purpose because it iterates with unet + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] def __init__( self, @@ -55,6 +154,8 @@ def __init__( ): super().__init__() + # todo: add multi contronet? + self.register_modules( vae=vae, text_encoder=text_encoder, @@ -71,11 +172,48 @@ def __init__( self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False ) - - self.watermark = None + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. 
+ """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt def encode_prompt( self, @@ -91,6 +229,7 @@ def encode_prompt( pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -130,6 +269,9 @@ def encode_prompt( input argument. lora_scale (`float`, *optional*): A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. """ device = device or self._execution_device @@ -139,12 +281,21 @@ def encode_prompt( self._lora_scale = lora_scale # dynamically adjust the LoRA scale - adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) - adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] @@ -157,6 +308,8 @@ def encode_prompt( if prompt_embeds is None: prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + # textual inversion: procecss multi-vector tokens if necessary prompt_embeds_list = [] prompts = [prompt, prompt_2] @@ -184,14 +337,15 @@ def encode_prompt( f" {tokenizer.model_max_length} tokens: {removed_text}" ) - prompt_embeds = text_encoder( - text_input_ids.to(device), - output_hidden_states=True, - ) + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) # We are only ALWAYS interested in the pooled output of the final text encoder pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds.hidden_states[-2] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] prompt_embeds_list.append(prompt_embeds) @@ -206,14 +360,18 @@ def encode_prompt( negative_prompt = negative_prompt or "" negative_prompt_2 = negative_prompt_2 or negative_prompt + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + uncond_tokens: List[str] if prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}." ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt, negative_prompt_2] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" @@ -249,7 +407,11 @@ def encode_prompt( negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) - prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) @@ -258,7 +420,12 @@ def encode_prompt( if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) @@ -270,6 +437,16 @@ def encode_prompt( bs_embed * num_images_per_prompt, -1 ) + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2) + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -290,6 +467,129 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs + def check_inputs( + self, + prompt, + prompt_2, + image, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + ): + 
if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # todo: multi control net? + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetXSModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetXSModel) + ): + self.check_image(image, prompt, prompt_embeds) + # elif # todo: multi control net? 
+ else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetXSModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetXSModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + # elif # todo: multi control net? + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + #if isinstance(self.controlnet, MultiControlNetModel): # todo? + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -374,9 +674,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents.to(device) # scale the initial noise by the standard deviation required by the scheduler - initial_unscaled_latents = latents # Umer: remove here & from return latents = latents * self.scheduler.init_noise_sigma - return latents, initial_unscaled_latents + return latents # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): @@ -415,7 +714,36 @@ def upcast_vae(self): self.vae.decoder.conv_in.to(dtype) self.vae.decoder.mid_block.to(dtype) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
+ """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]] = None, @@ -450,6 +778,7 @@ def __call__( negative_original_size: Optional[Tuple[int, int]] = None, negative_crops_coords_top_left: Tuple[int, int] = (0, 0), negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, ): r""" The call function to the pipeline for generation. @@ -529,8 +858,7 @@ def __call__( [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added - to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set - the corresponding scale as a list. + to the residual in the original `unet`. guess_mode (`bool`, *optional*, defaults to `False`): The ControlNet encoder tries to recognize the content of the input image even if you remove all prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. @@ -567,16 +895,19 @@ def __call__( as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - + Examples: Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] is returned, otherwise a `tuple` is returned containing the output images. """ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + # set current this pipeline's unet as the base model for the controlnet + self.controlnet.base_model = self.unet + # align format for control guidance if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): control_guidance_start = len(control_guidance_end) * [control_guidance_start] @@ -589,21 +920,21 @@ def __call__( ] # 1. Check inputs. Raise error if not correct - #self.check_inputs( - # prompt, - # prompt_2, - # image, - # callback_steps, - # negative_prompt, - # negative_prompt_2, - # prompt_embeds, - # negative_prompt_embeds, - # pooled_prompt_embeds, - # negative_pooled_prompt_embeds, - # controlnet_conditioning_scale, - # control_guidance_start, - # control_guidance_end, - # ) + self.check_inputs( + prompt, + prompt_2, + image, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + ) # 2. 
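The FreeU hooks copied in above simply forward to the UNet. A hypothetical way to toggle them on an assembled ControlNet-XS SDXL pipeline (the scaling factors below are illustrative only; consult the FreeU repository for factors tuned to each model family, and `canny_image` is a placeholder conditioning image):

```py
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.3, b2=1.4)  # attenuate skips, amplify backbone
image = pipe(prompt, image=canny_image).images[0]
pipe.disable_freeu()  # restore the default UNet behavior
```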
Define call parameters if prompt is not None and isinstance(prompt, str): @@ -619,7 +950,8 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - + #todo: if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): ... + global_pool_conditions = ( controlnet.config.global_pool_conditions if isinstance(controlnet, ControlNetXSModel) @@ -649,6 +981,7 @@ def __call__( pooled_prompt_embeds=pooled_prompt_embeds, negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, ) # 4. Prepare image @@ -665,6 +998,7 @@ def __call__( guess_mode=guess_mode, ) height, width = image.shape[-2:] + #elif isinstance(controlnet, MultiControlNetModel): todo? else: assert False @@ -674,7 +1008,8 @@ def __call__( # 6. Prepare latent variables num_channels_latents = self.unet.config.in_channels - latents, initial_unscaled_latents = self.prepare_latents( + + latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, height, @@ -684,9 +1019,6 @@ def __call__( generator, latents, ) - # # DEBUG - if callback is not None: callback(-1, -1, initial_unscaled_latents) - # # # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -743,13 +1075,14 @@ def __call__( # predict the noise residual noise_pred = self.controlnet( - x=latent_model_input, - t=t, + sample=latent_model_input, + timestep=t, encoder_hidden_states=prompt_embeds, - hint=image, # todo: better naming + controlnet_cond=image, + conditioning_scale=controlnet_conditioning_scale, cross_attention_kwargs=cross_attention_kwargs, added_cond_kwargs=added_cond_kwargs, - #return_dict=False, + return_dict=True, ).sample # perform guidance @@ -797,6 +1130,9 @@ def __call__( # Offload all models self.maybe_free_model_hooks() + # remove the base model from controlnet, which we set above + del self.controlnet.base_model + if not return_dict: return (image,) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index e7f333bc62dc..46f715d1fb17 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -261,7 +261,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic if self.config.interpolation_type == "linear": sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - print(f'sigmas after (linear) interpolation: {sigmas[:5]} ...') elif self.config.interpolation_type == "log_linear": sigmas = torch.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp() else: From 73a7b99ac45f330154a49e1d2a6e3f90e1b52ec8 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 13 Nov 2023 19:57:43 +0100 Subject: [PATCH 32/88] Fixed time dtype bug --- src/diffusers/models/controlnetxs.py | 38 +++++++++++-------- .../pipeline_controlnet_xs_sd_xl.py | 2 +- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 8edca311d238..0c465ab55f52 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -272,40 +272,48 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, 
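The denoising loop above chunks the latents for classifier-free guidance before calling the ControlNet-XS model; the "perform guidance" step itself is not shown in this hunk. The standard combination used elsewhere in diffusers pipelines looks like this (a sketch, assuming the usual uncond/text ordering of the batched prediction):

```py
# split the batched prediction and recombine with the guidance scale
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```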
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - no_control=False, guess_mode: bool = False, # todo: understand and implement if required return_dict: bool = True, ) -> Union[ControlNetXSOutput, Tuple]: if self.base_model is None: raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`") + # todo: should scale_list remain an attribute? + scale_list = self.scale_list * 0. + conditioning_scale #x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1) + # todo: check if we need this line. I assume duplication of guiding image is done in pipeline if sample.size(0) // 2 == controlnet_cond.size(0): controlnet_cond = torch.cat([controlnet_cond, controlnet_cond], dim=0) # for classifier free guidance - # todo: Can a tensor with different timesteps be passed? if so, do I need to adapt sth? + # 1. time timesteps=timestep - - if no_control or self.no_control: - return self.base_model(x, timesteps, encoder_hidden_states,cross_attention_kwargs=cross_attention_kwargs,added_cond_kwargs=added_cond_kwargs) + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) - # todo: should scale_list remain an attribute? - scale_list = self.scale_list * 0. + conditioning_scale + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) - # time embeddings - timesteps = timesteps[None] t_emb = get_timestep_embedding( timesteps, self.model_channels, - # # TODO: Undetrstand flip_sin_to_cos / (downscale_)freq_shift flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.freq_shift, ) + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. 
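Pulled out of the diff markers, the timestep handling added above amounts to the following (a sketch only; `sample` is the latent batch the model receives):

```py
import torch

def as_batched_timesteps(timestep, sample):
    timesteps = timestep
    if not torch.is_tensor(timesteps):
        # mps does not support float64/int64, so fall back to 32-bit dtypes there
        is_mps = sample.device.type == "mps"
        if isinstance(timestep, float):
            dtype = torch.float32 if is_mps else torch.float64
        else:
            dtype = torch.int32 if is_mps else torch.int64
        timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
    elif timesteps.ndim == 0:
        timesteps = timesteps[None].to(sample.device)
    # broadcast a single timestep to the whole batch (ONNX/Core ML friendly)
    return timesteps.expand(sample.shape[0])
```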
+ t_emb = t_emb.to(dtype=sample.dtype) + if self.learn_embedding: - udl.log_if('time_proj_ctrl',self.control_model.time_embedding(t_emb), condition='TIME', print_=True) - udl.log_if('time_proj_ctrl_scaled',self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3, condition='TIME', print_=True) - udl.log_if('time_proj_base',self.base_model.time_embedding(t_emb), condition='TIME', print_=True) - udl.log_if('time_proj_base_scaled',self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3), condition='TIME', print_=True) temb = self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3) else: temb = self.base_model.time_embedding(t_emb) @@ -344,7 +352,7 @@ def forward( # text embeddings cemb = encoder_hidden_states - ### + # Preparation guided_hint = self.input_hint_block(controlnet_cond) h_ctrl = h_base = sample diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 616beaf945f6..abf6d74687da 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -71,7 +71,7 @@ >>> # initialize the models and pipeline >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization >>> controlnet = ControlNetXSModel.from_pretrained( - ... "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16 + ... "UmerHA/ConrolNetXS-SDXL-canny", torch_dtype=torch.float16 ... ) >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) >>> pipe = StableDiffusionXLControlNetPipeline.from_pretrained( From e20d4b583d3b5ef873d2f9a12074632e620af4b0 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 14 Nov 2023 00:24:19 +0100 Subject: [PATCH 33/88] checkin --- src/diffusers/models/controlnetxs.py | 510 ++++++++++++++---- .../pipeline_controlnet_xs_sd_xl.py | 1 - src/diffusers/umer_debug_logger.py | 4 +- 3 files changed, 394 insertions(+), 121 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 0c465ab55f52..1df4d22ecf1c 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -16,6 +16,7 @@ from itertools import zip_longest +import math import torch from torch import nn from torch.nn.modules.normalization import GroupNorm @@ -24,7 +25,14 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging -from .embeddings import get_timestep_embedding +from .attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from .embeddings import Timesteps from .modeling_utils import ModelMixin from .lora import LoRACompatibleConv from .unet_2d_blocks import ( @@ -54,40 +62,33 @@ class ControlNetXSModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): # to delete later @classmethod - def create_as_in_paper(cls, base_model=None): - if base_model is None: - # todo: load sdxl instead - base_model = UNet2DConditionModel( - block_out_channels=(320, 640, 1280), - down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), - 
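The `learn_embedding` branch near the top of this hunk blends the control and base time embeddings with weights `s ** 0.3` and `1 - s ** 0.3`. For the `time_control_scale = 0.95` used in `create_as_in_paper`, the split is heavily in favor of the learned control embedding; a quick check of the arithmetic:

```py
s = 0.95                # time_control_scale from create_as_in_paper
w_ctrl = s ** 0.3       # ≈ 0.9847
w_base = 1 - w_ctrl     # ≈ 0.0153
# temb = w_ctrl * control_time_embedding(t_emb) + w_base * base_time_embedding(t_emb)
```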
up_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), - transformer_layers_per_block=(0,2,10), - cross_attention_dim=2048, - ) + def create_as_in_paper(cls, base_model: UNet2DConditionModel): - def class_names(modules): return [m.__class__.__name__ for m in modules] def get_time_emb_dim(unet: UNet2DConditionModel): return unet.time_embedding.linear_2.out_features def get_time_emb_input_dim(unet: UNet2DConditionModel):return unet.time_embedding.linear_1.in_features base_model_channel_sizes = ControlNetXSModel.gather_base_model_sizes(base_model, base_or_control='base') + control_model_ratio = 0.1 + + block_out_channels = [int(c*control_model_ratio)for c in base_model.config.block_out_channels] + dim_attention_heads = 64 + num_attention_heads = [math.ceil(c/dim_attention_heads) for c in block_out_channels] + cnxs_model = cls( - model_channels=320, - out_channels=4, - hint_channels=3, - block_out_channels=(32,64,128), - down_block_types=class_names(base_model.down_blocks), - up_block_types=class_names(base_model.up_blocks), + conditioning_channels=3, + block_out_channels=block_out_channels, + down_block_types=base_model.config.down_block_types, + up_block_types=base_model.config.up_block_types, time_embedding_dim=get_time_emb_dim(base_model), time_embedding_input_dim=get_time_emb_input_dim(base_model), - transformer_layers_per_block=(0,2,10), - cross_attention_dim=2048, + layers_per_block=base_model.config.layers_per_block, + transformer_layers_per_block=base_model.config.transformer_layers_per_block, + cross_attention_dim=base_model.config.cross_attention_dim, learn_embedding=True, - control_model_ratio=0.1, base_model_channel_sizes=base_model_channel_sizes, - control_scale=0.95, - addition_embed_type='text_time', - control_attention_head_dim=64, + addition_embed_type=base_model.config.addition_embed_type, + num_attention_heads=num_attention_heads, ) cnxs_model.base_model = base_model return cnxs_model @@ -128,67 +129,111 @@ def gather_base_model_sizes(cls, unet: UNet2DConditionModel, base_or_control): @register_to_config def __init__( - self, - model_channels=320, - out_channels=4, - hint_channels=3, - block_out_channels=(32,64,128), - down_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), - up_block_types=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), - time_embedding_dim=1280, - time_embedding_input_dim=320, - transformer_layers_per_block=(0,2,10), - cross_attention_dim: Union[int, Tuple[int]] = 2048,#1280, - learn_embedding=False, - control_model_ratio=1.0, - base_model_channel_sizes={ - 'enc': [(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], - 'mid': [(1280, 1280)], - 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] - }, - global_pool_conditions: bool = False, # Todo Umer: Needed by SDXL pipeline, but what is this?, - control_scale=1, - time_control_scale=1, - addition_embed_type: Optional[str] = None, - control_attention_head_dim: Optional[int] = 8, - ): + self, + conditioning_channels: int = 3, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str]=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + up_block_types: Tuple[str]=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int]=(32,64,128), + layers_per_block: int = 2, + downsample_padding: int 
= 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + time_embedding_dim=1280, + time_embedding_input_dim=320, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int]]=(0,2,10), + base_model_channel_sizes: Dict[str, List[Tuple[int]]]={ + 'enc': [(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], + 'mid': [(1280, 1280)], + 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] + }, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + projection_class_embeddings_input_dim: Optional[int] = None, + controlnet_conditioning_channel_order: str = "rgb", + global_pool_conditions: bool = False, + time_control_scale:float=1.0, + learn_embedding: bool =False, + addition_embed_type: Optional[str] = None, + ): super().__init__() - # 1 - Save parameters - # TODO make variables - self.in_ch_factor = 1 if "cat" == 'add' else 2 - self.control_model_ratio = control_model_ratio - self.out_channels = out_channels - self.dims = 2 - self.model_channels = model_channels - self.hint_model = None - self.no_control = False - self.learn_embedding = learn_embedding - - # 1 - Create controller + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." 
+ ) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + + # 1 - Create control unet self.control_model = UNet2DConditionModel( block_out_channels=block_out_channels, down_block_types=down_block_types, up_block_types=up_block_types, time_embedding_dim=time_embedding_dim, + layers_per_block=layers_per_block, transformer_layers_per_block=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, - # Currently, `attention_head_dim` actually describes the numer of attention heads. See https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 - # TODO: How to handle this? - attention_head_dim=[c//control_attention_head_dim for c in block_out_channels], + attention_head_dim=num_attention_heads, + downsample_padding=downsample_padding, + mid_block_scale_factor=mid_block_scale_factor, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + norm_eps=norm_eps, + use_linear_projection=use_linear_projection, + class_embed_type=class_embed_type, + num_class_embeds=num_class_embeds, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, ) # 2 - Do model surgery on control model # 2.1 - Allow to use the same time information as the base model adjust_time_input_dim(self.control_model, time_embedding_input_dim) + # 2.2 - Allow for information infusion from base model - # todo: make variable (sth like zip(block_out_channels[:-1],block_out_channels[1:])) - for i, extra_channels in enumerate(((320, 320), (320,640), (640,1280))): - e1,e2=extra_channels + base_block_out_channels = [sz[1] for sz in base_model_channel_sizes['enc'] if sz[0] != sz[1]] + + extra_channels = list(zip( + base_block_out_channels[0:1] + base_block_out_channels[:-1], + base_block_out_channels + )) + for i, (e1, e2) in enumerate(extra_channels): increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=0, by=e1) increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=1, by=e2) if self.control_model.down_blocks[i].downsamplers: increase_block_input_in_encoder_downsampler(self.control_model, block_no=i, by=e2) - increase_block_input_in_mid_resnet(self.control_model, by=1280) # todo: make var + increase_block_input_in_mid_resnet(self.control_model, by=base_block_out_channels[-1]) # 3 - Gather Channel Sizes self.ch_inout_ctrl = ControlNetXSModel.gather_base_model_sizes(self.control_model, base_or_control='control') @@ -223,7 +268,7 @@ def __init__( # 5 - Create conditioning hint embedding self.input_hint_block = nn.Sequential( - nn.Conv2d(hint_channels, 16, 3, padding=1), + nn.Conv2d(conditioning_channels, 16, 3, padding=1), nn.SiLU(), nn.Conv2d(16, 16, 3, padding=1), nn.SiLU(), @@ -237,29 +282,232 @@ def __init__( nn.SiLU(), nn.Conv2d(96, 256, 3, padding=1, stride=2), nn.SiLU(), - zero_module(nn.Conv2d(256, int(model_channels * self.control_model_ratio), 3, padding=1)) + zero_module(nn.Conv2d(256, block_out_channels[0], 3, padding=1)) ) # 6 - Create time embedding - pass - self.flip_sin_to_cos = True # default params - self.freq_shift = 0 - # !! TODO !! 
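The surgery loop above now derives the infusion channel sizes from `base_model_channel_sizes` instead of hard-coding them. With the default SD-style encoder sizes this reproduces the constants that were removed:

```py
enc = [(4, 320), (320, 320), (320, 320), (320, 320), (320, 640),
       (640, 640), (640, 640), (640, 1280), (1280, 1280)]
base_block_out_channels = [o for i, o in enc if i != o]   # [320, 640, 1280]
extra_channels = list(zip(
    base_block_out_channels[0:1] + base_block_out_channels[:-1],
    base_block_out_channels,
))                                                         # [(320, 320), (320, 640), (640, 1280)]
mid_extra = base_block_out_channels[-1]                    # 1280, used for the mid resnet
```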
: learn_embedding is True, so we need our own embedding - # Edit: That's already part of the ctrl model, even thought it's not used - # Todo: Only when `learn_embedding = False` can we just use the base model's time embedding, otherwise we need to create our own - - # Text embedding - # info: I deleted the encoder_hid_proj as it's not given by the Heidelberg CVL weights - - scale_list = [1.] * len(self.enc_zero_convs_out) + [1.] + [1.] * len(self.dec_zero_convs_out) - self.register_buffer('scale_list', torch.tensor(scale_list) * control_scale) + self.time_proj = Timesteps(time_embedding_input_dim, flip_sin_to_cos, freq_shift) - # in the mininal implementation setting, we only need the control model up to the mid block - # note: these can only be deleted after has to be `gather_base_model_sizes(self.control_mode, 'control')` has been called + # In the mininal implementation setting, we only need the control model up to the mid block del self.control_model.up_blocks del self.control_model.conv_norm_out del self.control_model.conv_out + @classmethod + def from_unet( + cls, + unet: UNet2DConditionModel, + controlnet_conditioning_channel_order: str = "rgb", + block_out_channels: Optional[Tuple[int]] = None, + control_size: Optional[float] = 0.1 + ): + r""" + Instantiate a [`ControlNetXSModel`] from [`UNet2DConditionModel`]. + + Parameters: + unet (`UNet2DConditionModel`): + The UNet model whose configuration are copief to the [`ControlNetXSModel`]. + """ + + fixed_size = block_out_channels is not None + relative_size = control_size is not None + + if not (fixed_size ^ relative_size): + raise ValueError("Exactly one of `block_out_channels` (for absolute sizing) or `control_size` (for relative sizing) must be given to create a controlnetxs model from a unet.") + + if block_out_channels is None: + block_out_channels = [control_size*c for c in unet.config.block_out_channels] + + transformer_layers_per_block = ( + unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1 + ) + encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None + encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None + addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None + addition_time_embed_dim = ( + unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None + ) + + base_model_channel_sizes = ControlNetXSModel.gather_base_model_sizes(unet, base_or_control='base') + + controlnet = cls( + base_model_channel_sizes=base_model_channel_sizes, + addition_time_embed_dim=addition_time_embed_dim, + transformer_layers_per_block=transformer_layers_per_block, + in_channels=unet.config.in_channels, + flip_sin_to_cos=unet.config.flip_sin_to_cos, + freq_shift=unet.config.freq_shift, + down_block_types=unet.config.down_block_types, + only_cross_attention=unet.config.only_cross_attention, + block_out_channels=unet.config.block_out_channels, + layers_per_block=unet.config.layers_per_block, + downsample_padding=unet.config.downsample_padding, + mid_block_scale_factor=unet.config.mid_block_scale_factor, + act_fn=unet.config.act_fn, + norm_num_groups=unet.config.norm_num_groups, + norm_eps=unet.config.norm_eps, + cross_attention_dim=unet.config.cross_attention_dim, + attention_head_dim=unet.config.attention_head_dim, + num_attention_heads=unet.config.num_attention_heads, + use_linear_projection=unet.config.use_linear_projection, + 
class_embed_type=unet.config.class_embed_type, + num_class_embeds=unet.config.num_class_embeds, + upcast_attention=unet.config.upcast_attention, + resnet_time_scale_shift=unet.config.resnet_time_scale_shift, + projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim, + controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, + ) + + return controlnet + + @property + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor( + self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False + ): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor, _remove_lora=_remove_lora) + else: + module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. 
+ """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor, _remove_lora=True) + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. 
+ # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + # Copied from diffusers.models.controlnet.ControlNetModel._set_gradient_checkpointing + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + module.gradient_checkpointing = value def forward( self, @@ -270,21 +518,35 @@ def forward( conditioning_scale: float = 1.0, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, guess_mode: bool = False, # todo: understand and implement if required return_dict: bool = True, ) -> Union[ControlNetXSOutput, Tuple]: if self.base_model is None: - raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel by `cnxs_model.base_model = the_base_model`") + raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel via `cnxs_model.base_model = the_base_model`") - # todo: should scale_list remain an attribute? - scale_list = self.scale_list * 0. + conditioning_scale + # check channel order + channel_order = self.config.controlnet_conditioning_channel_order + + if channel_order == "rgb": + # in rgb order by default + ... + elif channel_order == "bgr": + controlnet_cond = torch.flip(controlnet_cond, dims=[1]) + else: + raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") + + # scale control strength + n_connections = len(self.enc_zero_convs_out) + 1 + len(self.dec_zero_convs_out) + scale_list = torch.full((n_connections,), conditioning_scale) + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) - #x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1) - # todo: check if we need this line. I assume duplication of guiding image is done in pipeline - if sample.size(0) // 2 == controlnet_cond.size(0): controlnet_cond = torch.cat([controlnet_cond, controlnet_cond], dim=0) # for classifier free guidance - # 1. time timesteps=timestep if not torch.is_tensor(timesteps): @@ -302,18 +564,14 @@ def forward( # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timesteps = timesteps.expand(sample.shape[0]) - t_emb = get_timestep_embedding( - timesteps, - self.model_channels, - flip_sin_to_cos=self.flip_sin_to_cos, - downscale_freq_shift=self.freq_shift, - ) + t_emb = self.time_proj(timesteps) + # timesteps does not contain any weights and will always return f32 tensors # but time_embedding might actually be running in fp16. so we need to cast here. # there might be better ways to encapsulate this. 
t_emb = t_emb.to(dtype=sample.dtype) - if self.learn_embedding: + if self.config.learn_embedding: temb = self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3) else: temb = self.base_model.time_embedding(t_emb) @@ -321,7 +579,7 @@ def forward( # added time & text embeddings aug_emb = None if self.config.addition_embed_type == "text": - raise NotImplementedError() + aug_emb = self.base_model.add_embedding(encoder_hidden_states) elif self.config.addition_embed_type == "text_image": raise NotImplementedError() elif self.config.addition_embed_type == "text_time": @@ -341,7 +599,6 @@ def forward( add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) add_embeds = add_embeds.to(temb.dtype) aug_emb = self.base_model.add_embedding(add_embeds) - elif self.config.addition_embed_type == "image": raise NotImplementedError() elif self.config.addition_embed_type == "image_hint": @@ -376,30 +633,43 @@ def forward( hs_base.append(h_base) hs_ctrl.append(h_ctrl) - # 1 - input blocks (encoder) + # 1 - down for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) # A - concat base -> ctrl - h_base = m_base(h_base, temb, cemb) # B - apply base subblock - h_ctrl = m_ctrl(h_ctrl, temb, cemb) # C - apply ctrl subblock + h_base = m_base( # B - apply base subblock + h_base, temb, cemb, + attention_mask, cross_attention_kwargs + ) + h_ctrl = m_ctrl( # C - apply ctrl subblock + h_ctrl, temb, cemb, + attention_mask, cross_attention_kwargs + ) h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base hs_base.append(h_base) hs_ctrl.append(h_ctrl) - h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) - - # 2 - mid blocks (bottleneck) + # 2 - mid + h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) # A - concat base -> ctrl for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - h_base = m_base(h_base, temb, cemb) - h_ctrl = m_ctrl(h_ctrl, temb, cemb) - - h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) + h_base = m_base( # B - apply base subblock + h_base, temb, cemb, + attention_mask, cross_attention_kwargs + ) + h_ctrl = m_ctrl( # C - apply ctrl subblock + h_ctrl, temb, cemb, + attention_mask, cross_attention_kwargs + ) + h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base - # 3 - output blocks (decoder) + # 3 - up for m_base in base_up_subblocks: h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder - h_base = m_base(h_base, temb, cemb) + h_base = m_base( + h_base, temb, cemb, + attention_mask, cross_attention_kwargs + ) h_base = self.base_model.conv_norm_out(h_base) h_base = self.base_model.conv_act(h_base) @@ -410,7 +680,6 @@ def forward( return ControlNetXSOutput(sample=h_base) - def make_zero_conv(self, in_channels, out_channels=None): # keep running track # todo: better comment self.in_channels = in_channels @@ -424,20 +693,25 @@ def __init__(self,ms,*args,**kwargs): if not is_iterable(ms): ms = [ms] super().__init__(ms,*args,**kwargs) - def forward(self,x,temb,cemb): - def cls_name(x): return str(type(x)).split('.')[-1].replace("'>",'') - content = ' '.join(cls_name(m) for m in self) - udl.print_if(f'EmbedSequential.forward with content {content}', 
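The `text_time` branch in this hunk expects SDXL's micro-conditioning inputs to arrive via `added_cond_kwargs`. A hypothetical example of what the pipeline hands in (shapes follow the SDXL convention; the tensors here are placeholders):

```py
import torch

batch = 2
added_cond_kwargs = {
    # pooled output of the second text encoder
    "text_embeds": torch.randn(batch, 1280),
    # original_size + crops_coords_top_left + target_size -> 6 values per sample
    "time_ids": torch.tensor([[1024, 1024, 0, 0, 1024, 1024]] * batch),
}
```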
conditions='SUBBLOCK-MINUS-1') + def forward( + self, + x: torch.Tensor, + temb: torch.Tensor, + cemb: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): for m in self: if isinstance(m,ResnetBlock2D): x = m(x,temb) elif isinstance(m,Transformer2DModel): - x = m(x,cemb).sample + x = m(x,cemb,attention_mask=attention_mask,cross_attention_kwargs=cross_attention_kwargs).sample elif isinstance(m,Downsample2D): x = m(x) elif isinstance(m,Upsample2D): x = m(x) - else: raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`') + else: + raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`') return x diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index abf6d74687da..27768dd5d239 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -47,7 +47,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# todo: Test if this runs EXAMPLE_DOC_STRING = """ Examples: ```py diff --git a/src/diffusers/umer_debug_logger.py b/src/diffusers/umer_debug_logger.py index 391955756c84..e8b63c115682 100644 --- a/src/diffusers/umer_debug_logger.py +++ b/src/diffusers/umer_debug_logger.py @@ -20,6 +20,7 @@ def __init__(self, log_dir='logs', condition=None): self.fields = ['timestamp', 'cls', 'fn', 'shape', 'msg', 'condition', 'tensor_file'] self.create_file() self.warned_of_no_condition = False + print("Info: `UmerDebugLogger` created. This is a logging class that will be deleted when the PR to integrate ControlNet-XS is done.") @property def full_file_path(self): return os.path.join(self.log_dir, self._FILE) @@ -31,7 +32,6 @@ def create_file(self): writer = csv.DictWriter(f, fieldnames=self.fields) writer.writeheader() - def set_dir(self, log_dir, clear=False): self.log_dir = log_dir if clear: self.clear_logs() @@ -94,7 +94,7 @@ def stop_if(self, condition, funny_msg): def maybe_warn_of_no_condition(self): if self.condition is None and not self.warned_of_no_condition : - print("Warning: No condition set for UmerDebugLogger") + print("Info: No condition set for UmerDebugLogger") self.warned_of_no_condition = True def get_log_objects(self): From 56e9b59d989840e6c6b4a99a83288901261ea0b8 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 14 Nov 2023 16:51:34 +0100 Subject: [PATCH 34/88] 1. from_unet, 2. base passed, 3. 
all unet params --- src/diffusers/models/controlnetxs.py | 348 +++++++++--------- src/diffusers/models/unet_2d_condition.py | 6 +- .../pipeline_controlnet_xs_sd_xl.py | 20 +- 3 files changed, 185 insertions(+), 189 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 1df4d22ecf1c..214e529bd4da 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -63,35 +63,13 @@ class ControlNetXSModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): # to delete later @classmethod def create_as_in_paper(cls, base_model: UNet2DConditionModel): - - def get_time_emb_dim(unet: UNet2DConditionModel): return unet.time_embedding.linear_2.out_features - def get_time_emb_input_dim(unet: UNet2DConditionModel):return unet.time_embedding.linear_1.in_features - - base_model_channel_sizes = ControlNetXSModel.gather_base_model_sizes(base_model, base_or_control='base') - - control_model_ratio = 0.1 - - block_out_channels = [int(c*control_model_ratio)for c in base_model.config.block_out_channels] - dim_attention_heads = 64 - num_attention_heads = [math.ceil(c/dim_attention_heads) for c in block_out_channels] - - cnxs_model = cls( - conditioning_channels=3, - block_out_channels=block_out_channels, - down_block_types=base_model.config.down_block_types, - up_block_types=base_model.config.up_block_types, - time_embedding_dim=get_time_emb_dim(base_model), - time_embedding_input_dim=get_time_emb_input_dim(base_model), - layers_per_block=base_model.config.layers_per_block, - transformer_layers_per_block=base_model.config.transformer_layers_per_block, - cross_attention_dim=base_model.config.cross_attention_dim, - learn_embedding=True, - base_model_channel_sizes=base_model_channel_sizes, - addition_embed_type=base_model.config.addition_embed_type, - num_attention_heads=num_attention_heads, + return ControlNetXSModel.from_unet( + base_model, + time_control_scale = 0.95, + learn_embedding = True, + control_model_ratio = 0.1, + dim_attention_heads = 64 ) - cnxs_model.base_model = base_model - return cnxs_model @classmethod def gather_base_model_sizes(cls, unet: UNet2DConditionModel, base_or_control): @@ -129,98 +107,112 @@ def gather_base_model_sizes(cls, unet: UNet2DConditionModel, base_or_control): @register_to_config def __init__( - self, + self, conditioning_channels: int = 3, - flip_sin_to_cos: bool = True, - freq_shift: int = 0, - down_block_types: Tuple[str]=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), - up_block_types: Tuple[str]=("DownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D"), + controlnet_conditioning_channel_order: str = "rgb", + time_embedding_input_dim: int = 320, + time_embedding_dim: int = 1280, + time_control_scale:float=1.0, + learn_embedding: bool =False, + base_model_channel_sizes: Dict[str, List[Tuple[int]]]={ + 'enc': [(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], + 'mid': [(1280, 1280)], + 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] + }, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", 
"CrossAttnUpBlock2D"), only_cross_attention: Union[bool, Tuple[bool]] = False, - block_out_channels: Tuple[int]=(32,64,128), - layers_per_block: int = 2, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, downsample_padding: int = 1, mid_block_scale_factor: float = 1, + dropout: float = 0.0, act_fn: str = "silu", norm_num_groups: Optional[int] = 32, norm_eps: float = 1e-5, - time_embedding_dim=1280, - time_embedding_input_dim=320, cross_attention_dim: Union[int, Tuple[int]] = 1280, - transformer_layers_per_block: Union[int, Tuple[int]]=(0,2,10), - base_model_channel_sizes: Dict[str, List[Tuple[int]]]={ - 'enc': [(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], - 'mid': [(1280, 1280)], - 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] - }, + transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, + reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dual_cross_attention: bool = False, use_linear_projection: bool = False, - class_embed_type: Optional[str] = None, - num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", - projection_class_embeddings_input_dim: Optional[int] = None, - controlnet_conditioning_channel_order: str = "rgb", - global_pool_conditions: bool = False, - time_control_scale:float=1.0, - learn_embedding: bool =False, - addition_embed_type: Optional[str] = None, + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + attention_type: str = "default", + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, ): super().__init__() - # If `num_attention_heads` is not defined (which is the case for most models) - # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. - # The reason for this behavior is to correct for incorrectly named variables that were introduced - # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 - # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking - # which is why we correct for the naming here. - num_attention_heads = num_attention_heads or attention_head_dim - - # Check inputs - if len(block_out_channels) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." - ) - - if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." 
- ) - - if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): - raise ValueError( - f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." - ) - - if isinstance(transformer_layers_per_block, int): - transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) - # 1 - Create control unet self.control_model = UNet2DConditionModel( - block_out_channels=block_out_channels, + sample_size=sample_size, + in_channels=in_channels, + out_channels=out_channels, + center_input_sample=center_input_sample, down_block_types=down_block_types, + mid_block_type=mid_block_type, up_block_types=up_block_types, - time_embedding_dim=time_embedding_dim, + only_cross_attention=only_cross_attention, + block_out_channels=block_out_channels, layers_per_block=layers_per_block, - transformer_layers_per_block=transformer_layers_per_block, - cross_attention_dim=cross_attention_dim, - attention_head_dim=num_attention_heads, downsample_padding=downsample_padding, mid_block_scale_factor=mid_block_scale_factor, + dropout=dropout, act_fn=act_fn, norm_num_groups=norm_num_groups, norm_eps=norm_eps, + cross_attention_dim=cross_attention_dim, + transformer_layers_per_block=transformer_layers_per_block, + reverse_transformer_layers_per_block=reverse_transformer_layers_per_block, + encoder_hid_dim=encoder_hid_dim, + encoder_hid_dim_type=encoder_hid_dim_type, + attention_head_dim=attention_head_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, - class_embed_type=class_embed_type, - num_class_embeds=num_class_embeds, upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, - projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, - ) + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + time_embedding_type=time_embedding_type, + time_embedding_dim=time_embedding_dim, + time_embedding_act_fn=time_embedding_act_fn, + timestep_post_act=timestep_post_act, + time_cond_proj_dim=time_cond_proj_dim, + conv_in_kernel=conv_in_kernel, + conv_out_kernel=conv_out_kernel, + attention_type=attention_type, + mid_block_only_cross_attention=mid_block_only_cross_attention, + cross_attention_norm=cross_attention_norm, + addition_embed_type_num_heads=addition_embed_type_num_heads, + ) # 2 - Do model surgery on control model # 2.1 - Allow to use the same time information as the base model - adjust_time_input_dim(self.control_model, time_embedding_input_dim) + adjust_time_dims(self.control_model, time_embedding_input_dim, time_embedding_dim) # 2.2 - Allow for information infusion from base model base_block_out_channels = [sz[1] for sz in base_model_channel_sizes['enc'] if sz[0] != sz[1]] @@ -285,9 +277,6 @@ def __init__( zero_module(nn.Conv2d(256, block_out_channels[0], 3, padding=1)) ) - # 6 - Create time embedding - self.time_proj = Timesteps(time_embedding_input_dim, flip_sin_to_cos, freq_shift) - # In the mininal implementation setting, we only need the control model up to the mid block del self.control_model.up_blocks del self.control_model.conv_norm_out @@ -298,8 +287,13 @@ def from_unet( cls, unet: UNet2DConditionModel, controlnet_conditioning_channel_order: str = "rgb", + conditioning_channels: int = 3, + time_control_scale: float = 1.0, + learn_embedding: bool = False, block_out_channels: 
Optional[Tuple[int]] = None, - control_size: Optional[float] = 0.1 + control_model_ratio: Optional[float] = None, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dim_attention_heads: Optional[int] = None ): r""" Instantiate a [`ControlNetXSModel`] from [`UNet2DConditionModel`]. @@ -309,56 +303,49 @@ def from_unet( The UNet model whose configuration are copief to the [`ControlNetXSModel`]. """ + # check input fixed_size = block_out_channels is not None - relative_size = control_size is not None - + relative_size = control_model_ratio is not None if not (fixed_size ^ relative_size): - raise ValueError("Exactly one of `block_out_channels` (for absolute sizing) or `control_size` (for relative sizing) must be given to create a controlnetxs model from a unet.") + raise ValueError("Pass exactly one of `block_out_channels` (for absolute sizing) or `control_model_ratio` (for relative sizing).") + if num_attention_heads is not None and dim_attention_heads is not None: + raise ValueError("Pass only one of `num_attention_heads` or `dim_attention_heads`.") + + # create model if block_out_channels is None: - block_out_channels = [control_size*c for c in unet.config.block_out_channels] + block_out_channels = [int(control_model_ratio*c) for c in unet.config.block_out_channels] - transformer_layers_per_block = ( - unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1 - ) - encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None - encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None - addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None - addition_time_embed_dim = ( - unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None - ) + if dim_attention_heads is not None: + num_attention_heads = [math.ceil(c/dim_attention_heads) for c in block_out_channels] - base_model_channel_sizes = ControlNetXSModel.gather_base_model_sizes(unet, base_or_control='base') + def get_time_emb_input_dim(unet: UNet2DConditionModel):return unet.time_embedding.linear_1.in_features + def get_time_emb_dim(unet: UNet2DConditionModel): return unet.time_embedding.linear_2.out_features - controlnet = cls( - base_model_channel_sizes=base_model_channel_sizes, - addition_time_embed_dim=addition_time_embed_dim, - transformer_layers_per_block=transformer_layers_per_block, - in_channels=unet.config.in_channels, - flip_sin_to_cos=unet.config.flip_sin_to_cos, - freq_shift=unet.config.freq_shift, - down_block_types=unet.config.down_block_types, - only_cross_attention=unet.config.only_cross_attention, - block_out_channels=unet.config.block_out_channels, - layers_per_block=unet.config.layers_per_block, - downsample_padding=unet.config.downsample_padding, - mid_block_scale_factor=unet.config.mid_block_scale_factor, - act_fn=unet.config.act_fn, - norm_num_groups=unet.config.norm_num_groups, - norm_eps=unet.config.norm_eps, - cross_attention_dim=unet.config.cross_attention_dim, - attention_head_dim=unet.config.attention_head_dim, - num_attention_heads=unet.config.num_attention_heads, - use_linear_projection=unet.config.use_linear_projection, - class_embed_type=unet.config.class_embed_type, - num_class_embeds=unet.config.num_class_embeds, - upcast_attention=unet.config.upcast_attention, - resnet_time_scale_shift=unet.config.resnet_time_scale_shift, - 
projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim, - controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, + kwargs = dict(unet.config) + kwargs.update(block_out_channels=block_out_channels) + if num_attention_heads is not None: + kwargs.update(attention_head_dim=num_attention_heads) + + to_remove = ( + 'flip_sin_to_cos','freq_shift', + 'addition_embed_type','addition_time_embed_dim', + 'class_embed_type', 'num_class_embeds', 'projection_class_embeddings_input_dim', 'class_embeddings_concat' + ) + for o in to_remove: + del kwargs[o] + + kwargs.update( + conditioning_channels = conditioning_channels, + controlnet_conditioning_channel_order = controlnet_conditioning_channel_order, + time_embedding_input_dim = get_time_emb_input_dim(unet), + time_embedding_dim = get_time_emb_dim(unet), + time_control_scale = time_control_scale, + learn_embedding = learn_embedding, + base_model_channel_sizes = ControlNetXSModel.gather_base_model_sizes(unet, base_or_control='base'), ) - return controlnet + return cls(**kwargs) @property # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors @@ -511,6 +498,7 @@ def _set_gradient_checkpointing(self, module, value=False): def forward( self, + base_model: UNet2DConditionModel, sample: torch.FloatTensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, @@ -524,9 +512,6 @@ def forward( guess_mode: bool = False, # todo: understand and implement if required return_dict: bool = True, ) -> Union[ControlNetXSOutput, Tuple]: - if self.base_model is None: - raise RuntimeError("To use `forward`, first set the base model for this ControlNetXSModel via `cnxs_model.base_model = the_base_model`") - # check channel order channel_order = self.config.controlnet_conditioning_channel_order @@ -564,7 +549,7 @@ def forward( # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timesteps = timesteps.expand(sample.shape[0]) - t_emb = self.time_proj(timesteps) + t_emb = base_model.time_proj(timesteps) # timesteps does not contain any weights and will always return f32 tensors # but time_embedding might actually be running in fp16. so we need to cast here. 
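Editor's note: the hunks above drop the stored `self.base_model` and instead thread the base UNet through `forward` (which also reuses the base model's `time_proj`). A minimal sketch of the resulting calling convention, assuming a pipeline that already holds `unet`, `controlnet`, `latents`, `t`, `prompt_embeds` and a conditioning image `cond_image` (placeholder names, not taken from the patch):

# Hypothetical call site; mirrors the new signature forward(base_model, sample, timestep, ...)
noise_pred = controlnet(
    base_model=unet,                      # the frozen base UNet2DConditionModel, passed per call
    sample=latents,                       # noisy latents for the current denoising step
    timestep=t,
    encoder_hidden_states=prompt_embeds,
    controlnet_cond=cond_image,           # conditioning image, e.g. a canny edge map
    return_dict=False,
)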
@@ -572,37 +557,53 @@ def forward( t_emb = t_emb.to(dtype=sample.dtype) if self.config.learn_embedding: - temb = self.control_model.time_embedding(t_emb) * self.config.time_control_scale ** 0.3 + self.base_model.time_embedding(t_emb) * (1 - self.config.time_control_scale ** 0.3) + ctrl_temb = self.control_model.time_embedding(t_emb, timestep_cond) + base_temb = base_model.time_embedding(t_emb, timestep_cond) + interpolation_param = self.config.time_control_scale ** 0.3 + + temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param) else: - temb = self.base_model.time_embedding(t_emb) + temb = base_model.time_embedding(t_emb) # added time & text embeddings aug_emb = None - if self.config.addition_embed_type == "text": - aug_emb = self.base_model.add_embedding(encoder_hidden_states) - elif self.config.addition_embed_type == "text_image": - raise NotImplementedError() - elif self.config.addition_embed_type == "text_time": - # SDXL - style - if "text_embeds" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" - ) - text_embeds = added_cond_kwargs.get("text_embeds") - if "time_ids" not in added_cond_kwargs: - raise ValueError( - f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" - ) - time_ids = added_cond_kwargs.get("time_ids") - time_embeds = self.base_model.add_time_proj(time_ids.flatten()) - time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) - add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) - add_embeds = add_embeds.to(temb.dtype) - aug_emb = self.base_model.add_embedding(add_embeds) - elif self.config.addition_embed_type == "image": - raise NotImplementedError() - elif self.config.addition_embed_type == "image_hint": - raise NotImplementedError() + + if base_model.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if base_model.config.class_embed_type == "timestep": + class_labels = base_model.time_proj(class_labels) + + class_emb = base_model.class_embedding(class_labels).to(dtype=self.dtype) + temb = temb + class_emb + + if self.config.addition_embed_type is not None: + if self.config.addition_embed_type == "text": + aug_emb = base_model.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + raise NotImplementedError() + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = base_model.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(temb.dtype) + aug_emb = base_model.add_embedding(add_embeds) + elif 
self.config.addition_embed_type == "image": + raise NotImplementedError() + elif self.config.addition_embed_type == "image_hint": + raise NotImplementedError() temb = temb + aug_emb if aug_emb is not None else temb @@ -617,15 +618,15 @@ def forward( it_enc_convs_in, it_enc_convs_out, it_dec_convs_in, it_dec_convs_out = map(iter, (self.enc_zero_convs_in, self.enc_zero_convs_out, self.dec_zero_convs_in, self.dec_zero_convs_out)) scales = iter(scale_list) - base_down_subblocks = to_sub_blocks(self.base_model.down_blocks) + base_down_subblocks = to_sub_blocks(base_model.down_blocks) ctrl_down_subblocks = to_sub_blocks(self.control_model.down_blocks) - base_mid_subblocks = to_sub_blocks([self.base_model.mid_block]) + base_mid_subblocks = to_sub_blocks([base_model.mid_block]) ctrl_mid_subblocks = to_sub_blocks([self.control_model.mid_block]) - base_up_subblocks = to_sub_blocks(self.base_model.up_blocks) + base_up_subblocks = to_sub_blocks(base_model.up_blocks) # Cross Control # 0 - conv in - h_base = self.base_model.conv_in(h_base) + h_base = base_model.conv_in(h_base) h_ctrl = self.control_model.conv_in(h_ctrl) if guided_hint is not None: h_ctrl += guided_hint h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) @@ -671,9 +672,9 @@ def forward( attention_mask, cross_attention_kwargs ) - h_base = self.base_model.conv_norm_out(h_base) - h_base = self.base_model.conv_act(h_base) - h_base = self.base_model.conv_out(h_base) + h_base = base_model.conv_norm_out(h_base) + h_base = base_model.conv_act(h_base) + h_base = base_model.conv_out(h_base) if not return_dict: return h_base @@ -716,9 +717,8 @@ def forward( return x -def adjust_time_input_dim(unet: UNet2DConditionModel, dim: int): - time_emb = unet.time_embedding - time_emb.linear_1 = nn.Linear(dim, time_emb.linear_1.out_features) +def adjust_time_dims(unet: UNet2DConditionModel, in_dim: int, out_dim: int): + unet.time_embedding.linear_1 = nn.Linear(in_dim, out_dim) def increase_block_input_in_encoder_resnet(unet:UNet2DConditionModel, block_no, resnet_idx, by): diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 1a242ff165f6..27752b819747 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -148,9 +148,9 @@ class conditioning with `class_embed_type` equal to `None`. The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. time_cond_proj_dim (`int`, *optional*, defaults to `None`): The dimension of `cond_proj` layer in the timestep embedding. - conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`, - *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, - *optional*): The dimension of the `class_labels` input when + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. + conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. + projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when `class_embed_type="projection"`. class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time embeddings with the class embeddings. 
diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 27768dd5d239..e841bda16bbf 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -904,9 +904,6 @@ def __call__( """ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - # set current this pipeline's unet as the base model for the controlnet - self.controlnet.base_model = self.unet - # align format for control guidance if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): control_guidance_start = len(control_guidance_end) * [control_guidance_start] @@ -951,12 +948,13 @@ def __call__( #todo: if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): ... - global_pool_conditions = ( - controlnet.config.global_pool_conditions - if isinstance(controlnet, ControlNetXSModel) - else controlnet.nets[0].config.global_pool_conditions - ) - guess_mode = guess_mode or global_pool_conditions + # todo umer: understand & implement if needed + # global_pool_conditions = ( + # controlnet.config.global_pool_conditions + # if isinstance(controlnet, ControlNetXSModel) + # else controlnet.nets[0].config.global_pool_conditions + # ) + # guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt text_encoder_lora_scale = ( @@ -1074,6 +1072,7 @@ def __call__( # predict the noise residual noise_pred = self.controlnet( + base_model=self.unet, sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, @@ -1129,9 +1128,6 @@ def __call__( # Offload all models self.maybe_free_model_hooks() - # remove the base model from controlnet, which we set above - del self.controlnet.base_model - if not return_dict: return (image,) From 2cc2cfb52e3239c28ba09ea9be3a4f71ac501c9f Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 14 Nov 2023 17:07:12 +0100 Subject: [PATCH 35/88] checkin --- src/diffusers/models/controlnetxs.py | 41 +++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 214e529bd4da..5ce37e1ebe57 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -57,8 +57,34 @@ class ControlNetXSOutput(BaseOutput): sample: torch.FloatTensor = None +# todo umer: do we need UNet2DConditionLoadersMixin? class ControlNetXSModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): - """A ControlNet-XS model.""" + r""" + A ControlNet-XS model + + This model inherits from [`ModelMixin`], [`ConfigMixin`] and [`UNet2DConditionLoadersMixin`]. + Check the superclass documentation for it's generic methods implemented for all models + (such as downloading or saving). + + Most of parameters for this model are passed into the [`UNet2DConditionModel`] it creates. + Check the documentation of [`UNet2DConditionModel`] for them. + + Parameters: + conditioning_channels (`int`, defaults to 3): + Number of channels of conditioning input (e.g. an image) + controlnet_conditioning_channel_order (`str`, defaults to "rgb"): + todo Channel order for controlnet conditioning, e.g., "rgb". + time_embedding_input_dim (`int`, defaults to 320): + todo Dimension of input for time embedding. 
+ time_embedding_dim (`int`, defaults to 1280): + todo Dimension of time embedding. + time_control_scale (`float`, defaults to 1.0): + todo Scale factor for time control. + learn_embedding (`bool`, defaults to `False`): + ... Flag to determine if embedding is learnable. + base_model_channel_sizes (`Dict[str, List[Tuple[int]]]`): + ... Dictionary mapping base model names to lists of channel size tuples. + """ # to delete later @classmethod @@ -215,6 +241,7 @@ def __init__( adjust_time_dims(self.control_model, time_embedding_input_dim, time_embedding_dim) # 2.2 - Allow for information infusion from base model + # todo umer: the assumption that block sizes = changing subblock sizes is false, eg when we have consecutive blocks of same size base_block_out_channels = [sz[1] for sz in base_model_channel_sizes['enc'] if sz[0] != sz[1]] extra_channels = list(zip( @@ -578,12 +605,12 @@ def forward( class_emb = base_model.class_embedding(class_labels).to(dtype=self.dtype) temb = temb + class_emb - if self.config.addition_embed_type is not None: - if self.config.addition_embed_type == "text": + if base_model.config.addition_embed_type is not None: + if base_model.config.addition_embed_type == "text": aug_emb = base_model.add_embedding(encoder_hidden_states) - elif self.config.addition_embed_type == "text_image": + elif base_model.config.addition_embed_type == "text_image": raise NotImplementedError() - elif self.config.addition_embed_type == "text_time": + elif base_model.config.addition_embed_type == "text_time": # SDXL - style if "text_embeds" not in added_cond_kwargs: raise ValueError( @@ -600,9 +627,9 @@ def forward( add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) add_embeds = add_embeds.to(temb.dtype) aug_emb = base_model.add_embedding(add_embeds) - elif self.config.addition_embed_type == "image": + elif base_model.config.addition_embed_type == "image": raise NotImplementedError() - elif self.config.addition_embed_type == "image_hint": + elif base_model.config.addition_embed_type == "image_hint": raise NotImplementedError() temb = temb + aug_emb if aug_emb is not None else temb From 0bb945781686d39d440c29198b874c9a6ed668e1 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 14 Nov 2023 19:12:18 +0100 Subject: [PATCH 36/88] Finished docstrings --- src/diffusers/models/controlnetxs.py | 118 ++++++++++++++++++++------- 1 file changed, 90 insertions(+), 28 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 5ce37e1ebe57..e73087c3a522 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -72,20 +72,27 @@ class ControlNetXSModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): Parameters: conditioning_channels (`int`, defaults to 3): Number of channels of conditioning input (e.g. an image) - controlnet_conditioning_channel_order (`str`, defaults to "rgb"): - todo Channel order for controlnet conditioning, e.g., "rgb". + controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. time_embedding_input_dim (`int`, defaults to 320): - todo Dimension of input for time embedding. + Dimension of input into time embedding. Needs to be same as in the base model. time_embedding_dim (`int`, defaults to 1280): - todo Dimension of time embedding. - time_control_scale (`float`, defaults to 1.0): - todo Scale factor for time control. + Dimension of output from time embedding. 
Needs to be same as in the base model. learn_embedding (`bool`, defaults to `False`): - ... Flag to determine if embedding is learnable. + Wether to use time embedding of the control model. If yes, the time embedding + is a linear interpolation of the time embeddings of the control and base model + with interpolation parameter `time_control_scale**3`. + time_control_scale (`float`, defaults to 1.0): + Linear interpolation parameter used if `learn_embedding` is `True`. base_model_channel_sizes (`Dict[str, List[Tuple[int]]]`): - ... Dictionary mapping base model names to lists of channel size tuples. + Channel sizes of each subblock of base model. Use `gather_base_model_sizes` on + your base model to compute it. """ + # todo: rename enc/mid/dec in variable names (eg connections) + # todo: is time_control_scale a good name? + # todo: gather_base_model_sizes good name? + # to delete later @classmethod def create_as_in_paper(cls, base_model: UNet2DConditionModel): @@ -102,18 +109,18 @@ def gather_base_model_sizes(cls, unet: UNet2DConditionModel, base_or_control): if base_or_control not in ['base', 'control']: raise ValueError(f"`base_or_control` needs to be either `base` or `control`") - channel_sizes = {'enc': [], 'mid': [], 'dec': []} + channel_sizes = {'down': [], 'mid': [], 'up': []} # input convolution - channel_sizes['enc'].append((unet.conv_in.in_channels, unet.conv_in.out_channels)) + channel_sizes['down'].append((unet.conv_in.in_channels, unet.conv_in.out_channels)) # encoder blocks for module in unet.down_blocks: if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): for r in module.resnets: - channel_sizes['enc'].append((r.in_channels, r.out_channels)) + channel_sizes['down'].append((r.in_channels, r.out_channels)) if module.downsamplers: - channel_sizes['enc'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) + channel_sizes['down'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) else: raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') @@ -125,7 +132,7 @@ def gather_base_model_sizes(cls, unet: UNet2DConditionModel, base_or_control): for module in unet.up_blocks: if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)): for r in module.resnets: - channel_sizes['dec'].append((r.in_channels, r.out_channels)) + channel_sizes['up'].append((r.in_channels, r.out_channels)) else: raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') @@ -141,9 +148,9 @@ def __init__( time_control_scale:float=1.0, learn_embedding: bool =False, base_model_channel_sizes: Dict[str, List[Tuple[int]]]={ - 'enc': [(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], + 'down':[(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], 'mid': [(1280, 1280)], - 'dec': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] + 'up': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] }, sample_size: Optional[int] = None, in_channels: int = 4, @@ -242,7 +249,7 @@ def __init__( # 2.2 - Allow for information infusion from base model # todo umer: the assumption that block sizes = changing subblock sizes is false, eg when we have consecutive blocks of same size - base_block_out_channels = [sz[1] for sz in base_model_channel_sizes['enc'] 
if sz[0] != sz[1]] + base_block_out_channels = [sz[1] for sz in base_model_channel_sizes['down'] if sz[0] != sz[1]] extra_channels = list(zip( base_block_out_channels[0:1] + base_block_out_channels[:-1], @@ -266,23 +273,23 @@ def __init__( self.dec_zero_convs_out = nn.ModuleList([]) self.dec_zero_convs_in = nn.ModuleList([]) - for ch_io_base in self.ch_inout_base['enc']: + for ch_io_base in self.ch_inout_base['down']: self.enc_zero_convs_in.append(self.make_zero_conv( in_channels=ch_io_base[1], out_channels=ch_io_base[1]) ) - for i in range(len(self.ch_inout_ctrl['enc'])): + for i in range(len(self.ch_inout_ctrl['down'])): self.enc_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl['enc'][i][1], self.ch_inout_base['enc'][i][1]) + self.make_zero_conv(self.ch_inout_ctrl['down'][i][1], self.ch_inout_base['down'][i][1]) ) self.middle_block_out = self.make_zero_conv(self.ch_inout_ctrl['mid'][-1][1], self.ch_inout_base['mid'][-1][1]) self.dec_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl['enc'][-1][1], self.ch_inout_base['mid'][-1][1]) + self.make_zero_conv(self.ch_inout_ctrl['down'][-1][1], self.ch_inout_base['mid'][-1][1]) ) - for i in range(1, len(self.ch_inout_ctrl['enc'])): + for i in range(1, len(self.ch_inout_ctrl['down'])): self.dec_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl['enc'][-(i + 1)][1], self.ch_inout_base['dec'][i - 1][1]) + self.make_zero_conv(self.ch_inout_ctrl['down'][-(i + 1)][1], self.ch_inout_base['up'][i - 1][1]) ) # 5 - Create conditioning hint embedding @@ -313,12 +320,12 @@ def __init__( def from_unet( cls, unet: UNet2DConditionModel, - controlnet_conditioning_channel_order: str = "rgb", conditioning_channels: int = 3, - time_control_scale: float = 1.0, + controlnet_conditioning_channel_order: str = "rgb", learn_embedding: bool = False, + time_control_scale: float = 1.0, block_out_channels: Optional[Tuple[int]] = None, - control_model_ratio: Optional[float] = None, + control_model_size_ratio: Optional[float] = None, num_attention_heads: Optional[Union[int, Tuple[int]]] = None, dim_attention_heads: Optional[int] = None ): @@ -327,12 +334,27 @@ def from_unet( Parameters: unet (`UNet2DConditionModel`): - The UNet model whose configuration are copief to the [`ControlNetXSModel`]. + The UNet model we want to control. The dimensions of the ControlNetXSModel will be adapted to it. + conditioning_channels (`int`, defaults to 3): + Number of channels of conditioning input (e.g. an image) + controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + learn_embedding (`bool`, defaults to `False`): + Wether to use time embedding of the control model. If yes, the time embedding + is a linear interpolation of the time embeddings of the control and base model + with interpolation parameter `time_control_scale**3`. + time_control_scale (`float`, defaults to 1.0): + Linear interpolation parameter used if `learn_embedding` is `True`. + block_out_channels (`Tuple[int]`, *optional*): + Down blocks output channels in control model. Either this or `block_out_channels` must be given. + control_model_size_ratio (float, *optional*): + When given, block_out_channels is set to a relative fraction of the base model's block_out_channels. + Either this or `control_model_size_ratio` must be given. 
""" # check input fixed_size = block_out_channels is not None - relative_size = control_model_ratio is not None + relative_size = control_model_size_ratio is not None if not (fixed_size ^ relative_size): raise ValueError("Pass exactly one of `block_out_channels` (for absolute sizing) or `control_model_ratio` (for relative sizing).") @@ -341,7 +363,7 @@ def from_unet( # create model if block_out_channels is None: - block_out_channels = [int(control_model_ratio*c) for c in unet.config.block_out_channels] + block_out_channels = [int(control_model_size_ratio*c) for c in unet.config.block_out_channels] if dim_attention_heads is not None: num_attention_heads = [math.ceil(c/dim_attention_heads) for c in block_out_channels] @@ -539,6 +561,46 @@ def forward( guess_mode: bool = False, # todo: understand and implement if required return_dict: bool = True, ) -> Union[ControlNetXSOutput, Tuple]: + """ + The [`ControlNetModel`] forward method. + + Args: + base_model (`UNet2DConditionModel`): + The base unet model we want to control. + sample (`torch.FloatTensor`): + The noisy input tensor. + timestep (`Union[torch.Tensor, float, int]`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): + The encoder hidden states. + controlnet_cond (`torch.FloatTensor`): + The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. + conditioning_scale (`float`, defaults to `1.0`): + How much the control model affects the base model outputs. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond (`torch.Tensor`, *optional*, defaults to `None`): + Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the + timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep + embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + added_cond_kwargs (`dict`): + Additional conditions for the Stable Diffusion XL UNet. + cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): + A kwargs dictionary that if specified is passed along to the `AttnProcessor`. + # guess_mode (`bool`, defaults to `False`): + # todo + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple. + + Returns: + [`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`: + If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a tuple is + returned where the first element is the sample tensor. 
+ """ # check channel order channel_order = self.config.controlnet_conditioning_channel_order From 39257c59d5c2072d86f9ebc310c3e62bad550f22 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 14 Nov 2023 20:16:17 +0100 Subject: [PATCH 37/88] cleanup --- src/diffusers/models/controlnetxs.py | 96 +++++++++++++++------------- 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index e73087c3a522..f8f0514185ed 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -53,16 +53,23 @@ @dataclass class ControlNetXSOutput(BaseOutput): - # todo: docstring + """ + The output of [`ControlNetXSModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The output of the `ControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to + the base model output, but is already the final output. + """ sample: torch.FloatTensor = None -# todo umer: do we need UNet2DConditionLoadersMixin? -class ControlNetXSModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): +# todo umer: add sth like FromOriginalControlnetMixin +class ControlNetXSModel(ModelMixin, ConfigMixin): r""" A ControlNet-XS model - This model inherits from [`ModelMixin`], [`ConfigMixin`] and [`UNet2DConditionLoadersMixin`]. + This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic methods implemented for all models (such as downloading or saving). @@ -81,31 +88,27 @@ class ControlNetXSModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): learn_embedding (`bool`, defaults to `False`): Wether to use time embedding of the control model. If yes, the time embedding is a linear interpolation of the time embeddings of the control and base model - with interpolation parameter `time_control_scale**3`. - time_control_scale (`float`, defaults to 1.0): + with interpolation parameter `time_embedding_mix**3`. + time_embedding_mix (`float`, defaults to 1.0): Linear interpolation parameter used if `learn_embedding` is `True`. base_model_channel_sizes (`Dict[str, List[Tuple[int]]]`): - Channel sizes of each subblock of base model. Use `gather_base_model_sizes` on + Channel sizes of each subblock of base model. Use `gather_subblock_sizes` on your base model to compute it. """ - # todo: rename enc/mid/dec in variable names (eg connections) - # todo: is time_control_scale a good name? - # todo: gather_base_model_sizes good name? 
- # to delete later @classmethod def create_as_in_paper(cls, base_model: UNet2DConditionModel): return ControlNetXSModel.from_unet( base_model, - time_control_scale = 0.95, + time_embedding_mix = 0.95, learn_embedding = True, - control_model_ratio = 0.1, + control_model_size_ratio = 0.1, dim_attention_heads = 64 ) @classmethod - def gather_base_model_sizes(cls, unet: UNet2DConditionModel, base_or_control): + def gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control): if base_or_control not in ['base', 'control']: raise ValueError(f"`base_or_control` needs to be either `base` or `control`") @@ -145,7 +148,7 @@ def __init__( controlnet_conditioning_channel_order: str = "rgb", time_embedding_input_dim: int = 320, time_embedding_dim: int = 1280, - time_control_scale:float=1.0, + time_embedding_mix:float=1.0, learn_embedding: bool =False, base_model_channel_sizes: Dict[str, List[Tuple[int]]]={ 'down':[(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], @@ -262,33 +265,33 @@ def __init__( increase_block_input_in_mid_resnet(self.control_model, by=base_block_out_channels[-1]) # 3 - Gather Channel Sizes - self.ch_inout_ctrl = ControlNetXSModel.gather_base_model_sizes(self.control_model, base_or_control='control') + self.ch_inout_ctrl = ControlNetXSModel.gather_subblock_sizes(self.control_model, base_or_control='control') self.ch_inout_base = base_model_channel_sizes # 4 - Build connections between base and control model - self.enc_zero_convs_out = nn.ModuleList([]) - self.enc_zero_convs_in = nn.ModuleList([]) + self.down_zero_convs_out = nn.ModuleList([]) + self.down_zero_convs_in = nn.ModuleList([]) self.middle_block_out = nn.ModuleList([]) self.middle_block_in = nn.ModuleList([]) - self.dec_zero_convs_out = nn.ModuleList([]) - self.dec_zero_convs_in = nn.ModuleList([]) + self.up_zero_convs_out = nn.ModuleList([]) + self.up_zero_convs_in = nn.ModuleList([]) for ch_io_base in self.ch_inout_base['down']: - self.enc_zero_convs_in.append(self.make_zero_conv( + self.down_zero_convs_in.append(self.make_zero_conv( in_channels=ch_io_base[1], out_channels=ch_io_base[1]) ) for i in range(len(self.ch_inout_ctrl['down'])): - self.enc_zero_convs_out.append( + self.down_zero_convs_out.append( self.make_zero_conv(self.ch_inout_ctrl['down'][i][1], self.ch_inout_base['down'][i][1]) ) self.middle_block_out = self.make_zero_conv(self.ch_inout_ctrl['mid'][-1][1], self.ch_inout_base['mid'][-1][1]) - self.dec_zero_convs_out.append( + self.up_zero_convs_out.append( self.make_zero_conv(self.ch_inout_ctrl['down'][-1][1], self.ch_inout_base['mid'][-1][1]) ) for i in range(1, len(self.ch_inout_ctrl['down'])): - self.dec_zero_convs_out.append( + self.up_zero_convs_out.append( self.make_zero_conv(self.ch_inout_ctrl['down'][-(i + 1)][1], self.ch_inout_base['up'][i - 1][1]) ) @@ -323,7 +326,7 @@ def from_unet( conditioning_channels: int = 3, controlnet_conditioning_channel_order: str = "rgb", learn_embedding: bool = False, - time_control_scale: float = 1.0, + time_embedding_mix: float = 1.0, block_out_channels: Optional[Tuple[int]] = None, control_model_size_ratio: Optional[float] = None, num_attention_heads: Optional[Union[int, Tuple[int]]] = None, @@ -342,8 +345,8 @@ def from_unet( learn_embedding (`bool`, defaults to `False`): Wether to use time embedding of the control model. If yes, the time embedding is a linear interpolation of the time embeddings of the control and base model - with interpolation parameter `time_control_scale**3`. 
- time_control_scale (`float`, defaults to 1.0): + with interpolation parameter `time_embedding_mix**3`. + time_embedding_mix (`float`, defaults to 1.0): Linear interpolation parameter used if `learn_embedding` is `True`. block_out_channels (`Tuple[int]`, *optional*): Down blocks output channels in control model. Either this or `block_out_channels` must be given. @@ -389,9 +392,9 @@ def get_time_emb_dim(unet: UNet2DConditionModel): return unet.time_embedding.lin controlnet_conditioning_channel_order = controlnet_conditioning_channel_order, time_embedding_input_dim = get_time_emb_input_dim(unet), time_embedding_dim = get_time_emb_dim(unet), - time_control_scale = time_control_scale, + time_embedding_mix = time_embedding_mix, learn_embedding = learn_embedding, - base_model_channel_sizes = ControlNetXSModel.gather_base_model_sizes(unet, base_or_control='base'), + base_model_channel_sizes = ControlNetXSModel.gather_subblock_sizes(unet, base_or_control='base'), ) return cls(**kwargs) @@ -558,7 +561,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - guess_mode: bool = False, # todo: understand and implement if required + guess_mode: bool = False, # todo umer: understand and implement if required return_dict: bool = True, ) -> Union[ControlNetXSOutput, Tuple]: """ @@ -592,7 +595,7 @@ def forward( cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): A kwargs dictionary that if specified is passed along to the `AttnProcessor`. # guess_mode (`bool`, defaults to `False`): - # todo + # todo umer return_dict (`bool`, defaults to `True`): Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple. 
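Editor's note: the connection layers renamed above (`down_zero_convs_*`, `middle_block_out`, `up_zero_convs_*`) are zero-initialized 1x1 convolutions, so the control branch contributes nothing until training updates them. A small self-contained sketch of that idea; `zero_module` below is a plausible stand-in for the helper used in the patch, and the channel count 320 is only an example:

import torch
from torch import nn

def zero_module(module: nn.Module) -> nn.Module:
    # zero-init all parameters so the connection is a no-op at the start of training
    for p in module.parameters():
        nn.init.zeros_(p)
    return module

def make_zero_conv(in_channels: int, out_channels: int) -> nn.Module:
    return zero_module(nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0))

conv = make_zero_conv(320, 320)
h_ctrl = torch.randn(1, 320, 64, 64)
assert torch.count_nonzero(conv(h_ctrl)) == 0  # zero contribution at initialization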
@@ -613,7 +616,7 @@ def forward( raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") # scale control strength - n_connections = len(self.enc_zero_convs_out) + 1 + len(self.dec_zero_convs_out) + n_connections = len(self.down_zero_convs_out) + 1 + len(self.up_zero_convs_out) scale_list = torch.full((n_connections,), conditioning_scale) # prepare attention_mask @@ -648,7 +651,7 @@ def forward( if self.config.learn_embedding: ctrl_temb = self.control_model.time_embedding(t_emb, timestep_cond) base_temb = base_model.time_embedding(t_emb, timestep_cond) - interpolation_param = self.config.time_control_scale ** 0.3 + interpolation_param = self.config.time_embedding_mix ** 0.3 temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param) else: @@ -704,7 +707,7 @@ def forward( h_ctrl = h_base = sample hs_base, hs_ctrl = [], [] - it_enc_convs_in, it_enc_convs_out, it_dec_convs_in, it_dec_convs_out = map(iter, (self.enc_zero_convs_in, self.enc_zero_convs_out, self.dec_zero_convs_in, self.dec_zero_convs_out)) + it_down_convs_in, it_down_convs_out, it_dec_convs_in, it_up_convs_out = map(iter, (self.down_zero_convs_in, self.down_zero_convs_out, self.up_zero_convs_in, self.up_zero_convs_out)) scales = iter(scale_list) base_down_subblocks = to_sub_blocks(base_model.down_blocks) @@ -718,44 +721,44 @@ def forward( h_base = base_model.conv_in(h_base) h_ctrl = self.control_model.conv_in(h_ctrl) if guided_hint is not None: h_ctrl += guided_hint - h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 1 - down for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): - h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) # A - concat base -> ctrl - h_base = m_base( # B - apply base subblock + h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl + h_base = m_base( # B - apply base subblock h_base, temb, cemb, attention_mask, cross_attention_kwargs ) - h_ctrl = m_ctrl( # C - apply ctrl subblock + h_ctrl = m_ctrl( # C - apply ctrl subblock h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs ) - h_base = h_base + next(it_enc_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 2 - mid - h_ctrl = torch.cat([h_ctrl, next(it_enc_convs_in)(h_base)], dim=1) # A - concat base -> ctrl + h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - h_base = m_base( # B - apply base subblock + h_base = m_base( # B - apply base subblock h_base, temb, cemb, attention_mask, cross_attention_kwargs ) - h_ctrl = m_ctrl( # C - apply ctrl subblock + h_ctrl = m_ctrl( # C - apply ctrl subblock h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs ) - h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base + h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base # 3 - up for m_base in base_up_subblocks: - h_base = h_base + next(it_dec_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder - h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder + h_base = h_base + next(it_up_convs_out)(hs_ctrl.pop()) * next(scales) # add info from 
ctrl encoder + h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder h_base = m_base( h_base, temb, cemb, attention_mask, cross_attention_kwargs @@ -771,9 +774,10 @@ def forward( return ControlNetXSOutput(sample=h_base) def make_zero_conv(self, in_channels, out_channels=None): - # keep running track # todo: better comment + # keep running track of channels sizes self.in_channels = in_channels self.out_channels = out_channels or in_channels + return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0)) From e4f412e1c641b666bd0ea9572a6783b2ea9459ac Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 14 Nov 2023 22:58:18 +0100 Subject: [PATCH 38/88] make style --- src/diffusers/models/attention.py | 27 +- src/diffusers/models/controlnetxs.py | 417 ++++++++++-------- src/diffusers/models/resnet.py | 14 +- src/diffusers/models/transformer_2d.py | 6 +- src/diffusers/models/unet_2d_blocks.py | 1 + src/diffusers/models/unet_2d_condition.py | 6 +- src/diffusers/pipelines/__init__.py | 8 +- .../pipelines/controlnet_xs/__init__.py | 4 +- .../pipeline_controlnet_xs_sd_xl.py | 50 +-- .../pipeline_stable_diffusion_xl.py | 12 +- .../schedulers/scheduling_euler_discrete.py | 2 +- src/diffusers/umer_debug_logger.py | 92 ++-- tests/pipelines/controlnetxs/__init__.py | 0 .../controlnetxs/test_controlnetxs_sdxl.py | 390 ++++++++++++++++ 14 files changed, 738 insertions(+), 291 deletions(-) create mode 100644 tests/pipelines/controlnetxs/__init__.py create mode 100644 tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 773a3fc38cca..6abb95a53d84 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -17,6 +17,7 @@ import torch.nn.functional as F from torch import nn +from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND from ..utils.torch_utils import maybe_allow_in_graph from .activations import get_activation @@ -24,7 +25,6 @@ from .embeddings import CombinedTimestepLabelEmbeddings from .lora import LoRACompatibleLinear -from ..umer_debug_logger import udl @maybe_allow_in_graph class GatedSelfAttentionDense(nn.Module): @@ -222,12 +222,12 @@ def forward( attention_mask=attention_mask, **cross_attention_kwargs, ) - udl.log_if('attn1', attn_output, 'SUBBLOCK-MINUS-1') + udl.log_if("attn1", attn_output, "SUBBLOCK-MINUS-1") if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output hidden_states = attn_output + hidden_states - udl.log_if('add attn1', hidden_states, 'SUBBLOCK-MINUS-1') + udl.log_if("add attn1", hidden_states, "SUBBLOCK-MINUS-1") # 2.5 GLIGEN Control if gligen_kwargs is not None: @@ -239,11 +239,16 @@ def forward( norm_hidden_states = ( self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) ) - udl.log_if('norm2', norm_hidden_states, 'SUBBLOCK-MINUS-1') - udl.log_if('context', encoder_hidden_states, 'SUBBLOCK-MINUS-1') - if encoder_attention_mask is not None: print('encoder_attention_mask is not None. Shape = '+str(list(encoder_attention_mask.shape)+'\tvals = '+str(encoder_attention_mask.flatten[:10]))) + udl.log_if("norm2", norm_hidden_states, "SUBBLOCK-MINUS-1") + udl.log_if("context", encoder_hidden_states, "SUBBLOCK-MINUS-1") + if encoder_attention_mask is not None: + print( + "encoder_attention_mask is not None. 
Shape = " + + str(list(encoder_attention_mask.shape) + "\tvals = " + str(encoder_attention_mask.flatten[:10])) + ) if cross_attention_kwargs is not None: - if len(cross_attention_kwargs.keys()) > 0: print('cross_attention_kwargs is not None. Keys = '+str(cross_attention_kwargs.keys())) + if len(cross_attention_kwargs.keys()) > 0: + print("cross_attention_kwargs is not None. Keys = " + str(cross_attention_kwargs.keys())) attn_output = self.attn2( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, @@ -251,8 +256,8 @@ def forward( **cross_attention_kwargs, ) hidden_states = attn_output + hidden_states - udl.log_if('attn2', attn_output, 'SUBBLOCK-MINUS-1') - udl.log_if('add attn2', hidden_states, 'SUBBLOCK-MINUS-1') + udl.log_if("attn2", attn_output, "SUBBLOCK-MINUS-1") + udl.log_if("add attn2", hidden_states, "SUBBLOCK-MINUS-1") # 4. Feed-forward norm_hidden_states = self.norm3(hidden_states) @@ -282,8 +287,8 @@ def forward( ff_output = gate_mlp.unsqueeze(1) * ff_output hidden_states = ff_output + hidden_states - udl.log_if('ff', ff_output, 'SUBBLOCK-MINUS-1') - udl.log_if('add ff', hidden_states, 'SUBBLOCK-MINUS-1') + udl.log_if("ff", ff_output, "SUBBLOCK-MINUS-1") + udl.log_if("add ff", hidden_states, "SUBBLOCK-MINUS-1") return hidden_states diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index f8f0514185ed..88c1a64eef83 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -11,19 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union, Tuple - from itertools import zip_longest +from typing import Any, Dict, List, Optional, Tuple, Union -import math import torch +import torch.utils.checkpoint from torch import nn from torch.nn.modules.normalization import GroupNorm -import torch.utils.checkpoint from ..configuration_utils import ConfigMixin, register_to_config -from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging from .attention_processor import ( ADDED_KV_ATTENTION_PROCESSORS, @@ -32,25 +30,24 @@ AttnAddedKVProcessor, AttnProcessor, ) -from .embeddings import Timesteps -from .modeling_utils import ModelMixin from .lora import LoRACompatibleConv +from .modeling_utils import ModelMixin from .unet_2d_blocks import ( CrossAttnDownBlock2D, - DownBlock2D, CrossAttnUpBlock2D, - UpBlock2D, + DownBlock2D, + Downsample2D, ResnetBlock2D, Transformer2DModel, - Downsample2D, + UpBlock2D, Upsample2D, ) from .unet_2d_condition import UNet2DConditionModel -from ..umer_debug_logger import udl logger = logging.get_logger(__name__) # pylint: disable=invalid-name + @dataclass class ControlNetXSOutput(BaseOutput): """ @@ -58,9 +55,10 @@ class ControlNetXSOutput(BaseOutput): Args: sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - The output of the `ControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to - the base model output, but is already the final output. + The output of the `ControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base model + output, but is already the final output. 
""" + sample: torch.FloatTensor = None @@ -69,12 +67,11 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): r""" A ControlNet-XS model - This model inherits from [`ModelMixin`] and [`ConfigMixin`]. - Check the superclass documentation for it's generic methods implemented for all models - (such as downloading or saving). + This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic + methods implemented for all models (such as downloading or saving). - Most of parameters for this model are passed into the [`UNet2DConditionModel`] it creates. - Check the documentation of [`UNet2DConditionModel`] for them. + Most of parameters for this model are passed into the [`UNet2DConditionModel`] it creates. Check the documentation + of [`UNet2DConditionModel`] for them. Parameters: conditioning_channels (`int`, defaults to 3): @@ -86,14 +83,12 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): time_embedding_dim (`int`, defaults to 1280): Dimension of output from time embedding. Needs to be same as in the base model. learn_embedding (`bool`, defaults to `False`): - Wether to use time embedding of the control model. If yes, the time embedding - is a linear interpolation of the time embeddings of the control and base model - with interpolation parameter `time_embedding_mix**3`. + Wether to use time embedding of the control model. If yes, the time embedding is a linear interpolation of + the time embeddings of the control and base model with interpolation parameter `time_embedding_mix**3`. time_embedding_mix (`float`, defaults to 1.0): Linear interpolation parameter used if `learn_embedding` is `True`. base_model_channel_sizes (`Dict[str, List[Tuple[int]]]`): - Channel sizes of each subblock of base model. Use `gather_subblock_sizes` on - your base model to compute it. + Channel sizes of each subblock of base model. Use `gather_subblock_sizes` on your base model to compute it. 
""" # to delete later @@ -101,59 +96,83 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): def create_as_in_paper(cls, base_model: UNet2DConditionModel): return ControlNetXSModel.from_unet( base_model, - time_embedding_mix = 0.95, - learn_embedding = True, - control_model_size_ratio = 0.1, - dim_attention_heads = 64 + time_embedding_mix=0.95, + learn_embedding=True, + control_model_size_ratio=0.1, + dim_attention_heads=64, ) @classmethod def gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control): - if base_or_control not in ['base', 'control']: - raise ValueError(f"`base_or_control` needs to be either `base` or `control`") + if base_or_control not in ["base", "control"]: + raise ValueError("`base_or_control` needs to be either `base` or `control`") - channel_sizes = {'down': [], 'mid': [], 'up': []} + channel_sizes = {"down": [], "mid": [], "up": []} # input convolution - channel_sizes['down'].append((unet.conv_in.in_channels, unet.conv_in.out_channels)) + channel_sizes["down"].append((unet.conv_in.in_channels, unet.conv_in.out_channels)) # encoder blocks for module in unet.down_blocks: if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): for r in module.resnets: - channel_sizes['down'].append((r.in_channels, r.out_channels)) + channel_sizes["down"].append((r.in_channels, r.out_channels)) if module.downsamplers: - channel_sizes['down'].append((module.downsamplers[0].channels, module.downsamplers[0].out_channels)) + channel_sizes["down"].append( + (module.downsamplers[0].channels, module.downsamplers[0].out_channels) + ) else: - raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + raise ValueError(f"Encountered unknown module of type {type(module)} while creating ControlNet-XS.") # middle block - channel_sizes['mid'].append((unet.mid_block.resnets[0].in_channels, unet.mid_block.resnets[0].out_channels)) + channel_sizes["mid"].append((unet.mid_block.resnets[0].in_channels, unet.mid_block.resnets[0].out_channels)) # decoder blocks - if base_or_control == 'base': + if base_or_control == "base": for module in unet.up_blocks: if isinstance(module, (CrossAttnUpBlock2D, UpBlock2D)): for r in module.resnets: - channel_sizes['up'].append((r.in_channels, r.out_channels)) + channel_sizes["up"].append((r.in_channels, r.out_channels)) else: - raise ValueError(f'Encountered unknown module of type {type(module)} while creating ControlNet-XS.') + raise ValueError( + f"Encountered unknown module of type {type(module)} while creating ControlNet-XS." 
+ ) return channel_sizes @register_to_config def __init__( - self, + self, conditioning_channels: int = 3, controlnet_conditioning_channel_order: str = "rgb", time_embedding_input_dim: int = 320, time_embedding_dim: int = 1280, - time_embedding_mix:float=1.0, - learn_embedding: bool =False, - base_model_channel_sizes: Dict[str, List[Tuple[int]]]={ - 'down':[(4, 320), (320, 320), (320, 320), (320, 320), (320, 640), (640, 640), (640, 640), (640, 1280), (1280, 1280)], - 'mid': [(1280, 1280)], - 'up': [(2560, 1280), (2560, 1280), (1920, 1280), (1920, 640), (1280, 640), (960, 640), (960, 320), (640, 320), (640, 320)] + time_embedding_mix: float = 1.0, + learn_embedding: bool = False, + base_model_channel_sizes: Dict[str, List[Tuple[int]]] = { + "down": [ + (4, 320), + (320, 320), + (320, 320), + (320, 320), + (320, 640), + (640, 640), + (640, 640), + (640, 1280), + (1280, 1280), + ], + "mid": [(1280, 1280)], + "up": [ + (2560, 1280), + (2560, 1280), + (1920, 1280), + (1920, 640), + (1280, 640), + (960, 640), + (960, 320), + (640, 320), + (640, 320), + ], }, sample_size: Optional[int] = None, in_channels: int = 4, @@ -244,28 +263,28 @@ def __init__( mid_block_only_cross_attention=mid_block_only_cross_attention, cross_attention_norm=cross_attention_norm, addition_embed_type_num_heads=addition_embed_type_num_heads, - ) + ) # 2 - Do model surgery on control model # 2.1 - Allow to use the same time information as the base model adjust_time_dims(self.control_model, time_embedding_input_dim, time_embedding_dim) - + # 2.2 - Allow for information infusion from base model # todo umer: the assumption that block sizes = changing subblock sizes is false, eg when we have consecutive blocks of same size - base_block_out_channels = [sz[1] for sz in base_model_channel_sizes['down'] if sz[0] != sz[1]] + base_block_out_channels = [sz[1] for sz in base_model_channel_sizes["down"] if sz[0] != sz[1]] - extra_channels = list(zip( - base_block_out_channels[0:1] + base_block_out_channels[:-1], - base_block_out_channels - )) + extra_channels = list( + zip(base_block_out_channels[0:1] + base_block_out_channels[:-1], base_block_out_channels) + ) for i, (e1, e2) in enumerate(extra_channels): increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=0, by=e1) increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=1, by=e2) - if self.control_model.down_blocks[i].downsamplers: increase_block_input_in_encoder_downsampler(self.control_model, block_no=i, by=e2) + if self.control_model.down_blocks[i].downsamplers: + increase_block_input_in_encoder_downsampler(self.control_model, block_no=i, by=e2) increase_block_input_in_mid_resnet(self.control_model, by=base_block_out_channels[-1]) # 3 - Gather Channel Sizes - self.ch_inout_ctrl = ControlNetXSModel.gather_subblock_sizes(self.control_model, base_or_control='control') + self.ch_inout_ctrl = ControlNetXSModel.gather_subblock_sizes(self.control_model, base_or_control="control") self.ch_inout_base = base_model_channel_sizes # 4 - Build connections between base and control model @@ -276,23 +295,21 @@ def __init__( self.up_zero_convs_out = nn.ModuleList([]) self.up_zero_convs_in = nn.ModuleList([]) - for ch_io_base in self.ch_inout_base['down']: - self.down_zero_convs_in.append(self.make_zero_conv( - in_channels=ch_io_base[1], out_channels=ch_io_base[1]) - ) - for i in range(len(self.ch_inout_ctrl['down'])): + for ch_io_base in self.ch_inout_base["down"]: + self.down_zero_convs_in.append(self.make_zero_conv(in_channels=ch_io_base[1], 
out_channels=ch_io_base[1])) + for i in range(len(self.ch_inout_ctrl["down"])): self.down_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl['down'][i][1], self.ch_inout_base['down'][i][1]) - ) - - self.middle_block_out = self.make_zero_conv(self.ch_inout_ctrl['mid'][-1][1], self.ch_inout_base['mid'][-1][1]) - + self.make_zero_conv(self.ch_inout_ctrl["down"][i][1], self.ch_inout_base["down"][i][1]) + ) + + self.middle_block_out = self.make_zero_conv(self.ch_inout_ctrl["mid"][-1][1], self.ch_inout_base["mid"][-1][1]) + self.up_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl['down'][-1][1], self.ch_inout_base['mid'][-1][1]) + self.make_zero_conv(self.ch_inout_ctrl["down"][-1][1], self.ch_inout_base["mid"][-1][1]) ) - for i in range(1, len(self.ch_inout_ctrl['down'])): + for i in range(1, len(self.ch_inout_ctrl["down"])): self.up_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl['down'][-(i + 1)][1], self.ch_inout_base['up'][i - 1][1]) + self.make_zero_conv(self.ch_inout_ctrl["down"][-(i + 1)][1], self.ch_inout_base["up"][i - 1][1]) ) # 5 - Create conditioning hint embedding @@ -311,9 +328,9 @@ def __init__( nn.SiLU(), nn.Conv2d(96, 256, 3, padding=1, stride=2), nn.SiLU(), - zero_module(nn.Conv2d(256, block_out_channels[0], 3, padding=1)) + zero_module(nn.Conv2d(256, block_out_channels[0], 3, padding=1)), ) - + # In the mininal implementation setting, we only need the control model up to the mid block del self.control_model.up_blocks del self.control_model.conv_norm_out @@ -326,11 +343,11 @@ def from_unet( conditioning_channels: int = 3, controlnet_conditioning_channel_order: str = "rgb", learn_embedding: bool = False, - time_embedding_mix: float = 1.0, + time_embedding_mix: float = 1.0, block_out_channels: Optional[Tuple[int]] = None, control_model_size_ratio: Optional[float] = None, num_attention_heads: Optional[Union[int, Tuple[int]]] = None, - dim_attention_heads: Optional[int] = None + dim_attention_heads: Optional[int] = None, ): r""" Instantiate a [`ControlNetXSModel`] from [`UNet2DConditionModel`]. @@ -343,9 +360,9 @@ def from_unet( controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): The channel order of conditional image. Will convert to `rgb` if it's `bgr`. learn_embedding (`bool`, defaults to `False`): - Wether to use time embedding of the control model. If yes, the time embedding - is a linear interpolation of the time embeddings of the control and base model - with interpolation parameter `time_embedding_mix**3`. + Wether to use time embedding of the control model. If yes, the time embedding is a linear interpolation + of the time embeddings of the control and base model with interpolation parameter + `time_embedding_mix**3`. time_embedding_mix (`float`, defaults to 1.0): Linear interpolation parameter used if `learn_embedding` is `True`. block_out_channels (`Tuple[int]`, *optional*): @@ -359,42 +376,55 @@ def from_unet( fixed_size = block_out_channels is not None relative_size = control_model_size_ratio is not None if not (fixed_size ^ relative_size): - raise ValueError("Pass exactly one of `block_out_channels` (for absolute sizing) or `control_model_ratio` (for relative sizing).") + raise ValueError( + "Pass exactly one of `block_out_channels` (for absolute sizing) or `control_model_ratio` (for relative sizing)." 
+ ) if num_attention_heads is not None and dim_attention_heads is not None: raise ValueError("Pass only one of `num_attention_heads` or `dim_attention_heads`.") # create model if block_out_channels is None: - block_out_channels = [int(control_model_size_ratio*c) for c in unet.config.block_out_channels] + block_out_channels = [int(control_model_size_ratio * c) for c in unet.config.block_out_channels] if dim_attention_heads is not None: - num_attention_heads = [math.ceil(c/dim_attention_heads) for c in block_out_channels] + num_attention_heads = [math.ceil(c / dim_attention_heads) for c in block_out_channels] + + def get_time_emb_input_dim(unet: UNet2DConditionModel): + return unet.time_embedding.linear_1.in_features - def get_time_emb_input_dim(unet: UNet2DConditionModel):return unet.time_embedding.linear_1.in_features - def get_time_emb_dim(unet: UNet2DConditionModel): return unet.time_embedding.linear_2.out_features + def get_time_emb_dim(unet: UNet2DConditionModel): + return unet.time_embedding.linear_2.out_features + # clone params from base unet kwargs = dict(unet.config) kwargs.update(block_out_channels=block_out_channels) if num_attention_heads is not None: kwargs.update(attention_head_dim=num_attention_heads) + # time embedding of control unet is not used. So remove params for them. to_remove = ( - 'flip_sin_to_cos','freq_shift', - 'addition_embed_type','addition_time_embed_dim', - 'class_embed_type', 'num_class_embeds', 'projection_class_embeddings_input_dim', 'class_embeddings_concat' + "flip_sin_to_cos", + "freq_shift", + "addition_embed_type", + "addition_time_embed_dim", + "class_embed_type", + "num_class_embeds", + "projection_class_embeddings_input_dim", + "class_embeddings_concat", ) for o in to_remove: del kwargs[o] + # add controlnetxs-specific params kwargs.update( - conditioning_channels = conditioning_channels, - controlnet_conditioning_channel_order = controlnet_conditioning_channel_order, - time_embedding_input_dim = get_time_emb_input_dim(unet), - time_embedding_dim = get_time_emb_dim(unet), - time_embedding_mix = time_embedding_mix, - learn_embedding = learn_embedding, - base_model_channel_sizes = ControlNetXSModel.gather_subblock_sizes(unet, base_or_control='base'), + conditioning_channels=conditioning_channels, + controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, + time_embedding_input_dim=get_time_emb_input_dim(unet), + time_embedding_dim=get_time_emb_dim(unet), + time_embedding_mix=time_embedding_mix, + learn_embedding=learn_embedding, + base_model_channel_sizes=ControlNetXSModel.gather_subblock_sizes(unet, base_or_control="base"), ) return cls(**kwargs) @@ -561,7 +591,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - guess_mode: bool = False, # todo umer: understand and implement if required + guess_mode: bool = False, # todo umer: understand and implement if required return_dict: bool = True, ) -> Union[ControlNetXSOutput, Tuple]: """ @@ -601,8 +631,8 @@ def forward( Returns: [`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`: - If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a tuple is - returned where the first element is the sample tensor. + If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. 
""" # check channel order channel_order = self.config.controlnet_conditioning_channel_order @@ -618,14 +648,14 @@ def forward( # scale control strength n_connections = len(self.down_zero_convs_out) + 1 + len(self.up_zero_convs_out) scale_list = torch.full((n_connections,), conditioning_scale) - + # prepare attention_mask if attention_mask is not None: attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 attention_mask = attention_mask.unsqueeze(1) # 1. time - timesteps=timestep + timesteps = timestep if not torch.is_tensor(timesteps): # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can # This would be a good case for the `match` statement (Python 3.10+) @@ -651,7 +681,7 @@ def forward( if self.config.learn_embedding: ctrl_temb = self.control_model.time_embedding(t_emb, timestep_cond) base_temb = base_model.time_embedding(t_emb, timestep_cond) - interpolation_param = self.config.time_embedding_mix ** 0.3 + interpolation_param = self.config.time_embedding_mix**0.3 temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param) else: @@ -707,7 +737,9 @@ def forward( h_ctrl = h_base = sample hs_base, hs_ctrl = [], [] - it_down_convs_in, it_down_convs_out, it_dec_convs_in, it_up_convs_out = map(iter, (self.down_zero_convs_in, self.down_zero_convs_out, self.up_zero_convs_in, self.up_zero_convs_out)) + it_down_convs_in, it_down_convs_out, it_dec_convs_in, it_up_convs_out = map( + iter, (self.down_zero_convs_in, self.down_zero_convs_out, self.up_zero_convs_in, self.up_zero_convs_out) + ) scales = iter(scale_list) base_down_subblocks = to_sub_blocks(base_model.down_blocks) @@ -720,49 +752,35 @@ def forward( # 0 - conv in h_base = base_model.conv_in(h_base) h_ctrl = self.control_model.conv_in(h_ctrl) - if guided_hint is not None: h_ctrl += guided_hint + if guided_hint is not None: + h_ctrl += guided_hint h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 1 - down - for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): + for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl - h_base = m_base( # B - apply base subblock - h_base, temb, cemb, - attention_mask, cross_attention_kwargs - ) - h_ctrl = m_ctrl( # C - apply ctrl subblock - h_ctrl, temb, cemb, - attention_mask, cross_attention_kwargs - ) - h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base + h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock + h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 2 - mid - h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl + h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - h_base = m_base( # B - apply base subblock - h_base, temb, cemb, - attention_mask, cross_attention_kwargs - ) - h_ctrl = m_ctrl( # C - apply ctrl subblock - h_ctrl, temb, cemb, - attention_mask, cross_attention_kwargs - ) - h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base - + h_base = m_base(h_base, temb, cemb, attention_mask, 
cross_attention_kwargs) # B - apply base subblock + h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock + h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base + # 3 - up for m_base in base_up_subblocks: - h_base = h_base + next(it_up_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder - h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder - h_base = m_base( - h_base, temb, cemb, - attention_mask, cross_attention_kwargs - ) + h_base = h_base + next(it_up_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder + h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder + h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) h_base = base_model.conv_norm_out(h_base) h_base = base_model.conv_act(h_base) @@ -770,7 +788,7 @@ def forward( if not return_dict: return h_base - + return ControlNetXSOutput(sample=h_base) def make_zero_conv(self, in_channels, out_channels=None): @@ -783,10 +801,12 @@ def make_zero_conv(self, in_channels, out_channels=None): class EmbedSequential(nn.ModuleList): """Sequential module passing embeddings (time and conditioning) to children if they support it.""" - def __init__(self,ms,*args,**kwargs): - if not is_iterable(ms): ms = [ms] - super().__init__(ms,*args,**kwargs) - + + def __init__(self, ms, *args, **kwargs): + if not is_iterable(ms): + ms = [ms] + super().__init__(ms, *args, **kwargs) + def forward( self, x: torch.Tensor, @@ -796,16 +816,18 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): for m in self: - if isinstance(m,ResnetBlock2D): - x = m(x,temb) - elif isinstance(m,Transformer2DModel): - x = m(x,cemb,attention_mask=attention_mask,cross_attention_kwargs=cross_attention_kwargs).sample - elif isinstance(m,Downsample2D): + if isinstance(m, ResnetBlock2D): + x = m(x, temb) + elif isinstance(m, Transformer2DModel): + x = m(x, cemb, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs).sample + elif isinstance(m, Downsample2D): x = m(x) - elif isinstance(m,Upsample2D): + elif isinstance(m, Upsample2D): x = m(x) else: - raise ValueError(f'Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`') + raise ValueError( + f"Type of m is {type(m)} but should be `ResnetBlock2D`, `Transformer2DModel`, `Downsample2D` or `Upsample2D`" + ) return x @@ -814,32 +836,36 @@ def adjust_time_dims(unet: UNet2DConditionModel, in_dim: int, out_dim: int): unet.time_embedding.linear_1 = nn.Linear(in_dim, out_dim) -def increase_block_input_in_encoder_resnet(unet:UNet2DConditionModel, block_no, resnet_idx, by): +def increase_block_input_in_encoder_resnet(unet: UNet2DConditionModel, block_no, resnet_idx, by): """Increase channels sizes to allow for additional concatted information from base model""" - r=unet.down_blocks[block_no].resnets[resnet_idx] - old_norm1, old_conv1, old_conv_shortcut = r.norm1,r.conv1,r.conv_shortcut + r = unet.down_blocks[block_no].resnets[resnet_idx] + old_norm1, old_conv1 = r.norm1, r.conv1 # norm - norm_args = 'num_groups num_channels eps affine'.split(' ') - for a in norm_args: assert hasattr(old_norm1, a) - norm_kwargs = { a: getattr(old_norm1, a) for a in norm_args } - norm_kwargs['num_channels'] += by # surgery done here + norm_args = "num_groups num_channels eps affine".split(" ") + for a in norm_args: + assert hasattr(old_norm1, a) + 
norm_kwargs = {a: getattr(old_norm1, a) for a in norm_args} + norm_kwargs["num_channels"] += by # surgery done here # conv1 - conv1_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') - for a in conv1_args: assert hasattr(old_conv1, a) - conv1_kwargs = { a: getattr(old_conv1, a) for a in conv1_args } - conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. - conv1_kwargs['in_channels'] += by # surgery done here + conv1_args = ( + "in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer".split(" ") + ) + for a in conv1_args: + assert hasattr(old_conv1, a) + conv1_kwargs = {a: getattr(old_conv1, a) for a in conv1_args} + conv1_kwargs["bias"] = "bias" in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. + conv1_kwargs["in_channels"] += by # surgery done here # conv_shortcut # as we changed the input size of the block, the input and output sizes are likely different, - # therefore we need a conv_shortcut (simply adding won't work) - conv_shortcut_args_kwargs = { - 'in_channels': conv1_kwargs['in_channels'], - 'out_channels': conv1_kwargs['out_channels'], + # therefore we need a conv_shortcut (simply adding won't work) + conv_shortcut_args_kwargs = { + "in_channels": conv1_kwargs["in_channels"], + "out_channels": conv1_kwargs["out_channels"], # default arguments from resnet.__init__ - 'kernel_size':1, - 'stride':1, - 'padding':0, - 'bias':True + "kernel_size": 1, + "stride": 1, + "padding": 0, + "bias": True, } # swap old with new modules unet.down_blocks[block_no].resnets[resnet_idx].norm1 = GroupNorm(**norm_kwargs) @@ -848,46 +874,53 @@ def increase_block_input_in_encoder_resnet(unet:UNet2DConditionModel, block_no, unet.down_blocks[block_no].resnets[resnet_idx].in_channels += by # surgery done here -def increase_block_input_in_encoder_downsampler(unet:UNet2DConditionModel, block_no, by): +def increase_block_input_in_encoder_downsampler(unet: UNet2DConditionModel, block_no, by): """Increase channels sizes to allow for additional concatted information from base model""" - old_down=unet.down_blocks[block_no].downsamplers[0].conv + old_down = unet.down_blocks[block_no].downsamplers[0].conv # conv1 - args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') - for a in args: assert hasattr(old_down, a) - kwargs = { a: getattr(old_down, a) for a in args} - kwargs['bias'] = 'bias' in kwargs # as param, bias is a boolean, but as attr, it's a tensor. - kwargs['in_channels'] += by # surgery done here + args = "in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer".split( + " " + ) + for a in args: + assert hasattr(old_down, a) + kwargs = {a: getattr(old_down, a) for a in args} + kwargs["bias"] = "bias" in kwargs # as param, bias is a boolean, but as attr, it's a tensor. 
+ kwargs["in_channels"] += by # surgery done here # swap old with new modules unet.down_blocks[block_no].downsamplers[0].conv = LoRACompatibleConv(**kwargs) unet.down_blocks[block_no].downsamplers[0].channels += by # surgery done here -def increase_block_input_in_mid_resnet(unet:UNet2DConditionModel, by): +def increase_block_input_in_mid_resnet(unet: UNet2DConditionModel, by): """Increase channels sizes to allow for additional concatted information from base model""" - m=unet.mid_block.resnets[0] - old_norm1, old_conv1, old_conv_shortcut = m.norm1,m.conv1,m.conv_shortcut + m = unet.mid_block.resnets[0] + old_norm1, old_conv1 = m.norm1, m.conv1 # norm - norm_args = 'num_groups num_channels eps affine'.split(' ') - for a in norm_args: assert hasattr(old_norm1, a) - norm_kwargs = { a: getattr(old_norm1, a) for a in norm_args } - norm_kwargs['num_channels'] += by # surgery done here + norm_args = "num_groups num_channels eps affine".split(" ") + for a in norm_args: + assert hasattr(old_norm1, a) + norm_kwargs = {a: getattr(old_norm1, a) for a in norm_args} + norm_kwargs["num_channels"] += by # surgery done here # conv1 - conv1_args = 'in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer'.split(' ') - for a in conv1_args: assert hasattr(old_conv1, a) - conv1_kwargs = { a: getattr(old_conv1, a) for a in conv1_args } - conv1_kwargs['bias'] = 'bias' in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. - conv1_kwargs['in_channels'] += by # surgery done here + conv1_args = ( + "in_channels out_channels kernel_size stride padding dilation groups bias padding_mode lora_layer".split(" ") + ) + for a in conv1_args: + assert hasattr(old_conv1, a) + conv1_kwargs = {a: getattr(old_conv1, a) for a in conv1_args} + conv1_kwargs["bias"] = "bias" in conv1_kwargs # as param, bias is a boolean, but as attr, it's a tensor. 
+ conv1_kwargs["in_channels"] += by # surgery done here # conv_shortcut # as we changed the input size of the block, the input and output sizes are likely different, - # therefore we need a conv_shortcut (simply adding won't work) - conv_shortcut_args_kwargs = { - 'in_channels': conv1_kwargs['in_channels'], - 'out_channels': conv1_kwargs['out_channels'], + # therefore we need a conv_shortcut (simply adding won't work) + conv_shortcut_args_kwargs = { + "in_channels": conv1_kwargs["in_channels"], + "out_channels": conv1_kwargs["out_channels"], # default arguments from resnet.__init__ - 'kernel_size':1, - 'stride':1, - 'padding':0, - 'bias':True + "kernel_size": 1, + "stride": 1, + "padding": 0, + "bias": True, } # swap old with new modules unet.mid_block.resnets[0].norm1 = GroupNorm(**norm_kwargs) @@ -897,7 +930,8 @@ def increase_block_input_in_mid_resnet(unet:UNet2DConditionModel, by): def is_iterable(o): - if isinstance(o, str): return False + if isinstance(o, str): + return False try: iter(o) return True @@ -906,22 +940,25 @@ def is_iterable(o): def to_sub_blocks(blocks): - if not is_iterable(blocks): blocks = [blocks] + if not is_iterable(blocks): + blocks = [blocks] sub_blocks = [] for b in blocks: current_subblocks = [] - if hasattr(b, 'resnets'): - if hasattr(b, 'attentions') and b.attentions is not None: + if hasattr(b, "resnets"): + if hasattr(b, "attentions") and b.attentions is not None: current_subblocks = list(zip_longest(b.resnets, b.attentions)) - # if we have 1 more resnets than attentions, let the last subblock only be the resnet, not (resnet, None) + # if we have 1 more resnets than attentions, let the last subblock only be the resnet, not (resnet, None) if current_subblocks[-1][1] is None: current_subblocks[-1] = current_subblocks[-1][0] else: current_subblocks = list(b.resnets) # upsamplers are part of the same block # q: what if we have multiple upsamplers? - if hasattr(b, 'upsamplers') and b.upsamplers is not None: current_subblocks[-1] = list(current_subblocks[-1]) + list(b.upsamplers) + if hasattr(b, "upsamplers") and b.upsamplers is not None: + current_subblocks[-1] = list(current_subblocks[-1]) + list(b.upsamplers) # downsamplers are own block - if hasattr(b, 'downsamplers') and b.downsamplers is not None: current_subblocks.append(list(b.downsamplers)) + if hasattr(b, "downsamplers") and b.downsamplers is not None: + current_subblocks.append(list(b.downsamplers)) sub_blocks += current_subblocks return list(map(EmbedSequential, sub_blocks)) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index dc3be9464876..0e6d999e5efe 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,13 +20,13 @@ import torch.nn as nn import torch.nn.functional as F +from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND from .activations import get_activation from .attention import AdaGroupNorm from .attention_processor import SpatialNorm from .lora import LoRACompatibleConv, LoRACompatibleLinear -from ..umer_debug_logger import udl class Upsample1D(nn.Module): """A 1D upsampling layer with an optional convolution. 
@@ -206,7 +206,7 @@ def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None else: hidden_states = self.Conv2d_0(hidden_states) - udl.log_if('conv',hidden_states, 'SUBBLOCK-MINUS-1') + udl.log_if("conv", hidden_states, "SUBBLOCK-MINUS-1") return hidden_states @@ -276,7 +276,7 @@ def forward(self, hidden_states, scale: float = 1.0): else: hidden_states = self.conv(hidden_states) - udl.log_if('conv',hidden_states, 'SUBBLOCK-MINUS-1') + udl.log_if("conv", hidden_states, "SUBBLOCK-MINUS-1") return hidden_states @@ -725,7 +725,7 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states) - udl.log_if('conv1', hidden_states, condition='SUBBLOCK-MINUS-1') + udl.log_if("conv1", hidden_states, condition="SUBBLOCK-MINUS-1") if self.time_emb_proj is not None: if not self.skip_time_act: @@ -738,7 +738,7 @@ def forward(self, input_tensor, temb, scale: float = 1.0): if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - udl.log_if('add time_emb_proj', hidden_states, condition='SUBBLOCK-MINUS-1') + udl.log_if("add time_emb_proj", hidden_states, condition="SUBBLOCK-MINUS-1") if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) @@ -752,7 +752,7 @@ def forward(self, input_tensor, temb, scale: float = 1.0): hidden_states = self.nonlinearity(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) - udl.log_if('conv2', hidden_states, condition='SUBBLOCK-MINUS-1') + udl.log_if("conv2", hidden_states, condition="SUBBLOCK-MINUS-1") if self.conv_shortcut is not None: input_tensor = ( @@ -760,7 +760,7 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) output_tensor = (input_tensor + hidden_states) / self.output_scale_factor - udl.log_if('add conv_shortcut', output_tensor, condition='SUBBLOCK-MINUS-1') + udl.log_if("add conv_shortcut", output_tensor, condition="SUBBLOCK-MINUS-1") return output_tensor diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 0063099fdc67..ea0cbeb20b03 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -20,13 +20,13 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..models.embeddings import ImagePositionalEmbeddings +from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND, BaseOutput, deprecate from .attention import BasicTransformerBlock from .embeddings import PatchEmbed from .lora import LoRACompatibleConv, LoRACompatibleLinear from .modeling_utils import ModelMixin -from ..umer_debug_logger import udl @dataclass class Transformer2DModelOutput(BaseOutput): @@ -317,7 +317,7 @@ def forward( elif self.is_input_patches: hidden_states = self.pos_embed(hidden_states) - udl.log_if('proj_in', hidden_states, condition='SUBBLOCK-MINUS-1') + udl.log_if("proj_in", hidden_states, condition="SUBBLOCK-MINUS-1") # 2. 
Blocks for block in self.transformer_blocks: @@ -389,7 +389,7 @@ def forward( shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) ) - udl.log_if('proj_out', output, condition='SUBBLOCK-MINUS-1') + udl.log_if("proj_out", output, condition="SUBBLOCK-MINUS-1") if not return_dict: return (output,) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index 7ff8e3968595..ebd45c09ae33 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -30,6 +30,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name + def get_down_block( down_block_type, num_layers, diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 27752b819747..1a242ff165f6 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -148,9 +148,9 @@ class conditioning with `class_embed_type` equal to `None`. The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. time_cond_proj_dim (`int`, *optional*, defaults to `None`): The dimension of `cond_proj` layer in the timestep embedding. - conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. - conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. - projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`, + *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, + *optional*): The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when `class_embed_type="projection"`. class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time embeddings with the class embeddings. 
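The `udl.log_if(..., "SUBBLOCK-MINUS-1")` calls added to `resnet.py` and `transformer_2d.py` above only record anything when the logger's active condition matches. A minimal sketch of how such a debugging session might look, using the `UmerDebugLogger` API that appears later in this patch (`set_dir`, `set_condition`, `get_log_objects`); the log directory name is purely illustrative:

    from diffusers.umer_debug_logger import udl

    udl.set_dir("logs/cnxs-debug", clear=True)   # where the CSV index and tensor files are written
    udl.set_condition("SUBBLOCK-MINUS-1")        # only log_if calls with this condition are recorded

    # ... run a pipeline / forward pass here; every matching udl.log_if(msg, tensor, condition)
    # call appends a CSV row and saves the logged tensor to disk ...

    for entry in udl.get_log_objects():          # one SimpleNamespace per logged row
        print(entry.cls, entry.fn, entry.msg, entry.shape, entry.head)
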
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 404c785e1b22..643d19c52c8e 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -17,7 +17,13 @@ # These modules contain pipelines from multiple libraries/frameworks _dummy_objects = {} -_import_structure = {"stable_diffusion": [], "stable_diffusion_xl": [], "latent_diffusion": [], "controlnet": [], "controlnet_xs": []} +_import_structure = { + "controlnet": [], + "controlnet_xs": [], + "latent_diffusion": [], + "stable_diffusion": [], + "stable_diffusion_xl": [], +} try: if not is_torch_available(): diff --git a/src/diffusers/pipelines/controlnet_xs/__init__.py b/src/diffusers/pipelines/controlnet_xs/__init__.py index abd5fd38b2e1..669dc0419456 100644 --- a/src/diffusers/pipelines/controlnet_xs/__init__.py +++ b/src/diffusers/pipelines/controlnet_xs/__init__.py @@ -31,7 +31,7 @@ _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) else: - pass # _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"] + pass # _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -50,7 +50,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_flax_and_transformers_objects import * # noqa F403 else: - pass # from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline + pass # from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline else: diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index e841bda16bbf..978c161296f9 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -69,11 +69,9 @@ >>> # initialize the models and pipeline >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization - >>> controlnet = ControlNetXSModel.from_pretrained( - ... "UmerHA/ConrolNetXS-SDXL-canny", torch_dtype=torch.float16 - ... ) + >>> controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-canny", torch_dtype=torch.float16) >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) - >>> pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + >>> pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( ... "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16 ... ) >>> pipe.enable_model_cpu_offload() @@ -172,7 +170,7 @@ def __init__( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False ) add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() - + if add_watermarker: self.watermark = StableDiffusionXLWatermarker() else: @@ -577,7 +575,7 @@ def check_inputs( f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." ) - #if isinstance(self.controlnet, MultiControlNetModel): # todo? + # if isinstance(self.controlnet, MultiControlNetModel): # todo? for start, end in zip(control_guidance_start, control_guidance_end): if start >= end: @@ -894,13 +892,13 @@ def __call__( as the `target_size` for most cases. 
Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. - + Examples: Returns: [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] is returned, - otherwise a `tuple` is returned containing the output images. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] is + returned, otherwise a `tuple` is returned containing the output images. """ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet @@ -910,26 +908,26 @@ def __call__( elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): control_guidance_end = len(control_guidance_start) * [control_guidance_end] elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): - mult = 1 # len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + mult = 1 # len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [ control_guidance_end ] # 1. Check inputs. Raise error if not correct self.check_inputs( - prompt, - prompt_2, - image, - callback_steps, - negative_prompt, - negative_prompt_2, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - controlnet_conditioning_scale, - control_guidance_start, - control_guidance_end, + prompt, + prompt_2, + image, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, ) # 2. Define call parameters @@ -946,8 +944,8 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - #todo: if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): ... - + # todo: if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): ... + # todo umer: understand & implement if needed # global_pool_conditions = ( # controlnet.config.global_pool_conditions @@ -995,7 +993,7 @@ def __call__( guess_mode=guess_mode, ) height, width = image.shape[-2:] - #elif isinstance(controlnet, MultiControlNetModel): todo? + # elif isinstance(controlnet, MultiControlNetModel): todo? 
else: assert False diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index c1a3bc7b9a96..99b63634d41c 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -926,14 +926,14 @@ def __call__( # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - print(f'latents.shape={list(latent_model_input.shape)} | ', end='') - print(f't={t} | ', end='') - print(f'enc_h.shape={list(prompt_embeds.shape)} | ', end='') + print(f"latents.shape={list(latent_model_input.shape)} | ", end="") + print(f"t={t} | ", end="") + print(f"enc_h.shape={list(prompt_embeds.shape)} | ", end="") if cross_attention_kwargs is not None: - print(f'cross_attn_kw.keys={list(cross_attention_kwargs.keys())} | ', end='') + print(f"cross_attn_kw.keys={list(cross_attention_kwargs.keys())} | ", end="") else: - print(f'cross_attn_kw is None | ', end='') - print(f'added_cond_kw.keys={list(added_cond_kwargs.keys())}') + print("cross_attn_kw is None | ", end="") + print(f"added_cond_kw.keys={list(added_cond_kwargs.keys())}") noise_pred = self.unet( latent_model_input, diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 46f715d1fb17..0130a16fb11a 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -169,7 +169,7 @@ def __init__( sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = torch.from_numpy(sigmas) - #print(f'At the end of __init__, the sigmas are {self.sigmas[:5]} ...') + # print(f'At the end of __init__, the sigmas are {self.sigmas[:5]} ...') # setable values self.num_inference_steps = None diff --git a/src/diffusers/umer_debug_logger.py b/src/diffusers/umer_debug_logger.py index e8b63c115682..0fda24a9d277 100644 --- a/src/diffusers/umer_debug_logger.py +++ b/src/diffusers/umer_debug_logger.py @@ -1,40 +1,43 @@ # Logger to help me (UmerHA) debug controlnet-xs -import os import csv -import torch import inspect -import logging +import os import shutil +from datetime import datetime from types import SimpleNamespace -from datetime import datetime +import torch -class UmerDebugLogger: - _FILE = 'udl.csv' +class UmerDebugLogger: + _FILE = "udl.csv" - def __init__(self, log_dir='logs', condition=None): + def __init__(self, log_dir="logs", condition=None): self.log_dir, self.condition, self.tensor_counter = log_dir, condition, 0 os.makedirs(log_dir, exist_ok=True) - self.fields = ['timestamp', 'cls', 'fn', 'shape', 'msg', 'condition', 'tensor_file'] + self.fields = ["timestamp", "cls", "fn", "shape", "msg", "condition", "tensor_file"] self.create_file() self.warned_of_no_condition = False - print("Info: `UmerDebugLogger` created. This is a logging class that will be deleted when the PR to integrate ControlNet-XS is done.") + print( + "Info: `UmerDebugLogger` created. This is a logging class that will be deleted when the PR to integrate ControlNet-XS is done." 
+ ) @property - def full_file_path(self): return os.path.join(self.log_dir, self._FILE) + def full_file_path(self): + return os.path.join(self.log_dir, self._FILE) def create_file(self): - file = self.full_file_path + file = self.full_file_path if not os.path.isfile(file): - with open(file, 'w', newline='') as f: + with open(file, "w", newline="") as f: writer = csv.DictWriter(f, fieldnames=self.fields) writer.writeheader() def set_dir(self, log_dir, clear=False): self.log_dir = log_dir - if clear: self.clear_logs() + if clear: + self.clear_logs() self.create_file() def clear_logs(self): @@ -42,21 +45,25 @@ def clear_logs(self): os.makedirs(self.log_dir, exist_ok=True) self.create_file() - def set_condition(self, condition): self.condition = condition + def set_condition(self, condition): + self.condition = condition def log_if(self, msg, t, condition, *, print_=False): self.maybe_warn_of_no_condition() - + # Use inspect to get the current frame and then go back one level to find caller frame = inspect.currentframe() caller_frame = frame.f_back caller_info = inspect.getframeinfo(caller_frame) # Extract class and function name from the caller - cls_name = caller_frame.f_locals.get('self', None).__class__.__name__ if 'self' in caller_frame.f_locals else None + cls_name = ( + caller_frame.f_locals.get("self", None).__class__.__name__ if "self" in caller_frame.f_locals else None + ) function_name = caller_info.function - if not hasattr(t, 'shape'): t = torch.tensor(t) + if not hasattr(t, "shape"): + t = torch.tensor(t) t = t.cpu().detach() if condition == self.condition: @@ -67,25 +74,28 @@ def log_if(self, msg, t, condition, *, print_=False): # Log information to CSV log_info = { - 'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - 'cls': cls_name, - 'fn': function_name, - 'shape': str(list(t.shape)), - 'msg': msg, - 'condition': condition, - 'tensor_file': tensor_filename + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "cls": cls_name, + "fn": function_name, + "shape": str(list(t.shape)), + "msg": msg, + "condition": condition, + "tensor_file": tensor_filename, } - with open(self.full_file_path, 'a', newline='') as f: + with open(self.full_file_path, "a", newline="") as f: writer = csv.DictWriter(f, fieldnames=self.fields) writer.writerow(log_info) - if print_: print(f'{msg}\t{t.flatten()[:10]}') - - def print_if(self, msg, conditions, end='\n'): + if print_: + print(f"{msg}\t{t.flatten()[:10]}") + + def print_if(self, msg, conditions, end="\n"): self.maybe_warn_of_no_condition() - if not isinstance(conditions, (tuple, list)): conditions = [conditions] - if any(self.condition==c for c in conditions): print(msg, end=end) + if not isinstance(conditions, (tuple, list)): + conditions = [conditions] + if any(self.condition == c for c in conditions): + print(msg, end=end) def stop_if(self, condition, funny_msg): if condition == self.condition: @@ -93,32 +103,32 @@ def stop_if(self, condition, funny_msg): raise SystemExit(funny_msg) def maybe_warn_of_no_condition(self): - if self.condition is None and not self.warned_of_no_condition : + if self.condition is None and not self.warned_of_no_condition: print("Info: No condition set for UmerDebugLogger") self.warned_of_no_condition = True def get_log_objects(self): log_objects = [] - file = self.full_file_path - with open(file, newline='') as f: + file = self.full_file_path + with open(file, newline="") as f: reader = csv.DictReader(f) for row in reader: - row['tensor'] = torch.load(os.path.join(self.log_dir, 
row['tensor_file'])) - row['head'] = row['tensor'].flatten()[:10] - del row['tensor_file'] + row["tensor"] = torch.load(os.path.join(self.log_dir, row["tensor_file"])) + row["head"] = row["tensor"].flatten()[:10] + del row["tensor_file"] log_objects.append(SimpleNamespace(**row)) return log_objects @classmethod def load_log_objects_from_dir(self, log_dir): - file = os.path.join(log_dir, self._FILE) + file = os.path.join(log_dir, self._FILE) log_objects = [] - with open(file, newline='') as f: + with open(file, newline="") as f: reader = csv.DictReader(f) for row in reader: - row['t'] = torch.load(os.path.join(log_dir, row['tensor_file'])) - row['head'] = row['t'].flatten()[:10] - del row['tensor_file'] + row["t"] = torch.load(os.path.join(log_dir, row["tensor_file"])) + row["head"] = row["t"].flatten()[:10] + del row["tensor_file"] log_objects.append(SimpleNamespace(**row)) return log_objects diff --git a/tests/pipelines/controlnetxs/__init__.py b/tests/pipelines/controlnetxs/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py new file mode 100644 index 000000000000..c675f9f38109 --- /dev/null +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -0,0 +1,390 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
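A hypothetical use of the `load_log_objects_from_dir` classmethod shown above is to compare two logged runs offline, e.g. this port against the reference ControlNet-XS implementation. The directory names below are invented, and the sketch assumes both runs logged the same points in the same order so the tensors line up:

    from diffusers.umer_debug_logger import UmerDebugLogger

    ours = UmerDebugLogger.load_log_objects_from_dir("logs/diffusers-run")
    theirs = UmerDebugLogger.load_log_objects_from_dir("logs/reference-run")

    for a, b in zip(ours, theirs):
        max_diff = (a.t - b.t).abs().max().item()   # each row carries its loaded tensor as `t`
        print(f"{a.cls}.{a.fn} [{a.msg}] shape={a.shape} max_diff={max_diff:.2e}")
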
+ +import gc +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + ControlNetModel, + EulerDiscreteScheduler, + StableDiffusionXLControlNetPipeline, + UNet2DConditionModel, +) +from diffusers.pipelines.controlnet_xs.pipeline_controlnet_xs_sd_xl import StableDiffusionXLControlNetXSPipeline +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device +from diffusers.utils.torch_utils import randn_tensor + +from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, + TEXT_TO_IMAGE_BATCH_PARAMS, + TEXT_TO_IMAGE_IMAGE_PARAMS, + TEXT_TO_IMAGE_PARAMS, +) +from ..test_pipelines_common import ( + PipelineKarrasSchedulerTesterMixin, + PipelineLatentTesterMixin, + PipelineTesterMixin, + SDXLOptionalComponentsTesterMixin, +) + + +# TODO UMER +# these tests are so far only copied from `test_controlnet_sdxl.py` and need to be adapted + + +enable_full_determinism() + + +class StableDiffusionXLControlNetXSPipelineFastTests( + PipelineLatentTesterMixin, + PipelineKarrasSchedulerTesterMixin, + PipelineTesterMixin, + SDXLOptionalComponentsTesterMixin, + unittest.TestCase, +): + pipeline_class = StableDiffusionXLControlNetXSPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + # SD2-specific config below + attention_head_dim=(2, 4), + use_linear_projection=True, + addition_embed_type="text_time", + addition_time_embed_dim=8, + transformer_layers_per_block=(1, 2), + projection_class_embeddings_input_dim=80, # 6 * 8 + 32 + cross_attention_dim=64, + ) + torch.manual_seed(0) + controlnet = ControlNetModel( + block_out_channels=(32, 64), + layers_per_block=2, + in_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + conditioning_embedding_out_channels=(16, 32), + # SD2-specific config below + attention_head_dim=(2, 4), + use_linear_projection=True, + addition_embed_type="text_time", + addition_time_embed_dim=8, + transformer_layers_per_block=(1, 2), + projection_class_embeddings_input_dim=80, # 6 * 8 + 32 + cross_attention_dim=64, + ) + torch.manual_seed(0) + scheduler = EulerDiscreteScheduler( + beta_start=0.00085, + beta_end=0.012, + steps_offset=1, + beta_schedule="scaled_linear", + timestep_spacing="leading", + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + # SD2-specific config below + hidden_act="gelu", + projection_dim=32, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = 
CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) + tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "controlnet": controlnet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + controlnet_embedder_scale_factor = 2 + image = randn_tensor( + (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), + generator=generator, + device=torch.device(device), + ) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "np", + "image": image, + } + + return inputs + + def test_attention_slicing_forward_pass(self): + return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(expected_max_diff=2e-3) + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @require_torch_gpu + def test_stable_diffusion_xl_offloads(self): + pipes = [] + components = self.get_dummy_components() + sd_pipe = self.pipeline_class(**components).to(torch_device) + pipes.append(sd_pipe) + + components = self.get_dummy_components() + sd_pipe = self.pipeline_class(**components) + sd_pipe.enable_model_cpu_offload() + pipes.append(sd_pipe) + + components = self.get_dummy_components() + sd_pipe = self.pipeline_class(**components) + sd_pipe.enable_sequential_cpu_offload() + pipes.append(sd_pipe) + + image_slices = [] + for pipe in pipes: + pipe.unet.set_default_attn_processor() + + inputs = self.get_dummy_inputs(torch_device) + image = pipe(**inputs).images + + image_slices.append(image[0, -3:, -3:, -1].flatten()) + + assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3 + assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3 + + def test_stable_diffusion_xl_multi_prompts(self): + components = self.get_dummy_components() + sd_pipe = self.pipeline_class(**components).to(torch_device) + + # forward with single prompt + inputs = self.get_dummy_inputs(torch_device) + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + # forward with same prompt duplicated + inputs = self.get_dummy_inputs(torch_device) + inputs["prompt_2"] = inputs["prompt"] + output = sd_pipe(**inputs) + image_slice_2 = output.images[0, -3:, -3:, -1] + + # ensure the results are equal + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + + # forward with different prompt + inputs = self.get_dummy_inputs(torch_device) + inputs["prompt_2"] = "different prompt" + output = sd_pipe(**inputs) + image_slice_3 = output.images[0, -3:, -3:, -1] + + # ensure the results are not equal + assert np.abs(image_slice_1.flatten() - 
image_slice_3.flatten()).max() > 1e-4 + + # manually set a negative_prompt + inputs = self.get_dummy_inputs(torch_device) + inputs["negative_prompt"] = "negative prompt" + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + # forward with same negative_prompt duplicated + inputs = self.get_dummy_inputs(torch_device) + inputs["negative_prompt"] = "negative prompt" + inputs["negative_prompt_2"] = inputs["negative_prompt"] + output = sd_pipe(**inputs) + image_slice_2 = output.images[0, -3:, -3:, -1] + + # ensure the results are equal + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + + # forward with different negative_prompt + inputs = self.get_dummy_inputs(torch_device) + inputs["negative_prompt"] = "negative prompt" + inputs["negative_prompt_2"] = "different negative prompt" + output = sd_pipe(**inputs) + image_slice_3 = output.images[0, -3:, -3:, -1] + + # ensure the results are not equal + assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4 + + # copied from test_stable_diffusion_xl.py + def test_stable_diffusion_xl_prompt_embeds(self): + components = self.get_dummy_components() + sd_pipe = self.pipeline_class(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + # forward without prompt embeds + inputs = self.get_dummy_inputs(torch_device) + inputs["prompt"] = 2 * [inputs["prompt"]] + inputs["num_images_per_prompt"] = 2 + + output = sd_pipe(**inputs) + image_slice_1 = output.images[0, -3:, -3:, -1] + + # forward with prompt embeds + inputs = self.get_dummy_inputs(torch_device) + prompt = 2 * [inputs.pop("prompt")] + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = sd_pipe.encode_prompt(prompt) + + output = sd_pipe( + **inputs, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + ) + image_slice_2 = output.images[0, -3:, -3:, -1] + + # make sure that it's equal + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + + def test_controlnet_sdxl_guess(self): + device = "cpu" + + components = self.get_dummy_components() + + sd_pipe = self.pipeline_class(**components) + sd_pipe = sd_pipe.to(device) + + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["guess_mode"] = True + + output = sd_pipe(**inputs) + image_slice = output.images[0, -3:, -3:, -1] + expected_slice = np.array( + [0.7330834, 0.590667, 0.5667336, 0.6029023, 0.5679491, 0.5968194, 0.4032986, 0.47612396, 0.5089609] + ) + + # make sure that it's equal + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4 + + +@slow +@require_torch_gpu +class ControlNetSDXLPipelineXSSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_canny(self): + controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0") + + pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet + ) + pipe.enable_sequential_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "bird" + image = load_image( + 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ) + + images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images + + assert images[0].shape == (768, 512, 3) + + original_image = images[0, -3:, -3:, -1].flatten() + expected_image = np.array([0.4185, 0.4127, 0.4089, 0.4046, 0.4115, 0.4096, 0.4081, 0.4112, 0.3913]) + assert np.allclose(original_image, expected_image, atol=1e-04) + + def test_depth(self): + controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0") + + pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet + ) + pipe.enable_sequential_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "Stormtrooper's lecture" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" + ) + + images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images + + assert images[0].shape == (512, 512, 3) + + original_image = images[0, -3:, -3:, -1].flatten() + expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853]) + assert np.allclose(original_image, expected_image, atol=1e-04) From e032fa085fadd3d001a163ef88e4e813c2d01928 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Wed, 15 Nov 2023 12:59:15 +0100 Subject: [PATCH 39/88] checkin --- .cursorignore | 1 + Pipfile | 11 ++ .../en/api/pipelines/controlnetxs_sdxl.md | 55 +++++++++ src/diffusers/models/controlnetxs.py | 21 +++- .../pipeline_controlnet_xs_sd_xl.py | 5 + .../controlnetxs/test_controlnetxs_sdxl.py | 107 +++++++++--------- 6 files changed, 138 insertions(+), 62 deletions(-) create mode 100644 .cursorignore create mode 100644 Pipfile create mode 100644 docs/source/en/api/pipelines/controlnetxs_sdxl.md diff --git a/.cursorignore b/.cursorignore new file mode 100644 index 000000000000..dd449725e188 --- /dev/null +++ b/.cursorignore @@ -0,0 +1 @@ +*.md diff --git a/Pipfile b/Pipfile new file mode 100644 index 000000000000..0757494bb360 --- /dev/null +++ b/Pipfile @@ -0,0 +1,11 @@ +[[source]] +url = "https://pypi.org/simple" +verify_ssl = true +name = "pypi" + +[packages] + +[dev-packages] + +[requires] +python_version = "3.11" diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md new file mode 100644 index 000000000000..755f18341d20 --- /dev/null +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -0,0 +1,55 @@ + + +# ControlNet with Stable Diffusion XL + +ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. + +With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. + +The abstract from the paper is: + +*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. 
ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.* + +You can find additional smaller Stable Diffusion XL (SDXL) ControlNet checkpoints from the 🤗 [Diffusers](https://huggingface.co/diffusers) Hub organization, and browse [community-trained](https://huggingface.co/models?other=stable-diffusion-xl&other=controlnet) checkpoints on the Hub. + + + +🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve! + + + +If you don't see a checkpoint you're interested in, you can train your own SDXL ControlNet with our [training script](../../../../../examples/controlnet/README_sdxl). + + + +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. 
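+
+A minimal usage sketch; the checkpoint names are taken from the pipeline docstring earlier in this patch series and may still change while the integration is in progress:
+
+```py
+import torch
+from diffusers import AutoencoderKL, ControlNetXSModel, StableDiffusionXLControlNetXSPipeline
+from diffusers.utils import load_image
+
+# load the ControlNet-XS canny model together with the SDXL base model
+controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-canny", torch_dtype=torch.float16)
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16
+)
+pipe.enable_model_cpu_offload()
+
+# condition the generation on a canny edge image
+canny_image = load_image(
+    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
+)
+image = pipe(
+    "bird", image=canny_image, controlnet_conditioning_scale=0.5, num_inference_steps=30
+).images[0]
+```
+
+Because a ControlNet-XS model is much smaller than its base U-Net, it can also be created directly from a base model with `ControlNetXSModel.from_unet`. The argument values below mirror `create_as_in_paper` in this PR and are only meant to illustrate the sizing options:
+
+```py
+from diffusers import StableDiffusionXLPipeline
+
+base = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
+controlnet = ControlNetXSModel.from_unet(
+    base.unet,
+    size_ratio=0.1,           # control model uses ~10% of the base block_out_channels
+    learn_embedding=True,     # interpolate control and base time embeddings ...
+    time_embedding_mix=0.95,  # ... with this mixing parameter
+    dim_attention_heads=64,   # choose num_attention_heads so each head has dimension 64
+)
+```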
+ + + +## StableDiffusionXLControlNetPipeline +[[autodoc]] StableDiffusionXLControlNetPipeline + - all + - __call__ + +## StableDiffusionXLControlNetImg2ImgPipeline +[[autodoc]] StableDiffusionXLControlNetImg2ImgPipeline + - all + - __call__ + +## StableDiffusionXLControlNetInpaintPipeline +[[autodoc]] StableDiffusionXLControlNetInpaintPipeline + - all + - __call__ + +## StableDiffusionPipelineOutput +[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 88c1a64eef83..32413c82c26b 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -98,7 +98,7 @@ def create_as_in_paper(cls, base_model: UNet2DConditionModel): base_model, time_embedding_mix=0.95, learn_embedding=True, - control_model_size_ratio=0.1, + size_ratio=0.1, dim_attention_heads=64, ) @@ -144,6 +144,7 @@ def gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control): def __init__( self, conditioning_channels: int = 3, + conditioning_block_sizes: Tuple[int] = (16,32,96,256), controlnet_conditioning_channel_order: str = "rgb", time_embedding_input_dim: int = 320, time_embedding_dim: int = 1280, @@ -345,9 +346,10 @@ def from_unet( learn_embedding: bool = False, time_embedding_mix: float = 1.0, block_out_channels: Optional[Tuple[int]] = None, - control_model_size_ratio: Optional[float] = None, + size_ratio: Optional[float] = None, num_attention_heads: Optional[Union[int, Tuple[int]]] = None, dim_attention_heads: Optional[int] = None, + norm_num_groups: Optional[int] = None, ): r""" Instantiate a [`ControlNetXSModel`] from [`UNet2DConditionModel`]. @@ -367,14 +369,17 @@ def from_unet( Linear interpolation parameter used if `learn_embedding` is `True`. block_out_channels (`Tuple[int]`, *optional*): Down blocks output channels in control model. Either this or `block_out_channels` must be given. - control_model_size_ratio (float, *optional*): + size_ratio (float, *optional*): When given, block_out_channels is set to a relative fraction of the base model's block_out_channels. - Either this or `control_model_size_ratio` must be given. + Either this or `size_ratio` must be given. + norm_num_groups (int, *optional*, defaults to `None`): The number of groups to use for the normalization of the control unet. + If `None`, `int(unet.config.norm_num_groups * size_ratio)` is taken. + """ # check input fixed_size = block_out_channels is not None - relative_size = control_model_size_ratio is not None + relative_size = size_ratio is not None if not (fixed_size ^ relative_size): raise ValueError( "Pass exactly one of `block_out_channels` (for absolute sizing) or `control_model_ratio` (for relative sizing)." 
@@ -385,11 +390,14 @@ def from_unet( # create model if block_out_channels is None: - block_out_channels = [int(control_model_size_ratio * c) for c in unet.config.block_out_channels] + block_out_channels = [int(size_ratio * c) for c in unet.config.block_out_channels] if dim_attention_heads is not None: num_attention_heads = [math.ceil(c / dim_attention_heads) for c in block_out_channels] + if norm_num_groups is None: + norm_num_groups = int(unet.config.norm_num_groups * size_ratio) + def get_time_emb_input_dim(unet: UNet2DConditionModel): return unet.time_embedding.linear_1.in_features @@ -401,6 +409,7 @@ def get_time_emb_dim(unet: UNet2DConditionModel): kwargs.update(block_out_channels=block_out_channels) if num_attention_heads is not None: kwargs.update(attention_head_dim=num_attention_heads) + kwargs.update(norm_num_groups=norm_num_groups) # time embedding of control unet is not used. So remove params for them. to_remove = ( diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 978c161296f9..5a0d03935878 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -638,8 +638,10 @@ def prepare_image( do_classifier_free_guidance=False, guess_mode=False, ): + print('Image dims:', image.shape) image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) image_batch_size = image.shape[0] + print('Latents dims:', image.shape) if image_batch_size == 1: repeat_by = batch_size @@ -659,6 +661,7 @@ def prepare_image( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + print("Preparing latents: shape to be =",shape) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -672,6 +675,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma + print("Preparing latents: shape =",latents.shape) + return latents # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index c675f9f38109..d99bb8ef2753 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -22,12 +22,11 @@ from diffusers import ( AutoencoderKL, - ControlNetModel, + ControlNetXSModel, EulerDiscreteScheduler, - StableDiffusionXLControlNetPipeline, + StableDiffusionXLControlNetXSPipeline, UNet2DConditionModel, ) -from diffusers.pipelines.controlnet_xs.pipeline_controlnet_xs_sd_xl import StableDiffusionXLControlNetXSPipeline from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device from diffusers.utils.torch_utils import randn_tensor @@ 
-46,10 +45,6 @@ ) -# TODO UMER -# these tests are so far only copied from `test_controlnet_sdxl.py` and need to be adapted - - enable_full_determinism() @@ -86,20 +81,11 @@ def get_dummy_components(self): cross_attention_dim=64, ) torch.manual_seed(0) - controlnet = ControlNetModel( - block_out_channels=(32, 64), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - conditioning_embedding_out_channels=(16, 32), - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - addition_embed_type="text_time", - addition_time_embed_dim=8, - transformer_layers_per_block=(1, 2), - projection_class_embeddings_input_dim=80, # 6 * 8 + 32 - cross_attention_dim=64, + controlnet = ControlNetXSModel.from_unet( + unet, + time_embedding_mix=0.95, + learn_embedding=True, + size_ratio=0.5, ) torch.manual_seed(0) scheduler = EulerDiscreteScheduler( @@ -151,6 +137,7 @@ def get_dummy_components(self): } return components + # copied from test_controlnet_sdxl.py def get_dummy_inputs(self, device, seed=0): if str(device).startswith("mps"): generator = torch.manual_seed(seed) @@ -175,9 +162,11 @@ def get_dummy_inputs(self, device, seed=0): return inputs + # copied from test_controlnet_sdxl.py def test_attention_slicing_forward_pass(self): return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) + # copied from test_controlnet_sdxl.py @unittest.skipIf( torch_device != "cuda" or not is_xformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", @@ -185,12 +174,15 @@ def test_attention_slicing_forward_pass(self): def test_xformers_attention_forwardGenerator_pass(self): self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) + # copied from test_controlnet_sdxl.py def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) + # copied from test_controlnet_sdxl.py def test_save_load_optional_components(self): self._test_save_load_optional_components() + # copied from test_controlnet_sdxl.py @require_torch_gpu def test_stable_diffusion_xl_offloads(self): pipes = [] @@ -220,6 +212,7 @@ def test_stable_diffusion_xl_offloads(self): assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3 assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3 + # copied from test_controlnet_sdxl.py def test_stable_diffusion_xl_multi_prompts(self): components = self.get_dummy_components() sd_pipe = self.pipeline_class(**components).to(torch_device) @@ -312,27 +305,28 @@ def test_stable_diffusion_xl_prompt_embeds(self): # make sure that it's equal assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - def test_controlnet_sdxl_guess(self): - device = "cpu" + # TODO Umer: Understand guess mode and enable this test if needed + # def test_controlnet_sdxl_guess(self): + # device = "cpu" - components = self.get_dummy_components() + # components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components) - sd_pipe = sd_pipe.to(device) + # sd_pipe = self.pipeline_class(**components) + # sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) + # sd_pipe.set_progress_bar_config(disable=None) - inputs = self.get_dummy_inputs(device) - inputs["guess_mode"] = True + # inputs = self.get_dummy_inputs(device) + # inputs["guess_mode"] = True - output = sd_pipe(**inputs) - image_slice = output.images[0, -3:, -3:, -1] - expected_slice = np.array( - [0.7330834, 0.590667, 0.5667336, 
0.6029023, 0.5679491, 0.5968194, 0.4032986, 0.47612396, 0.5089609] - ) + # output = sd_pipe(**inputs) + # image_slice = output.images[0, -3:, -3:, -1] + # expected_slice = np.array( + # [0.7330834, 0.590667, 0.5667336, 0.6029023, 0.5679491, 0.5968194, 0.4032986, 0.47612396, 0.5089609] + # ) - # make sure that it's equal - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4 + # # make sure that it's equal + # assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4 @slow @@ -344,9 +338,9 @@ def tearDown(self): torch.cuda.empty_cache() def test_canny(self): - controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0") + controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-canny") - pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet ) pipe.enable_sequential_cpu_offload() @@ -363,28 +357,29 @@ def test_canny(self): assert images[0].shape == (768, 512, 3) original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.4185, 0.4127, 0.4089, 0.4046, 0.4115, 0.4096, 0.4081, 0.4112, 0.3913]) + expected_image = np.array([0.4359, 0.4335, 0.4609, 0.4515, 0.4669, 0.4494, 0.452 , 0.4493, 0.4382]) assert np.allclose(original_image, expected_image, atol=1e-04) - def test_depth(self): - controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0") + # ToDo Umer: Implement depth and enable this test + # def test_depth(self): + # controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0") - pipe = StableDiffusionXLControlNetPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet - ) - pipe.enable_sequential_cpu_offload() - pipe.set_progress_bar_config(disable=None) + # pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + # "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet + # ) + # pipe.enable_sequential_cpu_offload() + # pipe.set_progress_bar_config(disable=None) - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) + # generator = torch.Generator(device="cpu").manual_seed(0) + # prompt = "Stormtrooper's lecture" + # image = load_image( + # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" + # ) - images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images + # images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images - assert images[0].shape == (512, 512, 3) + # assert images[0].shape == (512, 512, 3) - original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853]) - assert np.allclose(original_image, expected_image, atol=1e-04) + # original_image = images[0, -3:, -3:, -1].flatten() + # expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853]) + # assert np.allclose(original_image, expected_image, atol=1e-04) From e543a545e1a8bec44405d0c4b72f8fbf8a17ea50 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 16 Nov 2023 04:51:21 +0100 Subject: [PATCH 40/88] more tests 
pass --- src/diffusers/models/controlnetxs.py | 152 ++++-------------- .../pipeline_controlnet_xs_sd_xl.py | 10 +- .../controlnetxs/test_controlnetxs_sdxl.py | 1 + 3 files changed, 38 insertions(+), 125 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 32413c82c26b..001881c28f9e 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -62,6 +62,8 @@ class ControlNetXSOutput(BaseOutput): sample: torch.FloatTensor = None +# todo umer: assert in pipe that conditioning_block_sizes matches vae downblocks + # todo umer: add sth like FromOriginalControlnetMixin class ControlNetXSModel(ModelMixin, ConfigMixin): r""" @@ -77,7 +79,9 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): conditioning_channels (`int`, defaults to 3): Number of channels of conditioning input (e.g. an image) controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): - The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + conditioning_block_sizes (`Tuple[int]`, defaults to `(16,32,96,256))`): + TODO time_embedding_input_dim (`int`, defaults to 320): Dimension of input into time embedding. Needs to be same as in the base model. time_embedding_dim (`int`, defaults to 1280): @@ -100,6 +104,7 @@ def create_as_in_paper(cls, base_model: UNet2DConditionModel): learn_embedding=True, size_ratio=0.1, dim_attention_heads=64, + conditioning_block_sizes = (16,32,96,256), ) @classmethod @@ -314,23 +319,25 @@ def __init__( ) # 5 - Create conditioning hint embedding - self.input_hint_block = nn.Sequential( - nn.Conv2d(conditioning_channels, 16, 3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 16, 3, padding=1), - nn.SiLU(), - nn.Conv2d(16, 32, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(32, 32, 3, padding=1), - nn.SiLU(), - nn.Conv2d(32, 96, 3, padding=1, stride=2), - nn.SiLU(), - nn.Conv2d(96, 96, 3, padding=1), - nn.SiLU(), - nn.Conv2d(96, 256, 3, padding=1, stride=2), - nn.SiLU(), - zero_module(nn.Conv2d(256, block_out_channels[0], 3, padding=1)), - ) + conditioning_emb_layers = [ + nn.Conv2d(conditioning_channels, conditioning_block_sizes[0], 3, padding=1), + nn.SiLU() + ] + + for i in range(len(conditioning_block_sizes)-1): + in_channels = conditioning_block_sizes[i] + out_channels = conditioning_block_sizes[i+1] + + conditioning_emb_layers += [ + nn.Conv2d(in_channels, in_channels, 3, padding=1, stride=1), + nn.SiLU(), + nn.Conv2d(in_channels, out_channels, 3, padding=1, stride=2), + nn.SiLU() + ] + + conditioning_emb_layers.append(zero_module(nn.Conv2d(conditioning_block_sizes[-1], block_out_channels[0], 3, padding=1))) + + self.input_hint_block = nn.Sequential(*conditioning_emb_layers) # In the mininal implementation setting, we only need the control model up to the mid block del self.control_model.up_blocks @@ -342,6 +349,7 @@ def from_unet( cls, unet: UNet2DConditionModel, conditioning_channels: int = 3, + conditioning_block_sizes: Tuple[int] = (16,32,96,256), controlnet_conditioning_channel_order: str = "rgb", learn_embedding: bool = False, time_embedding_mix: float = 1.0, @@ -359,6 +367,8 @@ def from_unet( The UNet model we want to control. The dimensions of the ControlNetXSModel will be adapted to it. conditioning_channels (`int`, defaults to 3): Number of channels of conditioning input (e.g. 
an image) + conditioning_block_sizes (`Tuple[int]`, defaults to `(16,32,96,256))`): + TODO controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): The channel order of conditional image. Will convert to `rgb` if it's `bgr`. learn_embedding (`bool`, defaults to `False`): @@ -434,36 +444,20 @@ def get_time_emb_dim(unet: UNet2DConditionModel): time_embedding_mix=time_embedding_mix, learn_embedding=learn_embedding, base_model_channel_sizes=ControlNetXSModel.gather_subblock_sizes(unet, base_or_control="base"), + conditioning_block_sizes=conditioning_block_sizes, ) return cls(**kwargs) @property - # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: r""" Returns: `dict` of attention processors: A dictionary containing all attention processors used in the model with indexed by its weight name. """ - # set recursively - processors = {} - - def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): - if hasattr(module, "get_processor"): - processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + return self.control_model.attn_processors - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor( self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False ): @@ -479,44 +473,14 @@ def set_attn_processor( processor. This is strongly recommended when setting trainable attention processors. """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." - ) - - def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor, _remove_lora=_remove_lora) - else: - module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + self.control_model.set_attn_processor(processor, _remove_lora) - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor def set_default_attn_processor(self): """ Disables custom attention processors and sets the default attention implementation. 
""" - if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): - processor = AttnAddedKVProcessor() - elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): - processor = AttnProcessor() - else: - raise ValueError( - f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" - ) - - self.set_attn_processor(processor, _remove_lora=True) + self.control_model.set_default_attn_processor() - # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice def set_attention_slice(self, slice_size): r""" Enable sliced attention computation. @@ -531,57 +495,9 @@ def set_attention_slice(self, slice_size): provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` must be a multiple of `slice_size`. """ - sliceable_head_dims = [] - - def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): - if hasattr(module, "set_attention_slice"): - sliceable_head_dims.append(module.sliceable_head_dim) - - for child in module.children(): - fn_recursive_retrieve_sliceable_dims(child) - - # retrieve number of attention layers - for module in self.children(): - fn_recursive_retrieve_sliceable_dims(module) - - num_sliceable_layers = len(sliceable_head_dims) - - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = [dim // 2 for dim in sliceable_head_dims] - elif slice_size == "max": - # make smallest slice possible - slice_size = num_sliceable_layers * [1] - - slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size - - if len(slice_size) != len(sliceable_head_dims): - raise ValueError( - f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" - f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." - ) - - for i in range(len(slice_size)): - size = slice_size[i] - dim = sliceable_head_dims[i] - if size is not None and size > dim: - raise ValueError(f"size {size} has to be smaller or equal to {dim}.") - - # Recursively walk through all the children. - # Any children which exposes the set_attention_slice method - # gets the message - def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): - if hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size.pop()) - - for child in module.children(): - fn_recursive_set_attention_slice(child, slice_size) - - reversed_slice_size = list(reversed(slice_size)) - for module in self.children(): - fn_recursive_set_attention_slice(module, reversed_slice_size) + self.control_model.set_attention_slice(slice_size) + # todo umer: understand & either remove or adapt # Copied from diffusers.models.controlnet.ControlNetModel._set_gradient_checkpointing def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 5a0d03935878..0bd2208bcb70 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -133,8 +133,8 @@ class StableDiffusionXLControlNetXSPipeline( watermarker is used. 
""" # leave controlnet out on purpose because it iterates with unet - model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" - _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae->controlnet" + _optional_components = ["tokenizer_2", "text_encoder_2"] def __init__( self, @@ -638,10 +638,8 @@ def prepare_image( do_classifier_free_guidance=False, guess_mode=False, ): - print('Image dims:', image.shape) image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) image_batch_size = image.shape[0] - print('Latents dims:', image.shape) if image_batch_size == 1: repeat_by = batch_size @@ -661,7 +659,6 @@ def prepare_image( # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - print("Preparing latents: shape to be =",shape) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -675,8 +672,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma - print("Preparing latents: shape =",latents.shape) - + return latents # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index d99bb8ef2753..d8fffb46fb9a 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -86,6 +86,7 @@ def get_dummy_components(self): time_embedding_mix=0.95, learn_embedding=True, size_ratio=0.5, + conditioning_block_sizes=(16,32), ) torch.manual_seed(0) scheduler = EulerDiscreteScheduler( From 6945de8ab20db8b52cc8a5d08922d4658b6b30af Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:17:58 +0100 Subject: [PATCH 41/88] Fixed tests --- .../pipeline_controlnet_xs_sd_xl.py | 37 ++++++++++++++----- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 0bd2208bcb70..76e2cf264fa7 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -35,7 +35,7 @@ from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers -from ...utils.torch_utils import is_compiled_module, randn_tensor +from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion_xl import StableDiffusionXLPipelineOutput @@ -134,7 +134,7 @@ class StableDiffusionXLControlNetXSPipeline( """ # leave controlnet out on purpose 
because it iterates with unet model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae->controlnet" - _optional_components = ["tokenizer_2", "text_encoder_2"] + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] def __init__( self, @@ -437,12 +437,12 @@ def encode_prompt( if self.text_encoder is not None: if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder) + unscale_lora_layers(self.text_encoder, lora_scale) if self.text_encoder_2 is not None: if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder_2) + unscale_lora_layers(self.text_encoder_2, lora_scale) return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds @@ -672,15 +672,16 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma - return latents # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids - def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): add_time_ids = list(original_size + crops_coords_top_left + target_size) passed_add_embed_dim = ( - self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim ) expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features @@ -1004,7 +1005,6 @@ def __call__( # 6. Prepare latent variables num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, @@ -1036,8 +1036,17 @@ def __call__( target_size = target_size or (height, width) add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + add_time_ids = self._get_add_time_ids( - original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, ) if negative_original_size is not None and negative_target_size is not None: @@ -1061,8 +1070,15 @@ def __call__( # 8. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + is_unet_compiled = is_compiled_module(self.unet) + is_controlnet_compiled = is_compiled_module(self.controlnet) + is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1") with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): + # Relevant thread: + # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428 + if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1: + torch._inductor.cudagraph_mark_step_begin() # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) @@ -1094,7 +1110,8 @@ def __call__( if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) # manually for max memory savings if self.vae.dtype == torch.float16 and self.vae.config.force_upcast: From 1a0c4ad222df18224688fd3c78cdd76a8e12d875 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:26:49 +0100 Subject: [PATCH 42/88] removed debug logs --- src/diffusers/models/attention.py | 4 - src/diffusers/models/resnet.py | 9 -- src/diffusers/models/transformer_2d.py | 3 - src/diffusers/umer_debug_logger.py | 136 ------------------------- 4 files changed, 152 deletions(-) delete mode 100644 src/diffusers/umer_debug_logger.py diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 9d1c8b2fd971..132aee92c5c8 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -16,7 +16,6 @@ import torch from torch import nn -from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND from ..utils.torch_utils import maybe_allow_in_graph from .activations import GEGLU, GELU, ApproximateGELU @@ -262,7 +261,6 @@ def forward( attention_mask=attention_mask, **cross_attention_kwargs, ) - udl.log_if("attn1", attn_output, "SUBBLOCK-MINUS-1") if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output @@ -300,8 +298,6 @@ def forward( **cross_attention_kwargs, ) hidden_states = attn_output + hidden_states - udl.log_if("attn2", attn_output, "SUBBLOCK-MINUS-1") - udl.log_if("add attn2", hidden_states, "SUBBLOCK-MINUS-1") # 4. 
Feed-forward if not self.use_ada_layer_norm_single: diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 37c30a902c03..e45e8f0ae522 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,7 +20,6 @@ import torch.nn as nn import torch.nn.functional as F -from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND from .activations import get_activation from .attention_processor import SpatialNorm @@ -206,8 +205,6 @@ def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None else: hidden_states = self.Conv2d_0(hidden_states) - udl.log_if("conv", hidden_states, "SUBBLOCK-MINUS-1") - return hidden_states @@ -276,8 +273,6 @@ def forward(self, hidden_states, scale: float = 1.0): else: hidden_states = self.conv(hidden_states) - udl.log_if("conv", hidden_states, "SUBBLOCK-MINUS-1") - return hidden_states @@ -725,7 +720,6 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states) - udl.log_if("conv1", hidden_states, condition="SUBBLOCK-MINUS-1") if self.time_emb_proj is not None: if not self.skip_time_act: @@ -738,7 +732,6 @@ def forward(self, input_tensor, temb, scale: float = 1.0): if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - udl.log_if("add time_emb_proj", hidden_states, condition="SUBBLOCK-MINUS-1") if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) @@ -752,7 +745,6 @@ def forward(self, input_tensor, temb, scale: float = 1.0): hidden_states = self.nonlinearity(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) - udl.log_if("conv2", hidden_states, condition="SUBBLOCK-MINUS-1") if self.conv_shortcut is not None: input_tensor = ( @@ -760,7 +752,6 @@ def forward(self, input_tensor, temb, scale: float = 1.0): ) output_tensor = (input_tensor + hidden_states) / self.output_scale_factor - udl.log_if("add conv_shortcut", output_tensor, condition="SUBBLOCK-MINUS-1") return output_tensor diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 46d06f45c31c..24abf54d6da7 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -20,7 +20,6 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..models.embeddings import ImagePositionalEmbeddings -from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND, BaseOutput, deprecate from .attention import BasicTransformerBlock from .embeddings import CaptionProjection, PatchEmbed @@ -437,8 +436,6 @@ def forward( shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) ) - udl.log_if("proj_out", output, condition="SUBBLOCK-MINUS-1") - if not return_dict: return (output,) diff --git a/src/diffusers/umer_debug_logger.py b/src/diffusers/umer_debug_logger.py deleted file mode 100644 index 0fda24a9d277..000000000000 --- a/src/diffusers/umer_debug_logger.py +++ /dev/null @@ -1,136 +0,0 @@ -# Logger to help me (UmerHA) debug controlnet-xs - -import csv -import inspect -import os -import shutil -from datetime import datetime -from types import SimpleNamespace - -import torch - - -class UmerDebugLogger: - _FILE = "udl.csv" - - def __init__(self, log_dir="logs", condition=None): - 
self.log_dir, self.condition, self.tensor_counter = log_dir, condition, 0 - os.makedirs(log_dir, exist_ok=True) - self.fields = ["timestamp", "cls", "fn", "shape", "msg", "condition", "tensor_file"] - self.create_file() - self.warned_of_no_condition = False - print( - "Info: `UmerDebugLogger` created. This is a logging class that will be deleted when the PR to integrate ControlNet-XS is done." - ) - - @property - def full_file_path(self): - return os.path.join(self.log_dir, self._FILE) - - def create_file(self): - file = self.full_file_path - if not os.path.isfile(file): - with open(file, "w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=self.fields) - writer.writeheader() - - def set_dir(self, log_dir, clear=False): - self.log_dir = log_dir - if clear: - self.clear_logs() - self.create_file() - - def clear_logs(self): - shutil.rmtree(self.log_dir, ignore_errors=True) - os.makedirs(self.log_dir, exist_ok=True) - self.create_file() - - def set_condition(self, condition): - self.condition = condition - - def log_if(self, msg, t, condition, *, print_=False): - self.maybe_warn_of_no_condition() - - # Use inspect to get the current frame and then go back one level to find caller - frame = inspect.currentframe() - caller_frame = frame.f_back - caller_info = inspect.getframeinfo(caller_frame) - - # Extract class and function name from the caller - cls_name = ( - caller_frame.f_locals.get("self", None).__class__.__name__ if "self" in caller_frame.f_locals else None - ) - function_name = caller_info.function - - if not hasattr(t, "shape"): - t = torch.tensor(t) - t = t.cpu().detach() - - if condition == self.condition: - # Save tensor to a file - tensor_filename = f"tensor_{self.tensor_counter}.pt" - torch.save(t, os.path.join(self.log_dir, tensor_filename)) - self.tensor_counter += 1 - - # Log information to CSV - log_info = { - "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "cls": cls_name, - "fn": function_name, - "shape": str(list(t.shape)), - "msg": msg, - "condition": condition, - "tensor_file": tensor_filename, - } - - with open(self.full_file_path, "a", newline="") as f: - writer = csv.DictWriter(f, fieldnames=self.fields) - writer.writerow(log_info) - - if print_: - print(f"{msg}\t{t.flatten()[:10]}") - - def print_if(self, msg, conditions, end="\n"): - self.maybe_warn_of_no_condition() - if not isinstance(conditions, (tuple, list)): - conditions = [conditions] - if any(self.condition == c for c in conditions): - print(msg, end=end) - - def stop_if(self, condition, funny_msg): - if condition == self.condition: - print(funny_msg) - raise SystemExit(funny_msg) - - def maybe_warn_of_no_condition(self): - if self.condition is None and not self.warned_of_no_condition: - print("Info: No condition set for UmerDebugLogger") - self.warned_of_no_condition = True - - def get_log_objects(self): - log_objects = [] - file = self.full_file_path - with open(file, newline="") as f: - reader = csv.DictReader(f) - for row in reader: - row["tensor"] = torch.load(os.path.join(self.log_dir, row["tensor_file"])) - row["head"] = row["tensor"].flatten()[:10] - del row["tensor_file"] - log_objects.append(SimpleNamespace(**row)) - return log_objects - - @classmethod - def load_log_objects_from_dir(self, log_dir): - file = os.path.join(log_dir, self._FILE) - log_objects = [] - with open(file, newline="") as f: - reader = csv.DictReader(f) - for row in reader: - row["t"] = torch.load(os.path.join(log_dir, row["tensor_file"])) - row["head"] = row["t"].flatten()[:10] - del row["tensor_file"] - 
log_objects.append(SimpleNamespace(**row)) - return log_objects - - -udl = UmerDebugLogger() From 7cc437e3dcd04f415842f531637fc143e8ad3fd4 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:48:47 +0100 Subject: [PATCH 43/88] make style + quality --- src/diffusers/models/controlnetxs.py | 30 +++++++++---------- .../controlnetxs/test_controlnetxs_sdxl.py | 4 +-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 001881c28f9e..fbb887dd46dd 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -24,11 +24,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, logging from .attention_processor import ( - ADDED_KV_ATTENTION_PROCESSORS, - CROSS_ATTENTION_PROCESSORS, AttentionProcessor, - AttnAddedKVProcessor, - AttnProcessor, ) from .lora import LoRACompatibleConv from .modeling_utils import ModelMixin @@ -64,6 +60,7 @@ class ControlNetXSOutput(BaseOutput): # todo umer: assert in pipe that conditioning_block_sizes matches vae downblocks + # todo umer: add sth like FromOriginalControlnetMixin class ControlNetXSModel(ModelMixin, ConfigMixin): r""" @@ -79,7 +76,7 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): conditioning_channels (`int`, defaults to 3): Number of channels of conditioning input (e.g. an image) controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): - The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. conditioning_block_sizes (`Tuple[int]`, defaults to `(16,32,96,256))`): TODO time_embedding_input_dim (`int`, defaults to 320): @@ -104,7 +101,7 @@ def create_as_in_paper(cls, base_model: UNet2DConditionModel): learn_embedding=True, size_ratio=0.1, dim_attention_heads=64, - conditioning_block_sizes = (16,32,96,256), + conditioning_block_sizes=(16, 32, 96, 256), ) @classmethod @@ -149,7 +146,7 @@ def gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control): def __init__( self, conditioning_channels: int = 3, - conditioning_block_sizes: Tuple[int] = (16,32,96,256), + conditioning_block_sizes: Tuple[int] = (16, 32, 96, 256), controlnet_conditioning_channel_order: str = "rgb", time_embedding_input_dim: int = 320, time_embedding_dim: int = 1280, @@ -321,21 +318,23 @@ def __init__( # 5 - Create conditioning hint embedding conditioning_emb_layers = [ nn.Conv2d(conditioning_channels, conditioning_block_sizes[0], 3, padding=1), - nn.SiLU() + nn.SiLU(), ] - for i in range(len(conditioning_block_sizes)-1): + for i in range(len(conditioning_block_sizes) - 1): in_channels = conditioning_block_sizes[i] - out_channels = conditioning_block_sizes[i+1] + out_channels = conditioning_block_sizes[i + 1] conditioning_emb_layers += [ nn.Conv2d(in_channels, in_channels, 3, padding=1, stride=1), nn.SiLU(), nn.Conv2d(in_channels, out_channels, 3, padding=1, stride=2), - nn.SiLU() + nn.SiLU(), ] - conditioning_emb_layers.append(zero_module(nn.Conv2d(conditioning_block_sizes[-1], block_out_channels[0], 3, padding=1))) + conditioning_emb_layers.append( + zero_module(nn.Conv2d(conditioning_block_sizes[-1], block_out_channels[0], 3, padding=1)) + ) self.input_hint_block = nn.Sequential(*conditioning_emb_layers) @@ -349,7 +348,7 @@ def from_unet( cls, unet: UNet2DConditionModel, conditioning_channels: int = 3, - conditioning_block_sizes: 
Tuple[int] = (16,32,96,256), + conditioning_block_sizes: Tuple[int] = (16, 32, 96, 256), controlnet_conditioning_channel_order: str = "rgb", learn_embedding: bool = False, time_embedding_mix: float = 1.0, @@ -382,8 +381,9 @@ def from_unet( size_ratio (float, *optional*): When given, block_out_channels is set to a relative fraction of the base model's block_out_channels. Either this or `size_ratio` must be given. - norm_num_groups (int, *optional*, defaults to `None`): The number of groups to use for the normalization of the control unet. - If `None`, `int(unet.config.norm_num_groups * size_ratio)` is taken. + norm_num_groups (int, *optional*, defaults to `None`): + The number of groups to use for the normalization of the control unet. If `None`, + `int(unet.config.norm_num_groups * size_ratio)` is taken. """ diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index d8fffb46fb9a..8aa32a7156fd 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -86,7 +86,7 @@ def get_dummy_components(self): time_embedding_mix=0.95, learn_embedding=True, size_ratio=0.5, - conditioning_block_sizes=(16,32), + conditioning_block_sizes=(16, 32), ) torch.manual_seed(0) scheduler = EulerDiscreteScheduler( @@ -358,7 +358,7 @@ def test_canny(self): assert images[0].shape == (768, 512, 3) original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.4359, 0.4335, 0.4609, 0.4515, 0.4669, 0.4494, 0.452 , 0.4493, 0.4382]) + expected_image = np.array([0.4359, 0.4335, 0.4609, 0.4515, 0.4669, 0.4494, 0.452, 0.4493, 0.4382]) assert np.allclose(original_image, expected_image, atol=1e-04) # ToDo Umer: Implement depth and enable this test From 0f655bbdade1d3331956543cb9998fe905524b6a Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:55:34 +0100 Subject: [PATCH 44/88] make fix-copies --- .../alt_diffusion/pipeline_alt_diffusion.py | 1 - .../pipeline_alt_diffusion_img2img.py | 1 - src/diffusers/utils/dummy_pt_objects.py | 15 +++++++++++++++ .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 335df9e6f461..abbb87f307da 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -106,7 +106,6 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" - model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] _exclude_from_cpu_offload = ["safety_checker"] diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 7f24bad90f8d..6517c3516ea3 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -144,7 +144,6 @@ class AltDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ - model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] _exclude_from_cpu_offload = ["safety_checker"] diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 090b1081fdaf..deb2aafae7d6 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -77,6 +77,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class ControlNetXSModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class ModelMixin(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index d6200bcaf122..61a38217f537 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -1052,6 +1052,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class StableDiffusionXLControlNetXSPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionXLImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From cf0b6d5f323fe0c7e6f095b8e172425e97bf018f Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 17 Nov 2023 10:47:41 +0100 Subject: [PATCH 45/88] fixed documentation --- .../pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 76e2cf264fa7..e9d78325484b 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -37,7 +37,7 @@ from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor from ..pipeline_utils import DiffusionPipeline -from 
..stable_diffusion_xl import StableDiffusionXLPipelineOutput +from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput if is_invisible_watermark_available(): From 3cb6d32ed7ad5926f10b440e6ab1f732d43cfd4d Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 17 Nov 2023 10:59:31 +0100 Subject: [PATCH 46/88] added cnxs to doc toc --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a7a2ea895f9c..b604a266834c 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -248,6 +248,8 @@ title: ControlNet - local: api/pipelines/controlnet_sdxl title: ControlNet with Stable Diffusion XL + - local: api/pipelines/controlnetxs_sdxl + title: ControlNet-XS with Stable Diffusion XL - local: api/pipelines/cycle_diffusion title: Cycle Diffusion - local: api/pipelines/dance_diffusion From f11e0e73909c933e9dc24fe814c5030e851c8e7d Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 17 Nov 2023 11:45:47 +0100 Subject: [PATCH 47/88] added control start/end param --- .../pipeline_controlnet_xs_sd_xl.py | 111 ++++++------------ 1 file changed, 37 insertions(+), 74 deletions(-) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index e9d78325484b..d0e18a2d17d4 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -151,8 +151,6 @@ def __init__( ): super().__init__() - # todo: add multi contronet? - self.register_modules( vae=vae, text_encoder=text_encoder, @@ -163,7 +161,6 @@ def __init__( controlnet=controlnet, scheduler=scheduler, ) - self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.control_image_processor = VaeImageProcessor( @@ -536,8 +533,6 @@ def check_inputs( "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." ) - # todo: multi control net? - # Check `image` is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( self.controlnet, torch._dynamo.eval_frame.OptimizedModule @@ -548,7 +543,6 @@ def check_inputs( and isinstance(self.controlnet._orig_mod, ControlNetXSModel) ): self.check_image(image, prompt, prompt_embeds) - # elif # todo: multi control net? else: assert False @@ -560,32 +554,18 @@ def check_inputs( ): if not isinstance(controlnet_conditioning_scale, float): raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") - # elif # todo: multi control net? else: assert False - if not isinstance(control_guidance_start, (tuple, list)): - control_guidance_start = [control_guidance_start] - - if not isinstance(control_guidance_end, (tuple, list)): - control_guidance_end = [control_guidance_end] - - if len(control_guidance_start) != len(control_guidance_end): + start, end = control_guidance_start, control_guidance_end + if start >= end: raise ValueError( - f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. 
Make sure to provide the same number of elements to each list." + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." ) - - # if isinstance(self.controlnet, MultiControlNetModel): # todo? - - for start, end in zip(control_guidance_start, control_guidance_end): - if start >= end: - raise ValueError( - f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." - ) - if start < 0.0: - raise ValueError(f"control guidance start: {start} can't be smaller than 0.") - if end > 1.0: - raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): @@ -769,8 +749,8 @@ def __call__( cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, guess_mode: bool = False, - control_guidance_start: Union[float, List[float]] = 0.0, - control_guidance_end: Union[float, List[float]] = 1.0, + control_guidance_start: float = 0.0, + control_guidance_end: float = 1.0, original_size: Tuple[int, int] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), target_size: Tuple[int, int] = None, @@ -861,9 +841,9 @@ def __call__( guess_mode (`bool`, *optional*, defaults to `False`): The ControlNet encoder tries to recognize the content of the input image even if you remove all prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. - control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + control_guidance_start (`float`, *optional*, defaults to 0.0): The percentage of total steps at which the ControlNet starts applying. - control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + control_guidance_end (`float`, *optional*, defaults to 1.0): The percentage of total steps at which the ControlNet stops applying. original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. @@ -904,17 +884,6 @@ def __call__( """ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - # align format for control guidance - if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): - control_guidance_start = len(control_guidance_end) * [control_guidance_start] - elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): - control_guidance_end = len(control_guidance_start) * [control_guidance_end] - elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): - mult = 1 # len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 - control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [ - control_guidance_end - ] - # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, @@ -946,16 +915,6 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - # todo: if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): ... 
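As a quick illustration (not part of the diff) of what the simplified float-valued `control_guidance_start`/`control_guidance_end` mean for the denoising loop, the snippet below mirrors the `dont_control` expression added a little further below; the step count and guidance window are arbitrary example values.

```py
num_inference_steps = 10  # stands in for len(timesteps)
control_guidance_start, control_guidance_end = 0.2, 0.8

for i in range(num_inference_steps):
    # same fraction-of-progress test as the `dont_control` expression in __call__
    dont_control = (
        i / num_inference_steps < control_guidance_start
        or (i + 1) / num_inference_steps > control_guidance_end
    )
    print(i, "unet only" if dont_control else "unet + controlnet")

# steps 0-1 and 8-9 run the plain UNet; steps 2-7 also run the ControlNet-XS model
```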
- - # todo umer: understand & implement if needed - # global_pool_conditions = ( - # controlnet.config.global_pool_conditions - # if isinstance(controlnet, ControlNetXSModel) - # else controlnet.nets[0].config.global_pool_conditions - # ) - # guess_mode = guess_mode or global_pool_conditions - # 3. Encode input prompt text_encoder_lora_scale = ( cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None @@ -995,7 +954,6 @@ def __call__( guess_mode=guess_mode, ) height, width = image.shape[-2:] - # elif isinstance(controlnet, MultiControlNetModel): todo? else: assert False @@ -1019,16 +977,7 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7.1 Create tensor stating which controlnets to keep - controlnet_keep = [] - for i in range(len(timesteps)): - keeps = [ - 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) - for s, e in zip(control_guidance_start, control_guidance_end) - ] - controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetXSModel) else keeps) - - # 7.2 Prepare added time ids & embeddings + # 7.1 Prepare added time ids & embeddings if isinstance(image, list): original_size = original_size or image[0].shape[-2:] else: @@ -1055,6 +1004,7 @@ def __call__( negative_crops_coords_top_left, negative_target_size, dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, ) else: negative_add_time_ids = add_time_ids @@ -1086,17 +1036,30 @@ def __call__( added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} # predict the noise residual - noise_pred = self.controlnet( - base_model=self.unet, - sample=latent_model_input, - timestep=t, - encoder_hidden_states=prompt_embeds, - controlnet_cond=image, - conditioning_scale=controlnet_conditioning_scale, - cross_attention_kwargs=cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - return_dict=True, - ).sample + dont_control = ( + i / len(timesteps) < control_guidance_start or (i + 1) / len(timesteps) > control_guidance_end + ) + if dont_control: + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=True, + ).sample + else: + noise_pred = self.controlnet( + base_model=self.unet, + sample=latent_model_input, + timestep=t, + encoder_hidden_states=prompt_embeds, + controlnet_cond=image, + conditioning_scale=controlnet_conditioning_scale, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=True, + ).sample # perform guidance if do_classifier_free_guidance: From 7b05af47747e2664cd6cf69a6ed7038c6a710a4f Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 17 Nov 2023 12:03:21 +0100 Subject: [PATCH 48/88] Update controlnetxs_sdxl.md --- .../en/api/pipelines/controlnetxs_sdxl.md | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md index 755f18341d20..eb89fec724ca 100644 --- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -12,15 +12,9 @@ specific language governing permissions and limitations under the License. 
# ControlNet with Stable Diffusion XL -ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. +ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produces good results. -With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. - -The abstract from the paper is: - -*We present ControlNet, a neural network architecture to add spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models, and reuses their deep and robust encoding layers pretrained with billions of images as a strong backbone to learn a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise could affect the finetuning. We test various conditioning controls, eg, edges, depth, segmentation, human pose, etc, with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust with small (<50k) and large (>1m) datasets. Extensive results show that ControlNet may facilitate wider applications to control image diffusion models.* - -You can find additional smaller Stable Diffusion XL (SDXL) ControlNet checkpoints from the 🤗 [Diffusers](https://huggingface.co/diffusers) Hub organization, and browse [community-trained](https://huggingface.co/models?other=stable-diffusion-xl&other=controlnet) checkpoints on the Hub. +As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. @@ -36,18 +30,8 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) -## StableDiffusionXLControlNetPipeline -[[autodoc]] StableDiffusionXLControlNetPipeline - - all - - __call__ - -## StableDiffusionXLControlNetImg2ImgPipeline -[[autodoc]] StableDiffusionXLControlNetImg2ImgPipeline - - all - - __call__ - -## StableDiffusionXLControlNetInpaintPipeline -[[autodoc]] StableDiffusionXLControlNetInpaintPipeline +## StableDiffusionXLControlNetXSPipeline +[[autodoc]] StableDiffusionXLControlNetXSPipeline - all - __call__ From 5f99f7983905ce642b5d31b9ce1091e6fe71cce8 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 17 Nov 2023 12:21:32 +0100 Subject: [PATCH 49/88] tried to fix copies.. 
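Editor's aside: the doc page rewritten above now points at `StableDiffusionXLControlNetXSPipeline`. A minimal usage sketch may help readers; it mirrors the slow depth integration test added later in this series, and the checkpoint ids, input image, and call arguments are taken from that test rather than from any published doc example:

```py
# Sketch based on the SDXL depth test added later in this patch series; not an
# official example. Assumes enough GPU memory or sequential CPU offloading.
import torch
from diffusers import ControlNetXSModel, StableDiffusionXLControlNetXSPipeline
from diffusers.utils import load_image

controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-depth")
pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet
)
pipe.enable_sequential_cpu_offload()

depth_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
)
image = pipe(
    "Stormtrooper's lecture",
    image=depth_image,
    generator=torch.Generator(device="cpu").manual_seed(0),
    num_inference_steps=30,
).images[0]
image.save("stormtrooper.png")
```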
--- .../pipelines/alt_diffusion/pipeline_alt_diffusion.py | 1 + .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 1 + .../pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py | 3 +++ 3 files changed, 5 insertions(+) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index abbb87f307da..ca2db0e5d92b 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -106,6 +106,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] _exclude_from_cpu_offload = ["safety_checker"] diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 6517c3516ea3..c779c7b4d6d0 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -144,6 +144,7 @@ class AltDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] _exclude_from_cpu_offload = ["safety_checker"] diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index d0e18a2d17d4..4743e3231349 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -874,6 +874,9 @@ def __call__( as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
Examples: From 85dc48510b872c79f0a61004dfd38a541d821f15 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 17 Nov 2023 15:18:29 +0100 Subject: [PATCH 50/88] Fixed norm_num_groups in from_unet --- src/diffusers/models/controlnetxs.py | 18 +++++++++++++++++- .../alt_diffusion/pipeline_alt_diffusion.py | 2 +- .../pipeline_alt_diffusion_img2img.py | 2 +- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index fbb887dd46dd..b8b09f7895eb 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -405,8 +405,24 @@ def from_unet( if dim_attention_heads is not None: num_attention_heads = [math.ceil(c / dim_attention_heads) for c in block_out_channels] + def group_norms_match_channel_sizes(num_groups, channel_sizes): + return all(c % num_groups == 0 for c in channel_sizes) + if norm_num_groups is None: - norm_num_groups = int(unet.config.norm_num_groups * size_ratio) + if group_norms_match_channel_sizes(unet.config.norm_num_groups, block_out_channels): + norm_num_groups = unet.config.norm_num_groups + else: + if not size_ratio: + raise ValueError( + f"`block_out_channels` ({block_out_channels}) don't match the base models `norm_num_groups` ({unet.config.norm_num_groups}). Pass `norm_num_groups` explicitly so it divides all block_out_channels." + ) + + # try to scale down `norm_num_groups` by `size_ratio` + norm_num_groups = int(unet.config.norm_num_groups * size_ratio) + if not group_norms_match_channel_sizes(norm_num_groups, block_out_channels): + raise ValueError( + f"`block_out_channels` ({block_out_channels}) don't match the base models `norm_num_groups` ({unet.config.norm_num_groups}). Dividing `norm_num_groups` by `size_ratio` ({size_ratio}) didn't fix this. Pass `norm_num_groups` explicitly so it divides all block_out_channels." + ) def get_time_emb_input_dim(unet: UNet2DConditionModel): return unet.time_embedding.linear_1.in_features diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index ca2db0e5d92b..335df9e6f461 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -106,7 +106,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ - + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] _exclude_from_cpu_offload = ["safety_checker"] diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index c779c7b4d6d0..7f24bad90f8d 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -144,7 +144,7 @@ class AltDiffusionImg2ImgPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
""" - + model_cpu_offload_seq = "text_encoder->unet->vae" _optional_components = ["safety_checker", "feature_extractor"] _exclude_from_cpu_offload = ["safety_checker"] From c76eb352804a0da49172f3571e0c597b9c701d2b Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 17 Nov 2023 15:44:11 +0100 Subject: [PATCH 51/88] added sdxl-depth test --- .../controlnetxs/test_controlnetxs_sdxl.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index 8aa32a7156fd..fb52c12df7a1 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -361,26 +361,25 @@ def test_canny(self): expected_image = np.array([0.4359, 0.4335, 0.4609, 0.4515, 0.4669, 0.4494, 0.452, 0.4493, 0.4382]) assert np.allclose(original_image, expected_image, atol=1e-04) - # ToDo Umer: Implement depth and enable this test - # def test_depth(self): - # controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-depth-sdxl-1.0") + def test_depth(self): + controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-depth") - # pipe = StableDiffusionXLControlNetPipeline.from_pretrained( - # "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet - # ) - # pipe.enable_sequential_cpu_offload() - # pipe.set_progress_bar_config(disable=None) + pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet + ) + pipe.enable_sequential_cpu_offload() + pipe.set_progress_bar_config(disable=None) - # generator = torch.Generator(device="cpu").manual_seed(0) - # prompt = "Stormtrooper's lecture" - # image = load_image( - # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - # ) + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "Stormtrooper's lecture" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" + ) - # images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images + images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images - # assert images[0].shape == (512, 512, 3) + assert images[0].shape == (512, 512, 3) - # original_image = images[0, -3:, -3:, -1].flatten() - # expected_image = np.array([0.4399, 0.5112, 0.5478, 0.4314, 0.472, 0.4823, 0.4647, 0.4957, 0.4853]) - # assert np.allclose(original_image, expected_image, atol=1e-04) + original_image = images[0, -3:, -3:, -1].flatten() + expected_image = np.array([0.4411, 0.3617, 0.2654, 0.266 , 0.3449, 0.3898, 0.3745, 0.353 , 0.326]) + assert np.allclose(original_image, expected_image, atol=1e-04) From 08668ae35c3f9dbd6b61f75b65496ba44d7b25f0 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 20 Nov 2023 21:49:54 +0100 Subject: [PATCH 52/88] created SD2.1 controlnet-xs pipeline --- src/diffusers/__init__.py | 2 + src/diffusers/models/controlnetxs.py | 112 ++- src/diffusers/pipelines/__init__.py | 2 + .../pipelines/controlnet_xs/__init__.py | 2 + .../controlnet_xs/pipeline_controlnet_xs.py | 946 ++++++++++++++++++ .../pipeline_controlnet_xs_sd_xl.py | 2 +- 6 files changed, 1033 insertions(+), 33 deletions(-) create mode 100644 
src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 98b7fe79a570..2ee6c5c3025e 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -246,6 +246,7 @@ "StableDiffusionControlNetImg2ImgPipeline", "StableDiffusionControlNetInpaintPipeline", "StableDiffusionControlNetPipeline", + "StableDiffusionControlNetXSPipeline", "StableDiffusionDepth2ImgPipeline", "StableDiffusionDiffEditPipeline", "StableDiffusionGLIGENPipeline", @@ -594,6 +595,7 @@ StableDiffusionControlNetImg2ImgPipeline, StableDiffusionControlNetInpaintPipeline, StableDiffusionControlNetPipeline, + StableDiffusionControlNetXSPipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionDiffEditPipeline, StableDiffusionGLIGENPipeline, diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index b8b09f7895eb..afc5065a64f3 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -94,15 +94,25 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): # to delete later @classmethod - def create_as_in_paper(cls, base_model: UNet2DConditionModel): - return ControlNetXSModel.from_unet( - base_model, - time_embedding_mix=0.95, - learn_embedding=True, - size_ratio=0.1, - dim_attention_heads=64, - conditioning_block_sizes=(16, 32, 96, 256), - ) + def create_as_in_paper(cls, base_model: UNet2DConditionModel, sdxl=True): + if sdxl: + return ControlNetXSModel.from_unet( + base_model, + time_embedding_mix=0.95, + learn_embedding=True, + size_ratio=0.1, + dim_attention_heads=64, + conditioning_block_sizes=(16, 32, 96, 256), + ) + else: + return ControlNetXSModel.from_unet( + base_model, + time_embedding_mix=0.95, + learn_embedding=True, + size_ratio=0.0125, + dim_attention_heads=8, + conditioning_block_sizes=(16, 32, 96, 256), + ) @classmethod def gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control): @@ -273,8 +283,24 @@ def __init__( adjust_time_dims(self.control_model, time_embedding_input_dim, time_embedding_dim) # 2.2 - Allow for information infusion from base model - # todo umer: the assumption that block sizes = changing subblock sizes is false, eg when we have consecutive blocks of same size - base_block_out_channels = [sz[1] for sz in base_model_channel_sizes["down"] if sz[0] != sz[1]] + def compute_block_out_channels(subblock_channels, layers_per_block): + channels = [] + for i, (_, subblock_out_channels) in enumerate(subblock_channels): + # first subblock is the conv_in + if i==0: + continue + # every block consists of `layers_per_block` resnet/attention subblocks and a down sample subblock + if i %(layers_per_block+1)==0: + channels.append(subblock_out_channels) + # the last block doesn't have a down conv, so is handled separately + if i==len(subblock_channels)-1: + channels.append(subblock_out_channels) + return channels + + base_block_out_channels = compute_block_out_channels( + subblock_channels=base_model_channel_sizes["down"], + layers_per_block=layers_per_block + ) extra_channels = list( zip(base_block_out_channels[0:1] + base_block_out_channels[:-1], base_block_out_channels) @@ -405,6 +431,18 @@ def from_unet( if dim_attention_heads is not None: num_attention_heads = [math.ceil(c / dim_attention_heads) for c in block_out_channels] + # check that attention heads and group norms match channel sizes + # - attention heads + def attn_heads_match_channel_sizes(attn_heads, channel_sizes): + return all(c % a == 0 for a,c in zip(attn_heads,channel_sizes)) 
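Editor's aside: the `compute_block_out_channels` helper added earlier in this patch recovers per-block output channels from the flat list of (in, out) subblock sizes. A standalone sketch with hand-written numbers for an SD-style UNet (`block_out_channels=(320, 640, 1280, 1280)`, `layers_per_block=2`) shows what it is expected to return; the subblock list below is constructed for the example, not taken from the diff:

```py
# Standalone copy of the helper for illustration, fed with a hand-written
# subblock list: conv_in + 4 down blocks with 2 resnets each, and a downsampler
# after each of the first three blocks.
def compute_block_out_channels(subblock_channels, layers_per_block):
    channels = []
    for i, (_, subblock_out_channels) in enumerate(subblock_channels):
        if i == 0:  # first subblock is the conv_in
            continue
        if i % (layers_per_block + 1) == 0:  # last subblock of a block (its downsampler)
            channels.append(subblock_out_channels)
        if i == len(subblock_channels) - 1:  # last block has no downsampler
            channels.append(subblock_out_channels)
    return channels


down_subblocks = [
    (4, 320),                                  # conv_in
    (320, 320), (320, 320), (320, 320),        # block 1: resnet, resnet, downsampler
    (320, 640), (640, 640), (640, 640),        # block 2
    (640, 1280), (1280, 1280), (1280, 1280),   # block 3
    (1280, 1280), (1280, 1280),                # block 4 (no downsampler)
]
print(compute_block_out_channels(down_subblocks, layers_per_block=2))
# [320, 640, 1280, 1280]
```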
+ + attention_head_dim = num_attention_heads or unet.config.attention_head_dim + if not attn_heads_match_channel_sizes(attention_head_dim, block_out_channels): + raise ValueError( + f"The number of attention heads ({attention_head_dim}) must divide `block_out_channels` ({block_out_channels}). If you didn't set `num_attention_heads` or `attention_head_dim` the default settings don't match your model. Set one of them manually." + ) + + # - group norms def group_norms_match_channel_sizes(num_groups, channel_sizes): return all(c % num_groups == 0 for c in channel_sizes) @@ -412,16 +450,15 @@ def group_norms_match_channel_sizes(num_groups, channel_sizes): if group_norms_match_channel_sizes(unet.config.norm_num_groups, block_out_channels): norm_num_groups = unet.config.norm_num_groups else: - if not size_ratio: - raise ValueError( - f"`block_out_channels` ({block_out_channels}) don't match the base models `norm_num_groups` ({unet.config.norm_num_groups}). Pass `norm_num_groups` explicitly so it divides all block_out_channels." - ) + norm_num_groups = min(block_out_channels) - # try to scale down `norm_num_groups` by `size_ratio` - norm_num_groups = int(unet.config.norm_num_groups * size_ratio) - if not group_norms_match_channel_sizes(norm_num_groups, block_out_channels): + if group_norms_match_channel_sizes(norm_num_groups, block_out_channels): + print( + f"`norm_num_groups` was set to `min(block_out_channels)` (={norm_num_groups}) so it divides all block_out_channels` ({block_out_channels}). Set it explicitly to remove this information." + ) + else: raise ValueError( - f"`block_out_channels` ({block_out_channels}) don't match the base models `norm_num_groups` ({unet.config.norm_num_groups}). Dividing `norm_num_groups` by `size_ratio` ({size_ratio}) didn't fix this. Pass `norm_num_groups` explicitly so it divides all block_out_channels." + f"`block_out_channels` ({block_out_channels}) don't match the base models `norm_num_groups` ({unet.config.norm_num_groups}). Setting `norm_num_groups` to `min(norm_num_groups)` ({norm_num_groups}) didn't fix this. Pass `norm_num_groups` explicitly so it divides all block_out_channels." ) def get_time_emb_input_dim(unet: UNet2DConditionModel): @@ -434,7 +471,7 @@ def get_time_emb_dim(unet: UNet2DConditionModel): kwargs = dict(unet.config) kwargs.update(block_out_channels=block_out_channels) if num_attention_heads is not None: - kwargs.update(attention_head_dim=num_attention_heads) + kwargs.update(attention_head_dim=attention_head_dim) kwargs.update(norm_num_groups=norm_num_groups) # time embedding of control unet is not used. So remove params for them. 
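Editor's aside: the reworked `norm_num_groups` fallback above is easier to see with concrete numbers. A short sketch, assuming the control model's block channels come out at (4, 8, 16, 16) (roughly what `size_ratio=0.0125` from `create_as_in_paper` would give for a 320/640/1280/1280 base UNet):

```py
# Illustrative numbers only: a base UNet with norm_num_groups=32, scaled down
# to a tiny control model with block channels (4, 8, 16, 16).
def group_norms_match_channel_sizes(num_groups, channel_sizes):
    return all(c % num_groups == 0 for c in channel_sizes)


base_norm_num_groups = 32
control_block_out_channels = (4, 8, 16, 16)

if group_norms_match_channel_sizes(base_norm_num_groups, control_block_out_channels):
    norm_num_groups = base_norm_num_groups
else:
    # fall back to the smallest channel count, which divides every entry here
    norm_num_groups = min(control_block_out_channels)

print(norm_num_groups)  # 4
```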
@@ -883,26 +920,37 @@ def is_iterable(o): def to_sub_blocks(blocks): if not is_iterable(blocks): blocks = [blocks] + sub_blocks = [] + for b in blocks: - current_subblocks = [] if hasattr(b, "resnets"): if hasattr(b, "attentions") and b.attentions is not None: - current_subblocks = list(zip_longest(b.resnets, b.attentions)) - # if we have 1 more resnets than attentions, let the last subblock only be the resnet, not (resnet, None) - if current_subblocks[-1][1] is None: - current_subblocks[-1] = current_subblocks[-1][0] + for r,a in zip(b.resnets, b.attentions): + sub_blocks.append([r,a]) + + num_resnets = len(b.resnets) + num_attns = len(b.attentions) + + if num_resnets > num_attns: + # we can have more resnets than attentions, so add each resnet as separate subblock + for i in range(num_attns, num_resnets): + sub_blocks.append([b.resnets[i]]) else: - current_subblocks = list(b.resnets) - # upsamplers are part of the same block # q: what if we have multiple upsamplers? + for r in b.resnets: + sub_blocks.append([r]) + + # upsamplers are part of the same subblock if hasattr(b, "upsamplers") and b.upsamplers is not None: - current_subblocks[-1] = list(current_subblocks[-1]) + list(b.upsamplers) - # downsamplers are own block + for u in b.upsamplers: + sub_blocks[-1].extend([u]) + + # downsamplers are own subblock if hasattr(b, "downsamplers") and b.downsamplers is not None: - current_subblocks.append(list(b.downsamplers)) - sub_blocks += current_subblocks - return list(map(EmbedSequential, sub_blocks)) + for d in b.downsamplers: + sub_blocks.append([d]) + return list(map(EmbedSequential, sub_blocks)) def zero_module(module): for p in module.parameters(): diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 4a6d08e6b9b0..9781d5de60de 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -89,6 +89,7 @@ ) _import_structure["controlnet_xs"].extend( [ + "StableDiffusionControlNetXSPipeline", "StableDiffusionXLControlNetXSPipeline", ] ) @@ -321,6 +322,7 @@ StableDiffusionXLControlNetPipeline, ) from .controlnet_xs import ( + StableDiffusionControlNetXSPipeline, StableDiffusionXLControlNetXSPipeline, ) from .deepfloyd_if import ( diff --git a/src/diffusers/pipelines/controlnet_xs/__init__.py b/src/diffusers/pipelines/controlnet_xs/__init__.py index 669dc0419456..978278b184f9 100644 --- a/src/diffusers/pipelines/controlnet_xs/__init__.py +++ b/src/diffusers/pipelines/controlnet_xs/__init__.py @@ -22,6 +22,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: + _import_structure["pipeline_controlnet_xs"] = ["StableDiffusionControlNetXSPipeline"] _import_structure["pipeline_controlnet_xs_sd_xl"] = ["StableDiffusionXLControlNetXSPipeline"] try: if not (is_transformers_available() and is_flax_available()): @@ -42,6 +43,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * else: + from .pipeline_controlnet_xs import StableDiffusionControlNetXSPipeline from .pipeline_controlnet_xs_sd_xl import StableDiffusionXLControlNetXSPipeline try: diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py new file mode 100644 index 000000000000..c963f85c039a --- /dev/null +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -0,0 +1,946 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ControlNetXSModel, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor +from ..pipeline_utils import DiffusionPipeline +from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# TODO umer +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # !pip install opencv-python transformers accelerate + >>> from diffusers import StableDiffusionControlNetXSPipeline, ControlNetXSModel, UniPCMultistepScheduler + >>> from diffusers.utils import load_image + >>> import numpy as np + >>> import torch + + >>> import cv2 + >>> from PIL import Image + + >>> # download an image + >>> image = load_image( + ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" + ... ) + >>> image = np.array(image) + + >>> # get canny image + >>> image = cv2.Canny(image, 100, 200) + >>> image = image[:, :, None] + >>> image = np.concatenate([image, image, image], axis=2) + >>> canny_image = Image.fromarray(image) + + >>> # load control net and stable diffusion v1-5 + >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) + >>> pipe = StableDiffusionControlNetPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 + ... ) + + >>> # speed up diffusion process with faster scheduler and memory optimization + >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + >>> # remove following line if xformers is not installed + >>> pipe.enable_xformers_memory_efficient_attention() + + >>> pipe.enable_model_cpu_offload() + + >>> # generate image + >>> generator = torch.manual_seed(0) + >>> image = pipe( + ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image + ... ).images[0] + ``` +""" + + +class StableDiffusionControlNetXSPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): + r""" + Pipeline for text-to-image generation using Stable Diffusion with ControlNet-XS guidance. 
+ + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`ControlNetXSModel`]): + Provides additional conditioning to the `unet` during the denoising process. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + model_cpu_offload_seq = "text_encoder->unet->vae>controlnet" + _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: ControlNetXSModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetXSModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetXSModel) + ): + self.check_image(image, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetXSModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetXSModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + else: + assert False + + start, end = control_guidance_start, control_guidance_end + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
+ """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: float = 0.0, + control_guidance_end: float = 1.0, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The ControlNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single ControlNet. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). 
+ num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + image, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare image + if isinstance(controlnet, ControlNetXSModel): + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, + ) + height, width = image.shape[-2:] + else: + assert False + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + is_unet_compiled = is_compiled_module(self.unet) + is_controlnet_compiled = is_compiled_module(self.controlnet) + is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1") + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Relevant thread: + # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428 + if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1: + torch._inductor.cudagraph_mark_step_begin() + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + dont_control = ( + i / len(timesteps) < control_guidance_start or (i + 1) / len(timesteps) > control_guidance_end + ) + if dont_control: + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=True, + ).sample + else: + noise_pred = self.controlnet( + base_model=self.unet, + sample=latent_model_input, + timestep=t, + encoder_hidden_states=prompt_embeds, + controlnet_cond=image, + conditioning_scale=controlnet_conditioning_scale, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=True, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 4743e3231349..5ec506f7d657 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -95,7 +95,7 @@ 
class StableDiffusionXLControlNetXSPipeline( DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin ): r""" - Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance. + Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet-XS guidance. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.). From ad1f39f789b09631bea35db7108b9dbc4ca2163b Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Wed, 22 Nov 2023 11:50:56 +0100 Subject: [PATCH 53/88] re-added debug logs --- src/diffusers/models/controlnetxs.py | 40 +++++++- src/diffusers/umer_debug_logger.py | 137 +++++++++++++++++++++++++++ 2 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 src/diffusers/umer_debug_logger.py diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index afc5065a64f3..edc1f9fa2ac5 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -39,7 +39,7 @@ Upsample2D, ) from .unet_2d_condition import UNet2DConditionModel - +from ..umer_debug_logger import udl logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -726,44 +726,77 @@ def forward( ctrl_mid_subblocks = to_sub_blocks([self.control_model.mid_block]) base_up_subblocks = to_sub_blocks(base_model.up_blocks) + udl.log_if('prep.x', sample, condition='SUBBLOCK') + udl.log_if('prep.temb', temb, condition='SUBBLOCK') + udl.log_if('prep.context', cemb, condition='SUBBLOCK') + udl.log_if('prep.raw_hint', controlnet_cond,condition='SUBBLOCK') + udl.log_if('prep.guided_hint', guided_hint, condition='SUBBLOCK') + # Cross Control # 0 - conv in h_base = base_model.conv_in(h_base) + udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') + h_ctrl = self.control_model.conv_in(h_ctrl) + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') + if guided_hint is not None: h_ctrl += guided_hint - h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) + udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') + hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 1 - down + RUN_ONCE = ('SUBBLOCK', 'SUBBLOCK-MINUS-1') + udl.print_if('------ enc ------', conditions=RUN_ONCE) + for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') + h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock + udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') + h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock + udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base + udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 2 - mid h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl + udl.log_if('enc.h_ctrl', h_ctrl, 'SUBBLOCK') + for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): h_base = m_base(h_base, temb, cemb, attention_mask, 
cross_attention_kwargs) # B - apply base subblock h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base + udl.log_if('mid.h_base', h_base, condition='SUBBLOCK') + udl.log_if('mid.h_ctrl', h_ctrl, condition='SUBBLOCK') # 3 - up - for m_base in base_up_subblocks: + for i, m_base in enumerate(base_up_subblocks): + udl.print_if(f'> processing up subblock {i}','SUBBLOCK') h_base = h_base + next(it_up_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder + udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) + udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') h_base = base_model.conv_norm_out(h_base) h_base = base_model.conv_act(h_base) h_base = base_model.conv_out(h_base) + udl.log_if('conv_out.h_base', h_base, condition='SUBBLOCK') + udl.stop_if('SUBBLOCK', 'The subblocks are cought. Let us gaze into their soul, their very essence.') + if not return_dict: return h_base @@ -952,6 +985,7 @@ def to_sub_blocks(blocks): return list(map(EmbedSequential, sub_blocks)) + def zero_module(module): for p in module.parameters(): nn.init.zeros_(p) diff --git a/src/diffusers/umer_debug_logger.py b/src/diffusers/umer_debug_logger.py new file mode 100644 index 000000000000..831536ac9b8b --- /dev/null +++ b/src/diffusers/umer_debug_logger.py @@ -0,0 +1,137 @@ + +# Logger to help me (UmerHA) debug controlnet-xs + +import csv +import inspect +import os +import shutil +from datetime import datetime +from types import SimpleNamespace + +import torch + + +class UmerDebugLogger: + _FILE = "udl.csv" + + def __init__(self, log_dir="logs", condition=None): + self.log_dir, self.condition, self.tensor_counter = log_dir, condition, 0 + os.makedirs(log_dir, exist_ok=True) + self.fields = ["timestamp", "cls", "fn", "shape", "msg", "condition", "tensor_file"] + self.create_file() + self.warned_of_no_condition = False + print( + "Info: `UmerDebugLogger` created. This is a logging class that will be deleted when the PR to integrate ControlNet-XS is done." 
+ ) + + @property + def full_file_path(self): + return os.path.join(self.log_dir, self._FILE) + + def create_file(self): + file = self.full_file_path + if not os.path.isfile(file): + with open(file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=self.fields) + writer.writeheader() + + def set_dir(self, log_dir, clear=False): + self.log_dir = log_dir + if clear: + self.clear_logs() + self.create_file() + + def clear_logs(self): + shutil.rmtree(self.log_dir, ignore_errors=True) + os.makedirs(self.log_dir, exist_ok=True) + self.create_file() + + def set_condition(self, condition): + self.condition = condition + + def log_if(self, msg, t, condition, *, print_=False): + self.maybe_warn_of_no_condition() + + # Use inspect to get the current frame and then go back one level to find caller + frame = inspect.currentframe() + caller_frame = frame.f_back + caller_info = inspect.getframeinfo(caller_frame) + + # Extract class and function name from the caller + cls_name = ( + caller_frame.f_locals.get("self", None).__class__.__name__ if "self" in caller_frame.f_locals else None + ) + function_name = caller_info.function + + if not hasattr(t, "shape"): + t = torch.tensor(t) + t = t.cpu().detach() + + if condition == self.condition: + # Save tensor to a file + tensor_filename = f"tensor_{self.tensor_counter}.pt" + torch.save(t, os.path.join(self.log_dir, tensor_filename)) + self.tensor_counter += 1 + + # Log information to CSV + log_info = { + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "cls": cls_name, + "fn": function_name, + "shape": str(list(t.shape)), + "msg": msg, + "condition": condition, + "tensor_file": tensor_filename, + } + + with open(self.full_file_path, "a", newline="") as f: + writer = csv.DictWriter(f, fieldnames=self.fields) + writer.writerow(log_info) + + if print_: + print(f"{msg}\t{t.flatten()[:10]}") + + def print_if(self, msg, conditions, end="\n"): + self.maybe_warn_of_no_condition() + if not isinstance(conditions, (tuple, list)): + conditions = [conditions] + if any(self.condition == c for c in conditions): + print(msg, end=end) + + def stop_if(self, condition, funny_msg): + if condition == self.condition: + print(funny_msg) + raise SystemExit(funny_msg) + + def maybe_warn_of_no_condition(self): + if self.condition is None and not self.warned_of_no_condition: + print("Info: No condition set for UmerDebugLogger") + self.warned_of_no_condition = True + + def get_log_objects(self): + log_objects = [] + file = self.full_file_path + with open(file, newline="") as f: + reader = csv.DictReader(f) + for row in reader: + row["tensor"] = torch.load(os.path.join(self.log_dir, row["tensor_file"])) + row["head"] = row["tensor"].flatten()[:10] + del row["tensor_file"] + log_objects.append(SimpleNamespace(**row)) + return log_objects + + @classmethod + def load_log_objects_from_dir(self, log_dir): + file = os.path.join(log_dir, self._FILE) + log_objects = [] + with open(file, newline="") as f: + reader = csv.DictReader(f) + for row in reader: + row["t"] = torch.load(os.path.join(log_dir, row["tensor_file"])) + row["head"] = row["t"].flatten()[:10] + del row["tensor_file"] + log_objects.append(SimpleNamespace(**row)) + return log_objects + + +udl = UmerDebugLogger() \ No newline at end of file From 37cbe6c90082a09b53eac44e0bcbe63cf0721ac9 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 23 Nov 2023 15:32:53 +0100 Subject: [PATCH 54/88] Adjusting group norm ; readded logs --- src/diffusers/models/attention.py | 13 +++++++ 
src/diffusers/models/controlnetxs.py | 51 ++++++++++++++++++++++---- src/diffusers/models/resnet.py | 14 +++++++ src/diffusers/models/transformer_2d.py | 6 +++ src/diffusers/umer_debug_logger.py | 4 +- 5 files changed, 80 insertions(+), 8 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 132aee92c5c8..bded3d8ac544 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -16,6 +16,7 @@ import torch from torch import nn +from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND from ..utils.torch_utils import maybe_allow_in_graph from .activations import GEGLU, GELU, ApproximateGELU @@ -271,6 +272,10 @@ def forward( if hidden_states.ndim == 4: hidden_states = hidden_states.squeeze(1) + udl.log_if("norm1", norm_hidden_states, "SUBBLOCK-MINUS-1") + udl.log_if("attn1", attn_output, "SUBBLOCK-MINUS-1") + udl.log_if("add attn1", hidden_states, "SUBBLOCK-MINUS-1") + # 2.5 GLIGEN Control if gligen_kwargs is not None: hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) @@ -298,6 +303,10 @@ def forward( **cross_attention_kwargs, ) hidden_states = attn_output + hidden_states + udl.log_if("norm2", norm_hidden_states, "SUBBLOCK-MINUS-1") + udl.log_if("context", encoder_hidden_states, "SUBBLOCK-MINUS-1") + udl.log_if("attn2", attn_output, "SUBBLOCK-MINUS-1") + udl.log_if("add attn2", hidden_states, "SUBBLOCK-MINUS-1") # 4. Feed-forward if not self.use_ada_layer_norm_single: @@ -337,6 +346,10 @@ def forward( if hidden_states.ndim == 4: hidden_states = hidden_states.squeeze(1) + udl.log_if("norm3", norm_hidden_states, "SUBBLOCK-MINUS-1") + udl.log_if("ff", ff_output, "SUBBLOCK-MINUS-1") + udl.log_if("add ff", hidden_states, "SUBBLOCK-MINUS-1") + return hidden_states diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index edc1f9fa2ac5..23aefcf53b2d 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -87,7 +87,8 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): Wether to use time embedding of the control model. If yes, the time embedding is a linear interpolation of the time embeddings of the control and base model with interpolation parameter `time_embedding_mix**3`. time_embedding_mix (`float`, defaults to 1.0): - Linear interpolation parameter used if `learn_embedding` is `True`. + Linear interpolation parameter used if `learn_embedding` is `True`. A value of 1.0 means only the + control model's time embedding will be used. A value of 0.0 means only the base model's time embedding will be used. base_model_channel_sizes (`Dict[str, List[Tuple[int]]]`): Channel sizes of each subblock of base model. Use `gather_subblock_sizes` on your base model to compute it. 
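The `learn_embedding`/`time_embedding_mix` pair documented above controls how the control model's and the base model's time embeddings are blended. A minimal sketch of the blend as it appears in `forward()` later in this file; tensor names are placeholders, and note the docstring above says `time_embedding_mix**3` while the code computes the weight with an exponent of 0.3:

import torch

def blend_time_embeddings(ctrl_temb: torch.Tensor, base_temb: torch.Tensor, time_embedding_mix: float) -> torch.Tensor:
    # Only used when learn_embedding is True; otherwise the base model's embedding is used unchanged.
    w = time_embedding_mix ** 0.3
    return ctrl_temb * w + base_temb * (1 - w)

# time_embedding_mix = 1.0 -> only the control model's embedding contributes,
# time_embedding_mix = 0.0 -> only the base model's embedding contributes.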
""" @@ -107,7 +108,7 @@ def create_as_in_paper(cls, base_model: UNet2DConditionModel, sdxl=True): else: return ControlNetXSModel.from_unet( base_model, - time_embedding_mix=0.95, + time_embedding_mix=1.0, learn_embedding=True, size_ratio=0.0125, dim_attention_heads=8, @@ -312,6 +313,9 @@ def compute_block_out_channels(subblock_channels, layers_per_block): increase_block_input_in_encoder_downsampler(self.control_model, block_no=i, by=e2) increase_block_input_in_mid_resnet(self.control_model, by=base_block_out_channels[-1]) + # 2.3 - Make group norms work with modified channel sizes + adjust_group_norms(self.control_model) + # 3 - Gather Channel Sizes self.ch_inout_ctrl = ControlNetXSModel.gather_subblock_sizes(self.control_model, base_or_control="control") self.ch_inout_base = base_model_channel_sizes @@ -649,6 +653,8 @@ def forward( # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timesteps = timesteps.expand(sample.shape[0]) + print(f"Timesteps = {timesteps}") + t_emb = base_model.time_proj(timesteps) # timesteps does not contain any weights and will always return f32 tensors @@ -662,7 +668,12 @@ def forward( interpolation_param = self.config.time_embedding_mix**0.3 temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param) + + print(f"Of course I've not learned a time embedding. I'm smart! Let me collaborate with the base model. Me {interpolation_param:.2f}, him {1-interpolation_param:.2f}") + print(f"> Before: {t_emb.flatten()[:5]}") + print(f"> After: {temb.flatten()[:5]}") else: + print("Nah man, I've not learned any time embedding. Let the base model do it.") temb = base_model.time_embedding(t_emb) # added time & text embeddings @@ -726,11 +737,11 @@ def forward( ctrl_mid_subblocks = to_sub_blocks([self.control_model.mid_block]) base_up_subblocks = to_sub_blocks(base_model.up_blocks) - udl.log_if('prep.x', sample, condition='SUBBLOCK') - udl.log_if('prep.temb', temb, condition='SUBBLOCK') - udl.log_if('prep.context', cemb, condition='SUBBLOCK') - udl.log_if('prep.raw_hint', controlnet_cond,condition='SUBBLOCK') - udl.log_if('prep.guided_hint', guided_hint, condition='SUBBLOCK') + udl.log_if('prep.x', sample, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) + udl.log_if('prep.temb', temb, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) + udl.log_if('prep.context', cemb, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) + udl.log_if('prep.raw_hint', controlnet_cond,condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) + udl.log_if('prep.guided_hint', guided_hint, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) # Cross Control # 0 - conv in @@ -750,6 +761,9 @@ def forward( hs_base.append(h_base) hs_ctrl.append(h_ctrl) + udl.log_if('conv_in.output', h_base, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) + udl.log_if('conv_in.output', h_ctrl, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) + # 1 - down RUN_ONCE = ('SUBBLOCK', 'SUBBLOCK-MINUS-1') udl.print_if('------ enc ------', conditions=RUN_ONCE) @@ -796,6 +810,7 @@ def forward( udl.log_if('conv_out.h_base', h_base, condition='SUBBLOCK') udl.stop_if('SUBBLOCK', 'The subblocks are cought. Let us gaze into their soul, their very essence.') + udl.stop_if('SUBBLOCK-MINUS-1', 'Alright captain. Look at all these tensors we caught. 
Time to do some real analysis.') if not return_dict: return h_base @@ -940,6 +955,28 @@ def increase_block_input_in_mid_resnet(unet: UNet2DConditionModel, by): unet.mid_block.resnets[0].in_channels += by # surgery done here +def adjust_group_norms(unet: UNet2DConditionModel, max_num_group:int=32): + def find_denominator(number, start): + if start >= number: + return number + while (start != 0): + residual = number % start + if residual == 0: + return start + start -= 1 + + # resnets + for d in unet.down_blocks: + for r in d.resnets: + if r.norm1.num_groups < max_num_group: + r.norm1.num_groups = find_denominator(r.norm1.num_channels ,start=32) + + if r.norm2.num_groups < max_num_group: + r.norm2.num_groups = find_denominator(r.norm2.num_channels ,start=32) + + # transformers + pass # TODO + def is_iterable(o): if isinstance(o, str): return False diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 139019eb87c3..40ecb3a34053 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,6 +20,7 @@ import torch.nn as nn import torch.nn.functional as F +from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND from .activations import get_activation from .attention_processor import SpatialNorm @@ -207,6 +208,8 @@ def forward( else: hidden_states = self.Conv2d_0(hidden_states) + udl.log_if("conv", hidden_states, "SUBBLOCK-MINUS-1") + return hidden_states @@ -275,6 +278,8 @@ def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch else: hidden_states = self.conv(hidden_states) + udl.log_if("conv", hidden_states, "SUBBLOCK-MINUS-1") + return hidden_states @@ -698,10 +703,13 @@ def forward( ) -> torch.FloatTensor: hidden_states = input_tensor + udl.log_if("input", hidden_states, condition="SUBBLOCK-MINUS-1") + if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm1(hidden_states, temb) else: hidden_states = self.norm1(hidden_states) + udl.log_if("norm1", hidden_states, condition="SUBBLOCK-MINUS-1") hidden_states = self.nonlinearity(hidden_states) @@ -734,6 +742,8 @@ def forward( hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states) + udl.log_if("conv1", hidden_states, condition="SUBBLOCK-MINUS-1") + if self.time_emb_proj is not None: if not self.skip_time_act: temb = self.nonlinearity(temb) @@ -745,11 +755,13 @@ def forward( if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb + udl.log_if("add time_emb_proj", hidden_states, condition="SUBBLOCK-MINUS-1") if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) + udl.log_if("norm2", hidden_states, condition="SUBBLOCK-MINUS-1") if temb is not None and self.time_embedding_norm == "scale_shift": scale, shift = torch.chunk(temb, 2, dim=1) @@ -758,6 +770,7 @@ def forward( hidden_states = self.nonlinearity(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) + udl.log_if("conv2", hidden_states, condition="SUBBLOCK-MINUS-1") if self.conv_shortcut is not None: input_tensor = ( @@ -765,6 +778,7 @@ def forward( ) output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + udl.log_if("add conv_shortcut", output_tensor, condition="SUBBLOCK-MINUS-1") return output_tensor 
diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 24abf54d6da7..f1617dfbec11 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -20,6 +20,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..models.embeddings import ImagePositionalEmbeddings +from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND, BaseOutput, deprecate from .attention import BasicTransformerBlock from .embeddings import CaptionProjection, PatchEmbed @@ -319,6 +320,8 @@ def forward( residual = hidden_states hidden_states = self.norm(hidden_states) + udl.log_if("norm", hidden_states, condition="SUBBLOCK-MINUS-1") + if not self.use_linear_projection: hidden_states = ( self.proj_in(hidden_states, scale=lora_scale) @@ -336,6 +339,8 @@ def forward( else self.proj_in(hidden_states) ) + udl.log_if("proj_in", hidden_states, condition="SUBBLOCK-MINUS-1") + elif self.is_input_vectorized: hidden_states = self.latent_image_embedding(hidden_states) elif self.is_input_patches: @@ -435,6 +440,7 @@ def forward( output = hidden_states.reshape( shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) ) + udl.log_if("proj_out", output, condition="SUBBLOCK-MINUS-1") if not return_dict: return (output,) diff --git a/src/diffusers/umer_debug_logger.py b/src/diffusers/umer_debug_logger.py index 831536ac9b8b..f0f68d92664c 100644 --- a/src/diffusers/umer_debug_logger.py +++ b/src/diffusers/umer_debug_logger.py @@ -52,6 +52,8 @@ def set_condition(self, condition): def log_if(self, msg, t, condition, *, print_=False): self.maybe_warn_of_no_condition() + if not isinstance(condition, (tuple, list)): condition = [condition] + # Use inspect to get the current frame and then go back one level to find caller frame = inspect.currentframe() caller_frame = frame.f_back @@ -67,7 +69,7 @@ def log_if(self, msg, t, condition, *, print_=False): t = torch.tensor(t) t = t.cpu().detach() - if condition == self.condition: + if self.condition in condition: # Save tensor to a file tensor_filename = f"tensor_{self.tensor_counter}.pt" torch.save(t, os.path.join(self.log_dir, tensor_filename)) From d2c2635de7b271ea3cce48f98e36ae5a0d77073a Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 24 Nov 2023 14:02:50 +0100 Subject: [PATCH 55/88] Added debug log statements --- src/diffusers/models/controlnetxs.py | 41 ++++++++++++++++++---------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 23aefcf53b2d..cd0b946294eb 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -668,12 +668,7 @@ def forward( interpolation_param = self.config.time_embedding_mix**0.3 temb = ctrl_temb * interpolation_param + base_temb * (1 - interpolation_param) - - print(f"Of course I've not learned a time embedding. I'm smart! Let me collaborate with the base model. Me {interpolation_param:.2f}, him {1-interpolation_param:.2f}") - print(f"> Before: {t_emb.flatten()[:5]}") - print(f"> After: {temb.flatten()[:5]}") else: - print("Nah man, I've not learned any time embedding. 
Let the base model do it.") temb = base_model.time_embedding(t_emb) # added time & text embeddings @@ -769,7 +764,10 @@ def forward( udl.print_if('------ enc ------', conditions=RUN_ONCE) for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): - h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl + connection = next(it_down_convs_in) + to_concat = connection(h_base) + + h_ctrl = torch.cat([h_ctrl, to_concat], dim=1) # A - concat base -> ctrl udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock @@ -787,10 +785,22 @@ def forward( # 2 - mid h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl udl.log_if('enc.h_ctrl', h_ctrl, 'SUBBLOCK') + + # Because Heidelberg treats the R/A/R as one block, they first execute the full base mid block, + # then the full ctrl mid block; while I execute them interlaced. + # This doesn't change the computation, but messes up parts of the logs. + # So let's, while debugging, first execute full base mid block and then full ctrl mid block. + + for m_base in base_mid_subblocks: + h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) + + for m_ctrl in ctrl_mid_subblocks: + h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) + + #for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): + # h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock + # h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock - for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock - h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base udl.log_if('mid.h_base', h_base, condition='SUBBLOCK') udl.log_if('mid.h_ctrl', h_ctrl, condition='SUBBLOCK') @@ -965,17 +975,20 @@ def find_denominator(number, start): return start start -= 1 - # resnets - for d in unet.down_blocks: - for r in d.resnets: + for block in [*unet.down_blocks, unet.mid_block]: + # resnets + for r in block.resnets: if r.norm1.num_groups < max_num_group: r.norm1.num_groups = find_denominator(r.norm1.num_channels ,start=32) if r.norm2.num_groups < max_num_group: r.norm2.num_groups = find_denominator(r.norm2.num_channels ,start=32) - # transformers - pass # TODO + # transformers + if hasattr(block, 'attentions'): + for a in block.attentions: + if a.norm.num_groups < max_num_group: + a.norm.num_groups = find_denominator(a.norm.num_channels ,start=32) def is_iterable(o): if isinstance(o, str): From f79411341a8a3b3f20f0dbf2ac280c2460401e4c Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 27 Nov 2023 21:46:12 +0100 Subject: [PATCH 56/88] removed debug logs ; started tests for sd2.1 --- src/diffusers/models/attention.py | 13 - src/diffusers/models/controlnetxs.py | 70 +--- src/diffusers/models/resnet.py | 20 +- src/diffusers/models/transformer_2d.py | 5 - .../controlnet_xs/pipeline_controlnet_xs.py | 2 - src/diffusers/schedulers/scheduling_ddim.py | 1 - src/diffusers/umer_debug_logger.py | 139 ------- .../pipelines/controlnetxs/test_controlnet.py | 382 ++++++++++++++++++ 8 files changed, 391 insertions(+), 241 
deletions(-) delete mode 100644 src/diffusers/umer_debug_logger.py create mode 100644 tests/pipelines/controlnetxs/test_controlnet.py diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index bded3d8ac544..132aee92c5c8 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -16,7 +16,6 @@ import torch from torch import nn -from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND from ..utils.torch_utils import maybe_allow_in_graph from .activations import GEGLU, GELU, ApproximateGELU @@ -272,10 +271,6 @@ def forward( if hidden_states.ndim == 4: hidden_states = hidden_states.squeeze(1) - udl.log_if("norm1", norm_hidden_states, "SUBBLOCK-MINUS-1") - udl.log_if("attn1", attn_output, "SUBBLOCK-MINUS-1") - udl.log_if("add attn1", hidden_states, "SUBBLOCK-MINUS-1") - # 2.5 GLIGEN Control if gligen_kwargs is not None: hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) @@ -303,10 +298,6 @@ def forward( **cross_attention_kwargs, ) hidden_states = attn_output + hidden_states - udl.log_if("norm2", norm_hidden_states, "SUBBLOCK-MINUS-1") - udl.log_if("context", encoder_hidden_states, "SUBBLOCK-MINUS-1") - udl.log_if("attn2", attn_output, "SUBBLOCK-MINUS-1") - udl.log_if("add attn2", hidden_states, "SUBBLOCK-MINUS-1") # 4. Feed-forward if not self.use_ada_layer_norm_single: @@ -346,10 +337,6 @@ def forward( if hidden_states.ndim == 4: hidden_states = hidden_states.squeeze(1) - udl.log_if("norm3", norm_hidden_states, "SUBBLOCK-MINUS-1") - udl.log_if("ff", ff_output, "SUBBLOCK-MINUS-1") - udl.log_if("add ff", hidden_states, "SUBBLOCK-MINUS-1") - return hidden_states diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index cd0b946294eb..0165b71ca5e0 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -39,7 +39,6 @@ Upsample2D, ) from .unet_2d_condition import UNet2DConditionModel -from ..umer_debug_logger import udl logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -653,8 +652,6 @@ def forward( # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timesteps = timesteps.expand(sample.shape[0]) - print(f"Timesteps = {timesteps}") - t_emb = base_model.time_proj(timesteps) # timesteps does not contain any weights and will always return f32 tensors @@ -732,96 +729,43 @@ def forward( ctrl_mid_subblocks = to_sub_blocks([self.control_model.mid_block]) base_up_subblocks = to_sub_blocks(base_model.up_blocks) - udl.log_if('prep.x', sample, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) - udl.log_if('prep.temb', temb, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) - udl.log_if('prep.context', cemb, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) - udl.log_if('prep.raw_hint', controlnet_cond,condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) - udl.log_if('prep.guided_hint', guided_hint, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) - # Cross Control # 0 - conv in h_base = base_model.conv_in(h_base) - udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') - h_ctrl = self.control_model.conv_in(h_ctrl) - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') - if guided_hint is not None: h_ctrl += guided_hint - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') - - h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) - udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base hs_base.append(h_base) hs_ctrl.append(h_ctrl) 
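The `# A` through `# D` comments in this hunk describe one encoder step of the two-stream design: concatenate base features into the control stream, run both subblocks, then add the scaled control features back into the base stream. A self-contained sketch of that pattern, assuming placeholder callables and simplified signatures instead of the real subblocks and connection convolutions:

import torch

def cross_control_down_step(h_base, h_ctrl, base_subblock, ctrl_subblock, conv_in, conv_out, scale, temb, cemb):
    # A - concat base -> ctrl: the control stream sees the current base features
    h_ctrl = torch.cat([h_ctrl, conv_in(h_base)], dim=1)
    # B - apply the base subblock to the base stream
    h_base = base_subblock(h_base, temb, cemb)
    # C - apply the ctrl subblock to the control stream
    h_ctrl = ctrl_subblock(h_ctrl, temb, cemb)
    # D - add ctrl -> base: scaled control features are folded back into the base
    h_base = h_base + conv_out(h_ctrl) * scale
    return h_base, h_ctrl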
- udl.log_if('conv_in.output', h_base, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) - udl.log_if('conv_in.output', h_ctrl, condition=('SUBBLOCK', 'SUBBLOCK-MINUS-1')) - # 1 - down - RUN_ONCE = ('SUBBLOCK', 'SUBBLOCK-MINUS-1') - udl.print_if('------ enc ------', conditions=RUN_ONCE) - for m_base, m_ctrl in zip(base_down_subblocks, ctrl_down_subblocks): - connection = next(it_down_convs_in) - to_concat = connection(h_base) - - h_ctrl = torch.cat([h_ctrl, to_concat], dim=1) # A - concat base -> ctrl - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') - + h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock - udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') - h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock - udl.log_if('enc.h_ctrl', h_ctrl, condition='SUBBLOCK') - - h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base - udl.log_if('enc.h_base', h_base, condition='SUBBLOCK') - + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base hs_base.append(h_base) hs_ctrl.append(h_ctrl) # 2 - mid h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl - udl.log_if('enc.h_ctrl', h_ctrl, 'SUBBLOCK') - - # Because Heidelberg treats the R/A/R as one block, they first execute the full base mid block, - # then the full ctrl mid block; while I execute them interlaced. - # This doesn't change the computation, but messes up parts of the logs. - # So let's, while debugging, first execute full base mid block and then full ctrl mid block. - - for m_base in base_mid_subblocks: - h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) - - for m_ctrl in ctrl_mid_subblocks: - h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) - - #for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): - # h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock - # h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock - + for m_base, m_ctrl in zip(base_mid_subblocks, ctrl_mid_subblocks): + h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock + h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base - udl.log_if('mid.h_base', h_base, condition='SUBBLOCK') - udl.log_if('mid.h_ctrl', h_ctrl, condition='SUBBLOCK') - + # 3 - up for i, m_base in enumerate(base_up_subblocks): - udl.print_if(f'> processing up subblock {i}','SUBBLOCK') h_base = h_base + next(it_up_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder - udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') h_base = torch.cat([h_base, hs_base.pop()], dim=1) # concat info from base encoder+ctrl encoder h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) - udl.log_if('dec.h_base', h_base, condition='SUBBLOCK') h_base = base_model.conv_norm_out(h_base) h_base = base_model.conv_act(h_base) h_base = base_model.conv_out(h_base) - udl.log_if('conv_out.h_base', h_base, condition='SUBBLOCK') - udl.stop_if('SUBBLOCK', 'The subblocks are cought. 
Let us gaze into their soul, their very essence.') - udl.stop_if('SUBBLOCK-MINUS-1', 'Alright captain. Look at all these tensors we caught. Time to do some real analysis.') - if not return_dict: return h_base diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 40ecb3a34053..dba19854349a 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,7 +20,6 @@ import torch.nn as nn import torch.nn.functional as F -from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND from .activations import get_activation from .attention_processor import SpatialNorm @@ -208,8 +207,6 @@ def forward( else: hidden_states = self.Conv2d_0(hidden_states) - udl.log_if("conv", hidden_states, "SUBBLOCK-MINUS-1") - return hidden_states @@ -278,8 +275,6 @@ def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch else: hidden_states = self.conv(hidden_states) - udl.log_if("conv", hidden_states, "SUBBLOCK-MINUS-1") - return hidden_states @@ -703,14 +698,11 @@ def forward( ) -> torch.FloatTensor: hidden_states = input_tensor - udl.log_if("input", hidden_states, condition="SUBBLOCK-MINUS-1") - if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm1(hidden_states, temb) else: hidden_states = self.norm1(hidden_states) - udl.log_if("norm1", hidden_states, condition="SUBBLOCK-MINUS-1") - + hidden_states = self.nonlinearity(hidden_states) if self.upsample is not None: @@ -742,8 +734,6 @@ def forward( hidden_states = self.conv1(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv1(hidden_states) - udl.log_if("conv1", hidden_states, condition="SUBBLOCK-MINUS-1") - if self.time_emb_proj is not None: if not self.skip_time_act: temb = self.nonlinearity(temb) @@ -755,13 +745,11 @@ def forward( if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - udl.log_if("add time_emb_proj", hidden_states, condition="SUBBLOCK-MINUS-1") if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) - udl.log_if("norm2", hidden_states, condition="SUBBLOCK-MINUS-1") if temb is not None and self.time_embedding_norm == "scale_shift": scale, shift = torch.chunk(temb, 2, dim=1) @@ -770,17 +758,13 @@ def forward( hidden_states = self.nonlinearity(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) - udl.log_if("conv2", hidden_states, condition="SUBBLOCK-MINUS-1") if self.conv_shortcut is not None: input_tensor = ( self.conv_shortcut(input_tensor, scale) if not USE_PEFT_BACKEND else self.conv_shortcut(input_tensor) ) - output_tensor = (input_tensor + hidden_states) / self.output_scale_factor - udl.log_if("add conv_shortcut", output_tensor, condition="SUBBLOCK-MINUS-1") - - return output_tensor + return (input_tensor + hidden_states) / self.output_scale_factor # unet_rl.py diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index f1617dfbec11..e76028a69500 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -20,7 +20,6 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..models.embeddings import ImagePositionalEmbeddings -from ..umer_debug_logger import udl from ..utils import USE_PEFT_BACKEND, BaseOutput, 
deprecate from .attention import BasicTransformerBlock from .embeddings import CaptionProjection, PatchEmbed @@ -320,7 +319,6 @@ def forward( residual = hidden_states hidden_states = self.norm(hidden_states) - udl.log_if("norm", hidden_states, condition="SUBBLOCK-MINUS-1") if not self.use_linear_projection: hidden_states = ( @@ -339,8 +337,6 @@ def forward( else self.proj_in(hidden_states) ) - udl.log_if("proj_in", hidden_states, condition="SUBBLOCK-MINUS-1") - elif self.is_input_vectorized: hidden_states = self.latent_image_embedding(hidden_states) elif self.is_input_patches: @@ -440,7 +436,6 @@ def forward( output = hidden_states.reshape( shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) ) - udl.log_if("proj_out", output, condition="SUBBLOCK-MINUS-1") if not return_dict: return (output,) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index c963f85c039a..adaf08d75828 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -859,7 +859,6 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_unet_compiled = is_compiled_module(self.unet) @@ -904,7 +903,6 @@ def __call__( noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] # call the callback, if provided diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index d325cde7d9d4..7ffa43a5848f 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -27,7 +27,6 @@ from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - @dataclass # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM class DDIMSchedulerOutput(BaseOutput): diff --git a/src/diffusers/umer_debug_logger.py b/src/diffusers/umer_debug_logger.py deleted file mode 100644 index f0f68d92664c..000000000000 --- a/src/diffusers/umer_debug_logger.py +++ /dev/null @@ -1,139 +0,0 @@ - -# Logger to help me (UmerHA) debug controlnet-xs - -import csv -import inspect -import os -import shutil -from datetime import datetime -from types import SimpleNamespace - -import torch - - -class UmerDebugLogger: - _FILE = "udl.csv" - - def __init__(self, log_dir="logs", condition=None): - self.log_dir, self.condition, self.tensor_counter = log_dir, condition, 0 - os.makedirs(log_dir, exist_ok=True) - self.fields = ["timestamp", "cls", "fn", "shape", "msg", "condition", "tensor_file"] - self.create_file() - self.warned_of_no_condition = False - print( - "Info: `UmerDebugLogger` created. This is a logging class that will be deleted when the PR to integrate ControlNet-XS is done." 
- ) - - @property - def full_file_path(self): - return os.path.join(self.log_dir, self._FILE) - - def create_file(self): - file = self.full_file_path - if not os.path.isfile(file): - with open(file, "w", newline="") as f: - writer = csv.DictWriter(f, fieldnames=self.fields) - writer.writeheader() - - def set_dir(self, log_dir, clear=False): - self.log_dir = log_dir - if clear: - self.clear_logs() - self.create_file() - - def clear_logs(self): - shutil.rmtree(self.log_dir, ignore_errors=True) - os.makedirs(self.log_dir, exist_ok=True) - self.create_file() - - def set_condition(self, condition): - self.condition = condition - - def log_if(self, msg, t, condition, *, print_=False): - self.maybe_warn_of_no_condition() - - if not isinstance(condition, (tuple, list)): condition = [condition] - - # Use inspect to get the current frame and then go back one level to find caller - frame = inspect.currentframe() - caller_frame = frame.f_back - caller_info = inspect.getframeinfo(caller_frame) - - # Extract class and function name from the caller - cls_name = ( - caller_frame.f_locals.get("self", None).__class__.__name__ if "self" in caller_frame.f_locals else None - ) - function_name = caller_info.function - - if not hasattr(t, "shape"): - t = torch.tensor(t) - t = t.cpu().detach() - - if self.condition in condition: - # Save tensor to a file - tensor_filename = f"tensor_{self.tensor_counter}.pt" - torch.save(t, os.path.join(self.log_dir, tensor_filename)) - self.tensor_counter += 1 - - # Log information to CSV - log_info = { - "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "cls": cls_name, - "fn": function_name, - "shape": str(list(t.shape)), - "msg": msg, - "condition": condition, - "tensor_file": tensor_filename, - } - - with open(self.full_file_path, "a", newline="") as f: - writer = csv.DictWriter(f, fieldnames=self.fields) - writer.writerow(log_info) - - if print_: - print(f"{msg}\t{t.flatten()[:10]}") - - def print_if(self, msg, conditions, end="\n"): - self.maybe_warn_of_no_condition() - if not isinstance(conditions, (tuple, list)): - conditions = [conditions] - if any(self.condition == c for c in conditions): - print(msg, end=end) - - def stop_if(self, condition, funny_msg): - if condition == self.condition: - print(funny_msg) - raise SystemExit(funny_msg) - - def maybe_warn_of_no_condition(self): - if self.condition is None and not self.warned_of_no_condition: - print("Info: No condition set for UmerDebugLogger") - self.warned_of_no_condition = True - - def get_log_objects(self): - log_objects = [] - file = self.full_file_path - with open(file, newline="") as f: - reader = csv.DictReader(f) - for row in reader: - row["tensor"] = torch.load(os.path.join(self.log_dir, row["tensor_file"])) - row["head"] = row["tensor"].flatten()[:10] - del row["tensor_file"] - log_objects.append(SimpleNamespace(**row)) - return log_objects - - @classmethod - def load_log_objects_from_dir(self, log_dir): - file = os.path.join(log_dir, self._FILE) - log_objects = [] - with open(file, newline="") as f: - reader = csv.DictReader(f) - for row in reader: - row["t"] = torch.load(os.path.join(log_dir, row["tensor_file"])) - row["head"] = row["t"].flatten()[:10] - del row["tensor_file"] - log_objects.append(SimpleNamespace(**row)) - return log_objects - - -udl = UmerDebugLogger() \ No newline at end of file diff --git a/tests/pipelines/controlnetxs/test_controlnet.py b/tests/pipelines/controlnetxs/test_controlnet.py new file mode 100644 index 000000000000..8e223cb5f995 --- /dev/null +++ 
b/tests/pipelines/controlnetxs/test_controlnet.py @@ -0,0 +1,382 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import tempfile +import traceback +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + ControlNetXSModel, + DDIMScheduler, + EulerDiscreteScheduler, + LCMScheduler, + StableDiffusionControlNetXSPipeline, + UNet2DConditionModel, +) +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_image, + load_numpy, + require_python39_or_higher, + require_torch_2, + require_torch_gpu, + run_test_in_subprocess, + slow, + torch_device, +) +from diffusers.utils.torch_utils import randn_tensor + +from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, + TEXT_TO_IMAGE_BATCH_PARAMS, + TEXT_TO_IMAGE_IMAGE_PARAMS, + TEXT_TO_IMAGE_PARAMS, +) +from ..test_pipelines_common import ( + PipelineKarrasSchedulerTesterMixin, + PipelineLatentTesterMixin, + PipelineTesterMixin, +) + + +enable_full_determinism() + + +# todo umer: understand & adapt +# Will be run via run_test_in_subprocess +def _test_stable_diffusion_compile(in_queue, out_queue, timeout): + error = None + try: + _ = in_queue.get(timeout=timeout) + + controlnet = ControlNetXSModel.from_pretrained("lllyasviel/sd-controlnet-canny") + + pipe = StableDiffusionControlNetXSPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe.to("cuda") + pipe.set_progress_bar_config(disable=None) + + pipe.unet.to(memory_format=torch.channels_last) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + pipe.controlnet.to(memory_format=torch.channels_last) + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "bird" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ).resize((512, 512)) + + output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np") + image = output.images[0] + + assert image.shape == (512, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy" + ) + expected_image = np.resize(expected_image, (512, 512, 3)) + + assert np.abs(expected_image - image).max() < 1.0 + + except Exception: + error = f"{traceback.format_exc()}" + + results = {"error": error} + out_queue.put(results, timeout=timeout) + out_queue.join() + + +class ControlNetXSPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): + pipeline_class = StableDiffusionControlNetXSPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = 
TEXT_TO_IMAGE_BATCH_PARAMS + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + + def get_dummy_components(self, time_cond_proj_dim=None): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(4, 8), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + norm_num_groups=1, + time_cond_proj_dim=time_cond_proj_dim, + ) + torch.manual_seed(0) + controlnet = ControlNetXSModel( + block_out_channels=(4, 8), + layers_per_block=2, + in_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + cross_attention_dim=32, + conditioning_embedding_out_channels=(16, 32), + norm_num_groups=1, + ) + torch.manual_seed(0) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[4, 8], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + norm_num_groups=2, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "controlnet": controlnet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + "image_encoder": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + controlnet_embedder_scale_factor = 2 + image = randn_tensor( + (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), + generator=generator, + device=torch.device(device), + ) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + "image": image, + } + + return inputs + + def test_attention_slicing_forward_pass(self): + return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(expected_max_diff=2e-3) + + def test_controlnet_lcm(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components(time_cond_proj_dim=256) + sd_pipe = StableDiffusionControlNetXSPipeline(**components) + sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = 
self.get_dummy_inputs(device) + output = sd_pipe(**inputs) + image = output.images + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array( + [0.52700454, 0.3930534, 0.25509018, 0.7132304, 0.53696585, 0.46568912, 0.7095368, 0.7059624, 0.4744786] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + +@slow +@require_torch_gpu +class ControlNetXSPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_canny(self): + controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-S2.1-canny") + + pipe = StableDiffusionControlNetXSPipeline.from_pretrained( + "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet + ) + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "bird" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ) + + output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) + + image = output.images[0] + + assert image.shape == (768, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy" + ) + + assert np.abs(expected_image - image).max() < 9e-2 + + def test_depth(self): + controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-S2.1-depth") + + pipe = StableDiffusionControlNetXSPipeline.from_pretrained( + "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet + ) + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "Stormtrooper's lecture" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" + ) + + output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" + ) + + assert np.abs(expected_image - image).max() < 8e-1 + + @require_python39_or_higher + @require_torch_2 + def test_stable_diffusion_compile(self): + run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None) + + def test_v11_shuffle_global_pool_conditions(self): + controlnet = ControlNetXSModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle") + + pipe = StableDiffusionControlNetXSPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "New York" + image = load_image( + "https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/control.png" + ) + + output = pipe( + prompt, + image, + generator=generator, + output_type="np", + num_inference_steps=3, + guidance_scale=7.0, + ) + + image = output.images[0] + assert image.shape == (512, 640, 3) + + image_slice = image[-3:, -3:, -1] + expected_slice = np.array([0.1338, 0.1597, 0.1202, 0.1687, 0.1377, 0.1017, 0.2070, 0.1574, 0.1348]) + 
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_load_local(self): + controlnet = ControlNetXSModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") + pipe_1 = StableDiffusionControlNetXSPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + + controlnet = ControlNetXSModel.from_single_file( + "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" + ) + pipe_2 = StableDiffusionControlNetXSPipeline.from_single_file( + "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", + safety_checker=None, + controlnet=controlnet, + ) + pipes = [pipe_1, pipe_2] + images = [] + + for pipe in pipes: + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "bird" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ) + + output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) + images.append(output.images[0]) + + del pipe + gc.collect() + torch.cuda.empty_cache() + + assert np.abs(images[0] - images[1]).max() < 1e-3 From e6de144cdda1387d74e3f86669eae828d3bd0864 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 27 Nov 2023 22:07:40 +0100 Subject: [PATCH 57/88] updated sd21 tests --- .../pipelines/controlnetxs/test_controlnet.py | 67 +++++-------------- 1 file changed, 16 insertions(+), 51 deletions(-) diff --git a/tests/pipelines/controlnetxs/test_controlnet.py b/tests/pipelines/controlnetxs/test_controlnet.py index 8e223cb5f995..54bd8cec355e 100644 --- a/tests/pipelines/controlnetxs/test_controlnet.py +++ b/tests/pipelines/controlnetxs/test_controlnet.py @@ -62,6 +62,7 @@ # todo umer: understand & adapt +# -- my understanding is that cnxs can't be compiled because it's not a "full" model. it needs a base model to function. is this correct? 
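Note on the question raised in the added comment above: the ControlNet-XS blocks only run interleaved with a base UNet, so in practice it is the pipeline's base UNet that gets compiled, not the standalone control model. A minimal sketch of that usage (illustration only, not part of the patch; assumes torch 2.x, a CUDA device, and the SD-2.1 checkpoints referenced in these tests):

```py
import torch
from diffusers import ControlNetXSModel, StableDiffusionControlNetXSPipeline

controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny")
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet
).to("cuda")

# The control model is interleaved with the base UNet at runtime,
# so compilation targets the base UNet rather than ControlNet-XS alone.
pipe.unet.to(memory_format=torch.channels_last)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
```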
# Will be run via run_test_in_subprocess def _test_stable_diffusion_compile(in_queue, out_queue, timeout): error = None @@ -132,14 +133,12 @@ def get_dummy_components(self, time_cond_proj_dim=None): time_cond_proj_dim=time_cond_proj_dim, ) torch.manual_seed(0) - controlnet = ControlNetXSModel( - block_out_channels=(4, 8), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - norm_num_groups=1, + controlnet = ControlNetXSModel.from_unet( + unet=unet, + time_embedding_mix=0.95, + learn_embedding=True, + size_ratio=0.5, + conditioning_block_sizes=(16, 32), ) torch.manual_seed(0) scheduler = DDIMScheduler( @@ -256,7 +255,7 @@ def tearDown(self): torch.cuda.empty_cache() def test_canny(self): - controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-S2.1-canny") + controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny") pipe = StableDiffusionControlNetXSPipeline.from_pretrained( "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet @@ -276,14 +275,12 @@ def test_canny(self): assert image.shape == (768, 512, 3) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy" - ) - - assert np.abs(expected_image - image).max() < 9e-2 + original_image = image[-3:, -3:, -1].flatten() + expected_image = np.array([0.1274, 0.1401, 0.147 , 0.1185, 0.1555, 0.1492, 0.1565, 0.1474, 0.1701]) + assert np.allclose(original_image, expected_image, atol=1e-04) def test_depth(self): - controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-S2.1-depth") + controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-depth") pipe = StableDiffusionControlNetXSPipeline.from_pretrained( "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet @@ -303,48 +300,16 @@ def test_depth(self): assert image.shape == (512, 512, 3) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" - ) - - assert np.abs(expected_image - image).max() < 8e-1 + original_image = image[-3:, -3:, -1].flatten() + expected_image = np.array([0.1098, 0.1025, 0.1211, 0.1129, 0.1165, 0.1262, 0.1185, 0.1261, 0.1703]) + assert np.allclose(original_image, expected_image, atol=1e-04) @require_python39_or_higher @require_torch_2 def test_stable_diffusion_compile(self): run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None) - def test_v11_shuffle_global_pool_conditions(self): - controlnet = ControlNetXSModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle") - - pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "New York" - image = load_image( - "https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/control.png" - ) - - output = pipe( - prompt, - image, - generator=generator, - output_type="np", - num_inference_steps=3, - guidance_scale=7.0, - ) - - image = output.images[0] - assert image.shape == (512, 640, 3) - - image_slice = image[-3:, -3:, -1] - expected_slice = np.array([0.1338, 0.1597, 0.1202, 0.1687, 0.1377, 0.1017, 0.2070, 0.1574, 
0.1348]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - + # todo umer def test_load_local(self): controlnet = ControlNetXSModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") pipe_1 = StableDiffusionControlNetXSPipeline.from_pretrained( From 0ba4b032852f90e5ea2f2c7527bff71e70c0c6ed Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 10:39:18 +0100 Subject: [PATCH 58/88] fixed tests --- src/diffusers/models/controlnetxs.py | 5 ++++- .../{test_controlnet.py => test_controlnetxs.py} | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) rename tests/pipelines/controlnetxs/{test_controlnet.py => test_controlnetxs.py} (99%) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 0165b71ca5e0..bec8e7d606df 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -437,7 +437,10 @@ def from_unet( # check that attention heads and group norms match channel sizes # - attention heads def attn_heads_match_channel_sizes(attn_heads, channel_sizes): - return all(c % a == 0 for a,c in zip(attn_heads,channel_sizes)) + if isinstance(attn_heads, list): + return all(c % a == 0 for a,c in zip(attn_heads,channel_sizes)) + else: + return all(c % attn_heads == 0 for c in channel_sizes) attention_head_dim = num_attention_heads or unet.config.attention_head_dim if not attn_heads_match_channel_sizes(attention_head_dim, block_out_channels): diff --git a/tests/pipelines/controlnetxs/test_controlnet.py b/tests/pipelines/controlnetxs/test_controlnetxs.py similarity index 99% rename from tests/pipelines/controlnetxs/test_controlnet.py rename to tests/pipelines/controlnetxs/test_controlnetxs.py index 54bd8cec355e..7af3cfa1734f 100644 --- a/tests/pipelines/controlnetxs/test_controlnet.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs.py @@ -139,6 +139,7 @@ def get_dummy_components(self, time_cond_proj_dim=None): learn_embedding=True, size_ratio=0.5, conditioning_block_sizes=(16, 32), + dim_attention_heads=2, ) torch.manual_seed(0) scheduler = DDIMScheduler( @@ -182,7 +183,6 @@ def get_dummy_components(self, time_cond_proj_dim=None): "tokenizer": tokenizer, "safety_checker": None, "feature_extractor": None, - "image_encoder": None, } return components From 412c772fae04f61ff3664f0bfb230a16789fc913 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 11:07:49 +0100 Subject: [PATCH 59/88] fixed tests --- src/diffusers/models/controlnetxs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index bec8e7d606df..dde462741f77 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -437,7 +437,7 @@ def from_unet( # check that attention heads and group norms match channel sizes # - attention heads def attn_heads_match_channel_sizes(attn_heads, channel_sizes): - if isinstance(attn_heads, list): + if isinstance(attn_heads, (tuple, list)): return all(c % a == 0 for a,c in zip(attn_heads,channel_sizes)) else: return all(c % attn_heads == 0 for c in channel_sizes) From 3776ce719e769828cc110ae1954485bd448b021b Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 11:11:13 +0100 Subject: [PATCH 60/88] slightly increased error tolerance for 1 test --- tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index fb52c12df7a1..2926896cf574 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -304,7 +304,7 @@ def test_stable_diffusion_xl_prompt_embeds(self): image_slice_2 = output.images[0, -3:, -3:, -1] # make sure that it's equal - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1.1e-4 # TODO Umer: Understand guess mode and enable this test if needed # def test_controlnet_sdxl_guess(self): From 17a12ce474028294a0f1d1ff0d2604f380cd14e7 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 11:25:22 +0100 Subject: [PATCH 61/88] make style & quality --- src/diffusers/models/controlnetxs.py | 46 +++++++++---------- src/diffusers/models/resnet.py | 2 +- .../controlnet_xs/pipeline_controlnet_xs.py | 3 +- .../pipeline_controlnet_xs_sd_xl.py | 1 + src/diffusers/schedulers/scheduling_ddim.py | 1 + src/diffusers/utils/dummy_pt_objects.py | 2 +- .../controlnetxs/test_controlnetxs.py | 4 +- .../controlnetxs/test_controlnetxs_sdxl.py | 2 +- 8 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index dde462741f77..fa5b06525c49 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -13,7 +13,6 @@ # limitations under the License. import math from dataclasses import dataclass -from itertools import zip_longest from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -40,6 +39,7 @@ ) from .unet_2d_condition import UNet2DConditionModel + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -287,19 +287,18 @@ def compute_block_out_channels(subblock_channels, layers_per_block): channels = [] for i, (_, subblock_out_channels) in enumerate(subblock_channels): # first subblock is the conv_in - if i==0: + if i == 0: continue # every block consists of `layers_per_block` resnet/attention subblocks and a down sample subblock - if i %(layers_per_block+1)==0: + if i % (layers_per_block + 1) == 0: channels.append(subblock_out_channels) # the last block doesn't have a down conv, so is handled separately - if i==len(subblock_channels)-1: + if i == len(subblock_channels) - 1: channels.append(subblock_out_channels) return channels base_block_out_channels = compute_block_out_channels( - subblock_channels=base_model_channel_sizes["down"], - layers_per_block=layers_per_block + subblock_channels=base_model_channel_sizes["down"], layers_per_block=layers_per_block ) extra_channels = list( @@ -438,7 +437,7 @@ def from_unet( # - attention heads def attn_heads_match_channel_sizes(attn_heads, channel_sizes): if isinstance(attn_heads, (tuple, list)): - return all(c % a == 0 for a,c in zip(attn_heads,channel_sizes)) + return all(c % a == 0 for a, c in zip(attn_heads, channel_sizes)) else: return all(c % attn_heads == 0 for c in channel_sizes) @@ -738,8 +737,8 @@ def forward( h_ctrl = self.control_model.conv_in(h_ctrl) if guided_hint is not None: h_ctrl += guided_hint - h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base - + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base + hs_base.append(h_base) hs_ctrl.append(h_ctrl) @@ -748,7 +747,7 @@ 
def forward( h_ctrl = torch.cat([h_ctrl, next(it_down_convs_in)(h_base)], dim=1) # A - concat base -> ctrl h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock - h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base + h_base = h_base + next(it_down_convs_out)(h_ctrl) * next(scales) # D - add ctrl -> base hs_base.append(h_base) hs_ctrl.append(h_ctrl) @@ -758,7 +757,7 @@ def forward( h_base = m_base(h_base, temb, cemb, attention_mask, cross_attention_kwargs) # B - apply base subblock h_ctrl = m_ctrl(h_ctrl, temb, cemb, attention_mask, cross_attention_kwargs) # C - apply ctrl subblock h_base = h_base + self.middle_block_out(h_ctrl) * next(scales) # D - add ctrl -> base - + # 3 - up for i, m_base in enumerate(base_up_subblocks): h_base = h_base + next(it_up_convs_out)(hs_ctrl.pop()) * next(scales) # add info from ctrl encoder @@ -912,11 +911,11 @@ def increase_block_input_in_mid_resnet(unet: UNet2DConditionModel, by): unet.mid_block.resnets[0].in_channels += by # surgery done here -def adjust_group_norms(unet: UNet2DConditionModel, max_num_group:int=32): +def adjust_group_norms(unet: UNet2DConditionModel, max_num_group: int = 32): def find_denominator(number, start): if start >= number: return number - while (start != 0): + while start != 0: residual = number % start if residual == 0: return start @@ -926,16 +925,17 @@ def find_denominator(number, start): # resnets for r in block.resnets: if r.norm1.num_groups < max_num_group: - r.norm1.num_groups = find_denominator(r.norm1.num_channels ,start=32) - + r.norm1.num_groups = find_denominator(r.norm1.num_channels, start=32) + if r.norm2.num_groups < max_num_group: - r.norm2.num_groups = find_denominator(r.norm2.num_channels ,start=32) + r.norm2.num_groups = find_denominator(r.norm2.num_channels, start=32) # transformers - if hasattr(block, 'attentions'): + if hasattr(block, "attentions"): for a in block.attentions: if a.norm.num_groups < max_num_group: - a.norm.num_groups = find_denominator(a.norm.num_channels ,start=32) + a.norm.num_groups = find_denominator(a.norm.num_channels, start=32) + def is_iterable(o): if isinstance(o, str): @@ -956,16 +956,16 @@ def to_sub_blocks(blocks): for b in blocks: if hasattr(b, "resnets"): if hasattr(b, "attentions") and b.attentions is not None: - for r,a in zip(b.resnets, b.attentions): - sub_blocks.append([r,a]) + for r, a in zip(b.resnets, b.attentions): + sub_blocks.append([r, a]) num_resnets = len(b.resnets) num_attns = len(b.attentions) - + if num_resnets > num_attns: # we can have more resnets than attentions, so add each resnet as separate subblock for i in range(num_attns, num_resnets): - sub_blocks.append([b.resnets[i]]) + sub_blocks.append([b.resnets[i]]) else: for r in b.resnets: sub_blocks.append([r]) @@ -974,7 +974,7 @@ def to_sub_blocks(blocks): if hasattr(b, "upsamplers") and b.upsamplers is not None: for u in b.upsamplers: sub_blocks[-1].extend([u]) - + # downsamplers are own subblock if hasattr(b, "downsamplers") and b.downsamplers is not None: for d in b.downsamplers: diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index dba19854349a..555dfac92451 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -702,7 +702,7 @@ def forward( hidden_states = self.norm1(hidden_states, temb) else: hidden_states = self.norm1(hidden_states) - + hidden_states = 
self.nonlinearity(hidden_states) if self.upsample is not None: diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index adaf08d75828..721f94f6ba4e 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -13,7 +13,7 @@ # limitations under the License. import inspect -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL.Image @@ -124,6 +124,7 @@ class StableDiffusionControlNetXSPipeline( feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. """ + model_cpu_offload_seq = "text_encoder->unet->vae>controlnet" _optional_components = ["safety_checker", "feature_extractor"] _exclude_from_cpu_offload = ["safety_checker"] diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 5ec506f7d657..933a43a1c24e 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -132,6 +132,7 @@ class StableDiffusionXLControlNetXSPipeline( watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no watermarker is used. """ + # leave controlnet out on purpose because it iterates with unet model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae->controlnet" _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 7ffa43a5848f..d325cde7d9d4 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -27,6 +27,7 @@ from ..utils.torch_utils import randn_tensor from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + @dataclass # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM class DDIMSchedulerOutput(BaseOutput): diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index bf37947148ea..cf7b5f431ce6 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -90,7 +90,7 @@ def from_config(cls, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) - + class Kandinsky3UNet(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/pipelines/controlnetxs/test_controlnetxs.py b/tests/pipelines/controlnetxs/test_controlnetxs.py index 7af3cfa1734f..2943008e5f8f 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs.py @@ -14,7 +14,6 @@ # limitations under the License. 
import gc -import tempfile import traceback import unittest @@ -26,7 +25,6 @@ AutoencoderKL, ControlNetXSModel, DDIMScheduler, - EulerDiscreteScheduler, LCMScheduler, StableDiffusionControlNetXSPipeline, UNet2DConditionModel, @@ -276,7 +274,7 @@ def test_canny(self): assert image.shape == (768, 512, 3) original_image = image[-3:, -3:, -1].flatten() - expected_image = np.array([0.1274, 0.1401, 0.147 , 0.1185, 0.1555, 0.1492, 0.1565, 0.1474, 0.1701]) + expected_image = np.array([0.1274, 0.1401, 0.147, 0.1185, 0.1555, 0.1492, 0.1565, 0.1474, 0.1701]) assert np.allclose(original_image, expected_image, atol=1e-04) def test_depth(self): diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index 2926896cf574..d6eb9c3d524c 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -381,5 +381,5 @@ def test_depth(self): assert images[0].shape == (512, 512, 3) original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.4411, 0.3617, 0.2654, 0.266 , 0.3449, 0.3898, 0.3745, 0.353 , 0.326]) + expected_image = np.array([0.4411, 0.3617, 0.2654, 0.266, 0.3449, 0.3898, 0.3745, 0.353, 0.326]) assert np.allclose(original_image, expected_image, atol=1e-04) From a884e8774191934d435c00bdb9d51306126d0bbb Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 11:48:25 +0100 Subject: [PATCH 62/88] Added docs for CNXS-SD --- docs/source/en/_toctree.yml | 2 + docs/source/en/api/pipelines/controlnetxs.md | 33 +++++++++++++++++ .../en/api/pipelines/controlnetxs_sdxl.md | 2 + .../controlnet_xs/pipeline_controlnet_xs.py | 37 +++++++++---------- 4 files changed, 54 insertions(+), 20 deletions(-) create mode 100644 docs/source/en/api/pipelines/controlnetxs.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index ba928916860f..710fe6f032e2 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -258,6 +258,8 @@ title: ControlNet - local: api/pipelines/controlnet_sdxl title: ControlNet with Stable Diffusion XL + - local: api/pipelines/controlnetxs + title: ControlNet-XS - local: api/pipelines/controlnetxs_sdxl title: ControlNet-XS with Stable Diffusion XL - local: api/pipelines/cycle_diffusion diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/docs/source/en/api/pipelines/controlnetxs.md new file mode 100644 index 000000000000..552d5c03a69a --- /dev/null +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -0,0 +1,33 @@ + + +# ControlNet with Stable Diffusion + +ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produces good results. + +As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. + +This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). 
❤️ + + + +Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. + + + +## StableDiffusionControlNetXSPipeline +[[autodoc]] StableDiffusionControlNetXSPipeline + - all + - __call__ + +## StableDiffusionPipelineOutput +[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md index eb89fec724ca..2e1381667180 100644 --- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -16,6 +16,8 @@ ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/Contr As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. +This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ + 🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve! diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 721f94f6ba4e..182168011777 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -43,12 +43,11 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# TODO umer EXAMPLE_DOC_STRING = """ Examples: ```py >>> # !pip install opencv-python transformers accelerate - >>> from diffusers import StableDiffusionControlNetXSPipeline, ControlNetXSModel, UniPCMultistepScheduler + >>> from diffusers import StableDiffusionControlNetXSPipeline, ControlNetXSModel, AutoencoderKL >>> from diffusers.utils import load_image >>> import numpy as np >>> import torch @@ -56,35 +55,33 @@ >>> import cv2 >>> from PIL import Image + >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" + >>> negative_prompt = "low quality, bad quality, sketches" + >>> # download an image >>> image = load_image( - ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" + ... "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" ... ) - >>> image = np.array(image) + + >>> # initialize the models and pipeline + >>> controlnet_conditioning_scale = 0.5 + >>> controlnet = ControlNetXSModel.from_pretrained( + ... "UmerHA/ConrolNetXS-SD2.1-canny", torch_dtype=torch.float32 + ... ) + >>> pipe = StableDiffusionControlNetXSPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float32 + ... 
) + >>> pipe.enable_model_cpu_offload() >>> # get canny image + >>> image = np.array(image) >>> image = cv2.Canny(image, 100, 200) >>> image = image[:, :, None] >>> image = np.concatenate([image, image, image], axis=2) >>> canny_image = Image.fromarray(image) - - >>> # load control net and stable diffusion v1-5 - >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) - >>> pipe = StableDiffusionControlNetPipeline.from_pretrained( - ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 - ... ) - - >>> # speed up diffusion process with faster scheduler and memory optimization - >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) - >>> # remove following line if xformers is not installed - >>> pipe.enable_xformers_memory_efficient_attention() - - >>> pipe.enable_model_cpu_offload() - >>> # generate image - >>> generator = torch.manual_seed(0) >>> image = pipe( - ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image + ... prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image ... ).images[0] ``` """ From e6dc4730ac4ab2581f37dcbe168153a29522adf4 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 11:58:25 +0100 Subject: [PATCH 63/88] make fix-copies --- .../utils/dummy_torch_and_transformers_objects.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 8c2ef0785c6b..f82f4fb5e102 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -737,6 +737,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class StableDiffusionControlNetXSPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionDepth2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From b5982735e0560956351f1d71ad01305448ddb426 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 12:31:29 +0100 Subject: [PATCH 64/88] Fixed sd compile test ; fixed gradient ckpointing --- src/diffusers/models/controlnetxs.py | 15 +++---- .../controlnet_xs/pipeline_controlnet_xs.py | 2 +- .../controlnetxs/test_controlnetxs.py | 43 +------------------ 3 files changed, 10 insertions(+), 50 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index fa5b06525c49..a3e231f79344 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -60,7 +60,6 @@ class ControlNetXSOutput(BaseOutput): # todo umer: assert in pipe that conditioning_block_sizes matches vae downblocks -# todo umer: add sth like FromOriginalControlnetMixin class ControlNetXSModel(ModelMixin, ConfigMixin): r""" A ControlNet-XS model @@ -92,10 +91,9 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): Channel sizes of each subblock of base model. 
Use `gather_subblock_sizes` on your base model to compute it. """ - # to delete later @classmethod - def create_as_in_paper(cls, base_model: UNet2DConditionModel, sdxl=True): - if sdxl: + def create_as_in_original_paper(cls, base_model: UNet2DConditionModel, is_sdxl=True): + if is_sdxl: return ControlNetXSModel.from_unet( base_model, time_embedding_mix=0.95, @@ -555,11 +553,12 @@ def set_attention_slice(self, slice_size): """ self.control_model.set_attention_slice(slice_size) - # todo umer: understand & either remove or adapt - # Copied from diffusers.models.controlnet.ControlNetModel._set_gradient_checkpointing def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): - module.gradient_checkpointing = value + if isinstance(module, (UNet2DConditionModel)): + if value: + module.enable_gradient_checkpointing() + else: + module.disable_gradient_checkpointing() def forward( self, diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 182168011777..10976e57eca6 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -47,7 +47,7 @@ Examples: ```py >>> # !pip install opencv-python transformers accelerate - >>> from diffusers import StableDiffusionControlNetXSPipeline, ControlNetXSModel, AutoencoderKL + >>> from diffusers import StableDiffusionControlNetXSPipeline, ControlNetXSModel >>> from diffusers.utils import load_image >>> import numpy as np >>> import torch diff --git a/tests/pipelines/controlnetxs/test_controlnetxs.py b/tests/pipelines/controlnetxs/test_controlnetxs.py index 2943008e5f8f..721956cf23b9 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs.py @@ -59,18 +59,16 @@ enable_full_determinism() -# todo umer: understand & adapt -# -- my understanding is that cnxs can't be compiled because it's not a "full" model. it needs a base model to function. is this correct? 
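For context on the helper renamed above (`create_as_in_paper` to `create_as_in_original_paper`): together with `from_unet` it derives a control model directly from an existing base UNet. A rough sketch of both entry points (not part of the patch; parameter values are illustrative and the SD-2.1 checkpoint layout is assumed):

```py
from diffusers import ControlNetXSModel, UNet2DConditionModel

# Base UNet to derive the control model from (SD-2.1 layout assumed).
base_unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="unet"
)

# Mirror the configuration of the original ControlNet-XS paper ...
controlnet = ControlNetXSModel.create_as_in_original_paper(base_unet, is_sdxl=False)

# ... or size the control model explicitly, as the fast tests above do.
controlnet = ControlNetXSModel.from_unet(
    base_unet,
    time_embedding_mix=0.95,
    learn_embedding=True,
    size_ratio=0.5,
    conditioning_block_sizes=(16, 32, 96, 256),
)
```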
# Will be run via run_test_in_subprocess def _test_stable_diffusion_compile(in_queue, out_queue, timeout): error = None try: _ = in_queue.get(timeout=timeout) - controlnet = ControlNetXSModel.from_pretrained("lllyasviel/sd-controlnet-canny") + controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny") pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet ) pipe.to("cuda") pipe.set_progress_bar_config(disable=None) @@ -306,40 +304,3 @@ def test_depth(self): @require_torch_2 def test_stable_diffusion_compile(self): run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None) - - # todo umer - def test_load_local(self): - controlnet = ControlNetXSModel.from_pretrained("lllyasviel/control_v11p_sd15_canny") - pipe_1 = StableDiffusionControlNetXSPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - - controlnet = ControlNetXSModel.from_single_file( - "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" - ) - pipe_2 = StableDiffusionControlNetXSPipeline.from_single_file( - "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", - safety_checker=None, - controlnet=controlnet, - ) - pipes = [pipe_1, pipe_2] - images = [] - - for pipe in pipes: - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - images.append(output.images[0]) - - del pipe - gc.collect() - torch.cuda.empty_cache() - - assert np.abs(images[0] - images[1]).max() < 1e-3 From b75e6233461c91954f44d0d1ccd1683ee66bf9a6 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 13:31:17 +0100 Subject: [PATCH 65/88] vae downs = cnxs conditioning downs; removed guess --- .../controlnet_xs/pipeline_controlnet_xs.py | 13 +++++----- .../pipeline_controlnet_xs_sd_xl.py | 15 ++++++------ .../controlnetxs/test_controlnetxs_sdxl.py | 24 ------------------- 3 files changed, 13 insertions(+), 39 deletions(-) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 10976e57eca6..ee787397ec29 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -156,6 +156,11 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) + num_vae_down_blocks = len(vae.encoder.down_blocks) + num_controlnet_conditioning_down_blocks = len(controlnet.config.conditioning_block_sizes) + if num_vae_down_blocks != num_controlnet_conditioning_down_blocks: + raise ValueError(f"The number of down blocks in the VAE ({num_vae_down_blocks}) and the conditioning part of ControlNetXS model {num_controlnet_conditioning_down_blocks} need to be equal. 
Consider building the ControlNetXS model with different `conditioning_block_sizes`.") + self.register_modules( vae=vae, text_encoder=text_encoder, @@ -593,7 +598,6 @@ def prepare_image( device, dtype, do_classifier_free_guidance=False, - guess_mode=False, ): image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) image_batch_size = image.shape[0] @@ -608,7 +612,7 @@ def prepare_image( image = image.to(device=device, dtype=dtype) - if do_classifier_free_guidance and not guess_mode: + if do_classifier_free_guidance: image = torch.cat([image] * 2) return image @@ -682,7 +686,6 @@ def __call__( callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - guess_mode: bool = False, control_guidance_start: float = 0.0, control_guidance_end: float = 1.0, clip_skip: Optional[int] = None, @@ -750,9 +753,6 @@ def __call__( The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set the corresponding scale as a list. - guess_mode (`bool`, *optional*, defaults to `False`): - The ControlNet encoder tries to recognize the content of the input image even if you remove all - prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): The percentage of total steps at which the ControlNet starts applying. control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): @@ -831,7 +831,6 @@ def __call__( device=device, dtype=controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) height, width = image.shape[-2:] else: diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 933a43a1c24e..85a5b81f1831 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -152,6 +152,12 @@ def __init__( ): super().__init__() + num_vae_down_blocks = len(vae.encoder.down_blocks) + num_controlnet_conditioning_down_blocks = len(controlnet.config.conditioning_block_sizes) + if num_vae_down_blocks != num_controlnet_conditioning_down_blocks: + raise ValueError(f"The number of down blocks in the VAE ({num_vae_down_blocks}) and the conditioning part of ControlNetXS model {num_controlnet_conditioning_down_blocks} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`.") + + self.register_modules( vae=vae, text_encoder=text_encoder, @@ -606,7 +612,6 @@ def check_image(self, image, prompt, prompt_embeds): f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" ) - # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image def prepare_image( self, image, @@ -617,7 +622,6 @@ def prepare_image( device, dtype, do_classifier_free_guidance=False, - guess_mode=False, ): image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) image_batch_size = image.shape[0] @@ -632,7 +636,7 @@ def prepare_image( image = image.to(device=device, dtype=dtype) - if do_classifier_free_guidance and not guess_mode: + if do_classifier_free_guidance: image = torch.cat([image] * 2) return image @@ -749,7 +753,6 @@ def __call__( callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, - guess_mode: bool = False, control_guidance_start: float = 0.0, control_guidance_end: float = 1.0, original_size: Tuple[int, int] = None, @@ -839,9 +842,6 @@ def __call__( controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added to the residual in the original `unet`. - guess_mode (`bool`, *optional*, defaults to `False`): - The ControlNet encoder tries to recognize the content of the input image even if you remove all - prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. control_guidance_start (`float`, *optional*, defaults to 0.0): The percentage of total steps at which the ControlNet starts applying. control_guidance_end (`float`, *optional*, defaults to 1.0): @@ -955,7 +955,6 @@ def __call__( device=device, dtype=controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) height, width = image.shape[-2:] else: diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index d6eb9c3d524c..e61b087186b2 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -306,30 +306,6 @@ def test_stable_diffusion_xl_prompt_embeds(self): # make sure that it's equal assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1.1e-4 - # TODO Umer: Understand guess mode and enable this test if needed - # def test_controlnet_sdxl_guess(self): - # device = "cpu" - - # components = self.get_dummy_components() - - # sd_pipe = self.pipeline_class(**components) - # sd_pipe = sd_pipe.to(device) - - # sd_pipe.set_progress_bar_config(disable=None) - - # inputs = self.get_dummy_inputs(device) - # inputs["guess_mode"] = True - - # output = sd_pipe(**inputs) - # image_slice = output.images[0, -3:, -3:, -1] - # expected_slice = np.array( - # [0.7330834, 0.590667, 0.5667336, 0.6029023, 0.5679491, 0.5968194, 0.4032986, 0.47612396, 0.5089609] - # ) - - # # make sure that it's equal - # assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4 - - @slow @require_torch_gpu class ControlNetSDXLPipelineXSSlowTests(unittest.TestCase): From 0f2a05f929feb8e47a34217f36a11283635b141a Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 13:33:49 +0100 Subject: [PATCH 66/88] make style & quality --- .../pipelines/controlnet_xs/pipeline_controlnet_xs.py | 4 +++- .../pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py | 5 +++-- 
tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index ee787397ec29..4c775a543cf7 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -159,7 +159,9 @@ def __init__( num_vae_down_blocks = len(vae.encoder.down_blocks) num_controlnet_conditioning_down_blocks = len(controlnet.config.conditioning_block_sizes) if num_vae_down_blocks != num_controlnet_conditioning_down_blocks: - raise ValueError(f"The number of down blocks in the VAE ({num_vae_down_blocks}) and the conditioning part of ControlNetXS model {num_controlnet_conditioning_down_blocks} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`.") + raise ValueError( + f"The number of down blocks in the VAE ({num_vae_down_blocks}) and the conditioning part of ControlNetXS model {num_controlnet_conditioning_down_blocks} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`." + ) self.register_modules( vae=vae, diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 85a5b81f1831..6a8d56dce39a 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -155,8 +155,9 @@ def __init__( num_vae_down_blocks = len(vae.encoder.down_blocks) num_controlnet_conditioning_down_blocks = len(controlnet.config.conditioning_block_sizes) if num_vae_down_blocks != num_controlnet_conditioning_down_blocks: - raise ValueError(f"The number of down blocks in the VAE ({num_vae_down_blocks}) and the conditioning part of ControlNetXS model {num_controlnet_conditioning_down_blocks} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`.") - + raise ValueError( + f"The number of down blocks in the VAE ({num_vae_down_blocks}) and the conditioning part of ControlNetXS model {num_controlnet_conditioning_down_blocks} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`." 
+ ) self.register_modules( vae=vae, diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index e61b087186b2..5248a031bb9a 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -306,6 +306,7 @@ def test_stable_diffusion_xl_prompt_embeds(self): # make sure that it's equal assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1.1e-4 + @slow @require_torch_gpu class ControlNetSDXLPipelineXSSlowTests(unittest.TestCase): From f59231c5f67f1f38ecb10e9800f525d8bc659b95 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 14:39:08 +0100 Subject: [PATCH 67/88] Fixed tests --- src/diffusers/models/controlnetxs.py | 33 +++++++++++++++---- .../controlnet_xs/pipeline_controlnet_xs.py | 9 ++--- .../pipeline_controlnet_xs_sd_xl.py | 9 ++--- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index a3e231f79344..ecad555efbc6 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -25,6 +25,7 @@ from .attention_processor import ( AttentionProcessor, ) +from .autoencoder_kl import AutoencoderKL from .lora import LoRACompatibleConv from .modeling_utils import ModelMixin from .unet_2d_blocks import ( @@ -57,9 +58,6 @@ class ControlNetXSOutput(BaseOutput): sample: torch.FloatTensor = None -# todo umer: assert in pipe that conditioning_block_sizes matches vae downblocks - - class ControlNetXSModel(ModelMixin, ConfigMixin): r""" A ControlNet-XS model @@ -573,7 +571,6 @@ def forward( attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, - guess_mode: bool = False, # todo umer: understand and implement if required return_dict: bool = True, ) -> Union[ControlNetXSOutput, Tuple]: """ @@ -606,8 +603,6 @@ def forward( Additional conditions for the Stable Diffusion XL UNet. cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): A kwargs dictionary that if specified is passed along to the `AttnProcessor`. - # guess_mode (`bool`, defaults to `False`): - # todo umer return_dict (`bool`, defaults to `True`): Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple. @@ -779,6 +774,32 @@ def make_zero_conv(self, in_channels, out_channels=None): return zero_module(nn.Conv2d(in_channels, out_channels, 1, padding=0)) + @torch.no_grad() + def _check_if_vae_compatible(self, vae: AutoencoderKL): + condition_downscale_factor = 2 ** (len(self.config.conditioning_block_sizes) - 1) + + # Multiply by 2, as otherwise we have channel with sizes = 1 after vae encoding, which confuses PyTorch. + # Alternativy, we could set the vae to eval mode. 
+ in_size = condition_downscale_factor * 2 + + rand_tensor = torch.rand((1, 3, in_size, in_size)).to(vae.device) + + encoded_tensor = vae.encode(rand_tensor) + if hasattr(encoded_tensor, "latent_dist"): + encoded_tensor = encoded_tensor.latent_dist.sample() + elif hasattr(encoded_tensor, "latents"): + encoded_tensor = encoded_tensor.latents + else: + raise ValueError(f"Output of {type(vae)} has neither `latents` nor `latent_dist` as attribute.") + + out_size = encoded_tensor.shape[-1] + + vae_downscale_factor = in_size / out_size + + compatible = condition_downscale_factor == vae_downscale_factor + + return compatible, condition_downscale_factor, vae_downscale_factor + class EmbedSequential(nn.ModuleList): """Sequential module passing embeddings (time and conditioning) to children if they support it.""" diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 4c775a543cf7..6a1f1897f954 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -156,11 +156,12 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - num_vae_down_blocks = len(vae.encoder.down_blocks) - num_controlnet_conditioning_down_blocks = len(controlnet.config.conditioning_block_sizes) - if num_vae_down_blocks != num_controlnet_conditioning_down_blocks: + vae_compatible, cnxs_condition_downsample_factor, vae_downsample_factor = controlnet._check_if_vae_compatible( + vae + ) + if not vae_compatible: raise ValueError( - f"The number of down blocks in the VAE ({num_vae_down_blocks}) and the conditioning part of ControlNetXS model {num_controlnet_conditioning_down_blocks} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`." + f"The downsampling factors of the VAE ({vae_downsample_factor}) and the conditioning part of ControlNetXS model {cnxs_condition_downsample_factor} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`." ) self.register_modules( diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py index 6a8d56dce39a..a04cdbb05dda 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py @@ -152,11 +152,12 @@ def __init__( ): super().__init__() - num_vae_down_blocks = len(vae.encoder.down_blocks) - num_controlnet_conditioning_down_blocks = len(controlnet.config.conditioning_block_sizes) - if num_vae_down_blocks != num_controlnet_conditioning_down_blocks: + vae_compatible, cnxs_condition_downsample_factor, vae_downsample_factor = controlnet._check_if_vae_compatible( + vae + ) + if not vae_compatible: raise ValueError( - f"The number of down blocks in the VAE ({num_vae_down_blocks}) and the conditioning part of ControlNetXS model {num_controlnet_conditioning_down_blocks} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`." + f"The downsampling factors of the VAE ({vae_downsample_factor}) and the conditioning part of ControlNetXS model {cnxs_condition_downsample_factor} need to be equal. Consider building the ControlNetXS model with different `conditioning_block_sizes`." 
) self.register_modules( From 165a3588d8665ef6f5515908b8015fd6de8589cd Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 28 Nov 2023 14:42:39 +0100 Subject: [PATCH 68/88] fixed test --- src/diffusers/models/controlnetxs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index ecad555efbc6..8fb761d4fcb5 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -782,7 +782,7 @@ def _check_if_vae_compatible(self, vae: AutoencoderKL): # Alternativy, we could set the vae to eval mode. in_size = condition_downscale_factor * 2 - rand_tensor = torch.rand((1, 3, in_size, in_size)).to(vae.device) + rand_tensor = torch.rand((1, 3, in_size, in_size)).to(vae.device, dtype=vae.dtype) encoded_tensor = vae.encode(rand_tensor) if hasattr(encoded_tensor, "latent_dist"): From a2d5a52c815adce9e6bc422a19ffb9e9c0da7564 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 30 Nov 2023 17:33:08 +0100 Subject: [PATCH 69/88] Incorporated review feedback --- docs/source/en/api/pipelines/controlnetxs.md | 6 +- .../en/api/pipelines/controlnetxs_sdxl.md | 6 +- src/diffusers/models/attention.py | 1 - src/diffusers/models/controlnetxs.py | 262 ++++++++---------- src/diffusers/models/transformer_2d.py | 1 - .../controlnet_xs/pipeline_controlnet_xs.py | 4 +- .../pipeline_stable_diffusion_xl.py | 2 - .../schedulers/scheduling_euler_discrete.py | 3 - 8 files changed, 131 insertions(+), 154 deletions(-) diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/docs/source/en/api/pipelines/controlnetxs.md index 552d5c03a69a..6abf439246c1 100644 --- a/docs/source/en/api/pipelines/controlnetxs.md +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -10,12 +10,16 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# ControlNet with Stable Diffusion +# ControlNet-XS with Stable Diffusion ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produces good results. As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. +Here's the overview from the [project page](https://vislearn.github.io/ControlNet-XS/): + +*With increasing computing capabilities, current model architectures appear to follow the trend of simply upscaling all components without validating the necessity for doing so. In this project we investigate the size and architectural design of ControlNet [Zhang et al., 2023] for controlling the image generation process with stable diffusion-based models. We show that a new architecture with as little as 1% of the parameters of the base model achieves state-of-the art results, considerably better than ControlNet in terms of FID score. Hence we call it ControlNet-XS. 
We provide the code for controlling StableDiffusion-XL [Podell et al., 2023] (Model B, 48M Parameters) and StableDiffusion 2.1 [Rombach et al. 2022] (Model B, 14M Parameters), all under openrail license.* + This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md index 2e1381667180..0a9d2a506dc9 100644 --- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -10,12 +10,16 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# ControlNet with Stable Diffusion XL +# ControlNet-XS with Stable Diffusion XL ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produces good results. As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. +Here's the overview from the [project page](https://vislearn.github.io/ControlNet-XS/): + +*With increasing computing capabilities, current model architectures appear to follow the trend of simply upscaling all components without validating the necessity for doing so. In this project we investigate the size and architectural design of ControlNet [Zhang et al., 2023] for controlling the image generation process with stable diffusion-based models. We show that a new architecture with as little as 1% of the parameters of the base model achieves state-of-the art results, considerably better than ControlNet in terms of FID score. Hence we call it ControlNet-XS. We provide the code for controlling StableDiffusion-XL [Podell et al., 2023] (Model B, 48M Parameters) and StableDiffusion 2.1 [Rombach et al. 2022] (Model B, 14M Parameters), all under openrail license.* + This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 132aee92c5c8..0c4c5de6e31a 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -261,7 +261,6 @@ def forward( attention_mask=attention_mask, **cross_attention_kwargs, ) - if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output elif self.use_ada_layer_norm_single: diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 8fb761d4fcb5..70e393e7a485 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -57,6 +57,51 @@ class ControlNetXSOutput(BaseOutput): sample: torch.FloatTensor = None +# copied from diffusers.models.controlnet.ControlNetConditioningEmbedding +class ControlNetConditioningEmbedding(nn.Module): + """ + Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN + [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized + training. 
This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the + convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides + (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full + model) to encode image-space conditions ... into feature maps ..." + """ + + def __init__( + self, + conditioning_embedding_channels: int, + conditioning_channels: int = 3, + block_out_channels: Tuple[int, ...] = (16, 32, 96, 256), + ): + super().__init__() + + self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + + self.blocks = nn.ModuleList([]) + + for i in range(len(block_out_channels) - 1): + channel_in = block_out_channels[i] + channel_out = block_out_channels[i + 1] + self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) + + self.conv_out = zero_module( + nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1) + ) + + def forward(self, conditioning): + embedding = self.conv_in(conditioning) + embedding = F.silu(embedding) + + for block in self.blocks: + embedding = block(embedding) + embedding = F.silu(embedding) + + embedding = self.conv_out(embedding) + + return embedding + class ControlNetXSModel(ModelMixin, ConfigMixin): r""" @@ -73,8 +118,8 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): Number of channels of conditioning input (e.g. an image) controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): The channel order of conditional image. Will convert to `rgb` if it's `bgr`. - conditioning_block_sizes (`Tuple[int]`, defaults to `(16,32,96,256))`): - TODO + conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`): + The tuple of output channel for each block in the `controlnet_cond_embedding` layer. time_embedding_input_dim (`int`, defaults to 320): Dimension of input into time embedding. Needs to be same as in the base model. time_embedding_dim (`int`, defaults to 1280): @@ -90,7 +135,16 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): """ @classmethod - def create_as_in_original_paper(cls, base_model: UNet2DConditionModel, is_sdxl=True): + def init_original(cls, base_model: UNet2DConditionModel, is_sdxl=True): + """ + Create a ControlNetXS model with the same parameters as in the original paper (https://github.com/vislearn/ControlNet-XS). + + Parameters: + base_model (`UNet2DConditionModel`): + Base unet model. Needs to be either StableDiffusion or StableDiffusion-XL. + is_sdxl (`bool`, defaults to `True`): + Wether passed `base_model` is a StableDiffusion-XL model. 
+ """ if is_sdxl: return ControlNetXSModel.from_unet( base_model, @@ -98,7 +152,7 @@ def create_as_in_original_paper(cls, base_model: UNet2DConditionModel, is_sdxl=T learn_embedding=True, size_ratio=0.1, dim_attention_heads=64, - conditioning_block_sizes=(16, 32, 96, 256), + conditioning_embedding_out_channels=(16, 32, 96, 256), ) else: return ControlNetXSModel.from_unet( @@ -107,11 +161,20 @@ def create_as_in_original_paper(cls, base_model: UNet2DConditionModel, is_sdxl=T learn_embedding=True, size_ratio=0.0125, dim_attention_heads=8, - conditioning_block_sizes=(16, 32, 96, 256), + conditioning_embedding_out_channels=(16, 32, 96, 256), ) @classmethod - def gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control): + def _gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control: str): + """To create correctly sized connections between base and control model, we need to know + the input and output channels of each subblock. + + Parameters: + unet (`UNet2DConditionModel`): + Unet of which the subblock channels sizes are to be gathered. + base_or_control (`str`): + Needs to be either "base" or "control". If "base", decoder is also considered. + """ if base_or_control not in ["base", "control"]: raise ValueError("`base_or_control` needs to be either `base` or `control`") @@ -152,7 +215,7 @@ def gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control): def __init__( self, conditioning_channels: int = 3, - conditioning_block_sizes: Tuple[int] = (16, 32, 96, 256), + conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256), controlnet_conditioning_channel_order: str = "rgb", time_embedding_input_dim: int = 320, time_embedding_dim: int = 1280, @@ -184,94 +247,38 @@ def __init__( ], }, sample_size: Optional[int] = None, - in_channels: int = 4, - out_channels: int = 4, - center_input_sample: bool = False, down_block_types: Tuple[str] = ( "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D", ), - mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), - only_cross_attention: Union[bool, Tuple[bool]] = False, block_out_channels: Tuple[int] = (320, 640, 1280, 1280), - layers_per_block: Union[int, Tuple[int]] = 2, - downsample_padding: int = 1, - mid_block_scale_factor: float = 1, - dropout: float = 0.0, - act_fn: str = "silu", norm_num_groups: Optional[int] = 32, - norm_eps: float = 1e-5, cross_attention_dim: Union[int, Tuple[int]] = 1280, transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, - reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None, - encoder_hid_dim: Optional[int] = None, - encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, num_attention_heads: Optional[Union[int, Tuple[int]]] = None, - dual_cross_attention: bool = False, use_linear_projection: bool = False, upcast_attention: bool = False, - resnet_time_scale_shift: str = "default", - resnet_skip_time_act: bool = False, - resnet_out_scale_factor: int = 1.0, - time_embedding_type: str = "positional", - time_embedding_act_fn: Optional[str] = None, - timestep_post_act: Optional[str] = None, - time_cond_proj_dim: Optional[int] = None, - conv_in_kernel: int = 3, - conv_out_kernel: int = 3, - attention_type: str = "default", - mid_block_only_cross_attention: Optional[bool] = None, - cross_attention_norm: Optional[str] = None, - addition_embed_type_num_heads=64, ): 
super().__init__() # 1 - Create control unet self.control_model = UNet2DConditionModel( sample_size=sample_size, - in_channels=in_channels, - out_channels=out_channels, - center_input_sample=center_input_sample, down_block_types=down_block_types, - mid_block_type=mid_block_type, up_block_types=up_block_types, - only_cross_attention=only_cross_attention, block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - downsample_padding=downsample_padding, - mid_block_scale_factor=mid_block_scale_factor, - dropout=dropout, - act_fn=act_fn, norm_num_groups=norm_num_groups, - norm_eps=norm_eps, cross_attention_dim=cross_attention_dim, transformer_layers_per_block=transformer_layers_per_block, - reverse_transformer_layers_per_block=reverse_transformer_layers_per_block, - encoder_hid_dim=encoder_hid_dim, - encoder_hid_dim_type=encoder_hid_dim_type, attention_head_dim=attention_head_dim, num_attention_heads=num_attention_heads, - dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, - resnet_skip_time_act=resnet_skip_time_act, - resnet_out_scale_factor=resnet_out_scale_factor, - time_embedding_type=time_embedding_type, time_embedding_dim=time_embedding_dim, - time_embedding_act_fn=time_embedding_act_fn, - timestep_post_act=timestep_post_act, - time_cond_proj_dim=time_cond_proj_dim, - conv_in_kernel=conv_in_kernel, - conv_out_kernel=conv_out_kernel, - attention_type=attention_type, - mid_block_only_cross_attention=mid_block_only_cross_attention, - cross_attention_norm=cross_attention_norm, - addition_embed_type_num_heads=addition_embed_type_num_heads, ) # 2 - Do model surgery on control model @@ -279,6 +286,7 @@ def __init__( adjust_time_dims(self.control_model, time_embedding_input_dim, time_embedding_dim) # 2.2 - Allow for information infusion from base model + layers_per_block = 2 # Currently, ControlNet-XS only supports SD or SDXL, for which `layers_per_block` is always 2 def compute_block_out_channels(subblock_channels, layers_per_block): channels = [] for i, (_, subblock_out_channels) in enumerate(subblock_channels): @@ -311,7 +319,7 @@ def compute_block_out_channels(subblock_channels, layers_per_block): adjust_group_norms(self.control_model) # 3 - Gather Channel Sizes - self.ch_inout_ctrl = ControlNetXSModel.gather_subblock_sizes(self.control_model, base_or_control="control") + self.ch_inout_ctrl = ControlNetXSModel._gather_subblock_sizes(self.control_model, base_or_control="control") self.ch_inout_base = base_model_channel_sizes # 4 - Build connections between base and control model @@ -323,45 +331,29 @@ def compute_block_out_channels(subblock_channels, layers_per_block): self.up_zero_convs_in = nn.ModuleList([]) for ch_io_base in self.ch_inout_base["down"]: - self.down_zero_convs_in.append(self.make_zero_conv(in_channels=ch_io_base[1], out_channels=ch_io_base[1])) + self.down_zero_convs_in.append(self._make_zero_conv(in_channels=ch_io_base[1], out_channels=ch_io_base[1])) for i in range(len(self.ch_inout_ctrl["down"])): self.down_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl["down"][i][1], self.ch_inout_base["down"][i][1]) + self._make_zero_conv(self.ch_inout_ctrl["down"][i][1], self.ch_inout_base["down"][i][1]) ) - self.middle_block_out = self.make_zero_conv(self.ch_inout_ctrl["mid"][-1][1], self.ch_inout_base["mid"][-1][1]) + self.middle_block_out = self._make_zero_conv(self.ch_inout_ctrl["mid"][-1][1], self.ch_inout_base["mid"][-1][1]) 
self.up_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl["down"][-1][1], self.ch_inout_base["mid"][-1][1]) + self._make_zero_conv(self.ch_inout_ctrl["down"][-1][1], self.ch_inout_base["mid"][-1][1]) ) for i in range(1, len(self.ch_inout_ctrl["down"])): self.up_zero_convs_out.append( - self.make_zero_conv(self.ch_inout_ctrl["down"][-(i + 1)][1], self.ch_inout_base["up"][i - 1][1]) + self._make_zero_conv(self.ch_inout_ctrl["down"][-(i + 1)][1], self.ch_inout_base["up"][i - 1][1]) ) # 5 - Create conditioning hint embedding - conditioning_emb_layers = [ - nn.Conv2d(conditioning_channels, conditioning_block_sizes[0], 3, padding=1), - nn.SiLU(), - ] - - for i in range(len(conditioning_block_sizes) - 1): - in_channels = conditioning_block_sizes[i] - out_channels = conditioning_block_sizes[i + 1] - - conditioning_emb_layers += [ - nn.Conv2d(in_channels, in_channels, 3, padding=1, stride=1), - nn.SiLU(), - nn.Conv2d(in_channels, out_channels, 3, padding=1, stride=2), - nn.SiLU(), - ] - - conditioning_emb_layers.append( - zero_module(nn.Conv2d(conditioning_block_sizes[-1], block_out_channels[0], 3, padding=1)) + self.controlnet_cond_embedding = ControlNetConditioningEmbedding( + conditioning_embedding_channels=block_out_channels[0], + block_out_channels=conditioning_embedding_out_channels, + conditioning_channels=conditioning_channels, ) - self.input_hint_block = nn.Sequential(*conditioning_emb_layers) - # In the mininal implementation setting, we only need the control model up to the mid block del self.control_model.up_blocks del self.control_model.conv_norm_out @@ -372,7 +364,7 @@ def from_unet( cls, unet: UNet2DConditionModel, conditioning_channels: int = 3, - conditioning_block_sizes: Tuple[int] = (16, 32, 96, 256), + conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256), controlnet_conditioning_channel_order: str = "rgb", learn_embedding: bool = False, time_embedding_mix: float = 1.0, @@ -390,8 +382,8 @@ def from_unet( The UNet model we want to control. The dimensions of the ControlNetXSModel will be adapted to it. conditioning_channels (`int`, defaults to 3): Number of channels of conditioning input (e.g. an image) - conditioning_block_sizes (`Tuple[int]`, defaults to `(16,32,96,256))`): - TODO + conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`): + The tuple of output channel for each block in the `controlnet_cond_embedding` layer. controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): The channel order of conditional image. Will convert to `rgb` if it's `bgr`. 
learn_embedding (`bool`, defaults to `False`): @@ -411,7 +403,7 @@ def from_unet( """ - # check input + # Check input fixed_size = block_out_channels is not None relative_size = size_ratio is not None if not (fixed_size ^ relative_size): @@ -422,14 +414,14 @@ def from_unet( if num_attention_heads is not None and dim_attention_heads is not None: raise ValueError("Pass only one of `num_attention_heads` or `dim_attention_heads`.") - # create model + # Create model if block_out_channels is None: block_out_channels = [int(size_ratio * c) for c in unet.config.block_out_channels] if dim_attention_heads is not None: num_attention_heads = [math.ceil(c / dim_attention_heads) for c in block_out_channels] - # check that attention heads and group norms match channel sizes + # Check that attention heads and group norms match channel sizes # - attention heads def attn_heads_match_channel_sizes(attn_heads, channel_sizes): if isinstance(attn_heads, (tuple, list)): @@ -468,28 +460,28 @@ def get_time_emb_input_dim(unet: UNet2DConditionModel): def get_time_emb_dim(unet: UNet2DConditionModel): return unet.time_embedding.linear_2.out_features - # clone params from base unet - kwargs = dict(unet.config) + # Clone params from base unet if + # (i) it's required to build SD or SDXL, and + # (ii) it's not used for the time embedding (as time embedding of control model is never used), and + # (iii) it's not set further below anyway + to_keep = [ + 'attention_head_dim', + 'cross_attention_dim', + 'down_block_types', + 'sample_size', + 'transformer_layers_per_block', + 'up_block_types', + 'upcast_attention', + 'use_linear_projection', + 'num_attention_heads' + ] + kwargs = {k:v for k,v in dict(unet.config).items() if k in to_keep} kwargs.update(block_out_channels=block_out_channels) if num_attention_heads is not None: kwargs.update(attention_head_dim=attention_head_dim) kwargs.update(norm_num_groups=norm_num_groups) - # time embedding of control unet is not used. So remove params for them. 
- to_remove = ( - "flip_sin_to_cos", - "freq_shift", - "addition_embed_type", - "addition_time_embed_dim", - "class_embed_type", - "num_class_embeds", - "projection_class_embeddings_input_dim", - "class_embeddings_concat", - ) - for o in to_remove: - del kwargs[o] - - # add controlnetxs-specific params + # Add controlnetxs-specific params kwargs.update( conditioning_channels=conditioning_channels, controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, @@ -497,8 +489,8 @@ def get_time_emb_dim(unet: UNet2DConditionModel): time_embedding_dim=get_time_emb_dim(unet), time_embedding_mix=time_embedding_mix, learn_embedding=learn_embedding, - base_model_channel_sizes=ControlNetXSModel.gather_subblock_sizes(unet, base_or_control="base"), - conditioning_block_sizes=conditioning_block_sizes, + base_model_channel_sizes=ControlNetXSModel._gather_subblock_sizes(unet, base_or_control="base"), + conditioning_embedding_out_channels=conditioning_embedding_out_channels, ) return cls(**kwargs) @@ -710,7 +702,7 @@ def forward( cemb = encoder_hidden_states # Preparation - guided_hint = self.input_hint_block(controlnet_cond) + guided_hint = self.controlnet_cond_embedding(controlnet_cond) h_ctrl = h_base = sample hs_base, hs_ctrl = [], [] @@ -767,7 +759,7 @@ def forward( return ControlNetXSOutput(sample=h_base) - def make_zero_conv(self, in_channels, out_channels=None): + def _make_zero_conv(self, in_channels, out_channels=None): # keep running track of channels sizes self.in_channels = in_channels self.out_channels = out_channels or in_channels @@ -776,33 +768,16 @@ def make_zero_conv(self, in_channels, out_channels=None): @torch.no_grad() def _check_if_vae_compatible(self, vae: AutoencoderKL): - condition_downscale_factor = 2 ** (len(self.config.conditioning_block_sizes) - 1) - - # Multiply by 2, as otherwise we have channel with sizes = 1 after vae encoding, which confuses PyTorch. - # Alternativy, we could set the vae to eval mode. - in_size = condition_downscale_factor * 2 - - rand_tensor = torch.rand((1, 3, in_size, in_size)).to(vae.device, dtype=vae.dtype) - - encoded_tensor = vae.encode(rand_tensor) - if hasattr(encoded_tensor, "latent_dist"): - encoded_tensor = encoded_tensor.latent_dist.sample() - elif hasattr(encoded_tensor, "latents"): - encoded_tensor = encoded_tensor.latents - else: - raise ValueError(f"Output of {type(vae)} has neither `latents` nor `latent_dist` as attribute.") - - out_size = encoded_tensor.shape[-1] - - vae_downscale_factor = in_size / out_size - + condition_downscale_factor = 2 ** (len(self.config.conditioning_embedding_out_channels) - 1) + vae_downscale_factor = 2 ** (len(vae.config.block_out_channels) - 1) compatible = condition_downscale_factor == vae_downscale_factor - return compatible, condition_downscale_factor, vae_downscale_factor -class EmbedSequential(nn.ModuleList): - """Sequential module passing embeddings (time and conditioning) to children if they support it.""" +class SubBlock(nn.ModuleList): + """A SubBlock is the largest piece of either base or control model, that is executed independently of the other model respectively. + Before each subblock, information is concatted from base to control. And after each subblock, information is added from control to base. 
+ """ def __init__(self, ms, *args, **kwargs): if not is_iterable(ms): @@ -817,6 +792,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): + """Iterate through children and pass correct information to each.""" for m in self: if isinstance(m, ResnetBlock2D): x = m(x, temb) @@ -945,16 +921,16 @@ def find_denominator(number, start): # resnets for r in block.resnets: if r.norm1.num_groups < max_num_group: - r.norm1.num_groups = find_denominator(r.norm1.num_channels, start=32) + r.norm1.num_groups = find_denominator(r.norm1.num_channels, start=max_num_group) if r.norm2.num_groups < max_num_group: - r.norm2.num_groups = find_denominator(r.norm2.num_channels, start=32) + r.norm2.num_groups = find_denominator(r.norm2.num_channels, start=max_num_group) # transformers if hasattr(block, "attentions"): for a in block.attentions: if a.norm.num_groups < max_num_group: - a.norm.num_groups = find_denominator(a.norm.num_channels, start=32) + a.norm.num_groups = find_denominator(a.norm.num_channels, start=max_num_group) def is_iterable(o): @@ -1000,7 +976,7 @@ def to_sub_blocks(blocks): for d in b.downsamplers: sub_blocks.append([d]) - return list(map(EmbedSequential, sub_blocks)) + return list(map(SubBlock, sub_blocks)) def zero_module(module): diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index 4829fdb77810..3aecc43f0f5b 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -325,7 +325,6 @@ def forward( residual = hidden_states hidden_states = self.norm(hidden_states) - if not self.use_linear_projection: hidden_states = ( self.proj_in(hidden_states, scale=lora_scale) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index 6a1f1897f954..c46ab9925180 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -66,10 +66,10 @@ >>> # initialize the models and pipeline >>> controlnet_conditioning_scale = 0.5 >>> controlnet = ControlNetXSModel.from_pretrained( - ... "UmerHA/ConrolNetXS-SD2.1-canny", torch_dtype=torch.float32 + ... "UmerHA/ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16 ... ) >>> pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - ... "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float32 + ... "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float16 ... 
) >>> pipe.enable_model_cpu_offload() diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 5263fcf1244d..40c981a46d48 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -1129,10 +1129,8 @@ def __call__( # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None: added_cond_kwargs["image_embeds"] = image_embeds - noise_pred = self.unet( latent_model_input, t, diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index edf1f512d9b0..59d9af9f55b6 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -167,8 +167,6 @@ def __init__( sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = torch.from_numpy(sigmas) - # print(f'At the end of __init__, the sigmas are {self.sigmas[:5]} ...') - # setable values self.num_inference_steps = None timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() @@ -242,7 +240,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic # casting to int to avoid issues when num_inference_step is power of 3 timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) timesteps += self.config.steps_offset - elif self.config.timestep_spacing == "trailing": step_ratio = self.config.num_train_timesteps / self.num_inference_steps # creates integer timesteps by multiplying by ratio From a5d94d7d1a9f7fb1b416f6136c1408dc93273b8b Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 30 Nov 2023 19:23:56 +0100 Subject: [PATCH 70/88] simplified control model surgery --- src/diffusers/models/controlnetxs.py | 40 +++++++++------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 70e393e7a485..7bf6d0569595 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -286,34 +286,20 @@ def __init__( adjust_time_dims(self.control_model, time_embedding_input_dim, time_embedding_dim) # 2.2 - Allow for information infusion from base model - layers_per_block = 2 # Currently, ControlNet-XS only supports SD or SDXL, for which `layers_per_block` is always 2 - def compute_block_out_channels(subblock_channels, layers_per_block): - channels = [] - for i, (_, subblock_out_channels) in enumerate(subblock_channels): - # first subblock is the conv_in - if i == 0: - continue - # every block consists of `layers_per_block` resnet/attention subblocks and a down sample subblock - if i % (layers_per_block + 1) == 0: - channels.append(subblock_out_channels) - # the last block doesn't have a down conv, so is handled separately - if i == len(subblock_channels) - 1: - channels.append(subblock_out_channels) - return channels - - base_block_out_channels = compute_block_out_channels( - subblock_channels=base_model_channel_sizes["down"], layers_per_block=layers_per_block - ) - extra_channels = list( - zip(base_block_out_channels[0:1] + base_block_out_channels[:-1], base_block_out_channels) - ) - for i, (e1, e2) in enumerate(extra_channels): - 
increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=0, by=e1) - increase_block_input_in_encoder_resnet(self.control_model, block_no=i, resnet_idx=1, by=e2) - if self.control_model.down_blocks[i].downsamplers: - increase_block_input_in_encoder_downsampler(self.control_model, block_no=i, by=e2) - increase_block_input_in_mid_resnet(self.control_model, by=base_block_out_channels[-1]) + # We concat the output of each base encoder subblocks to the input of the next control encoder subblock + # (We ignore the 1st element, as it represents the `conv_in`.) + extra_input_channels = [input_channels for input_channels, _ in base_model_channel_sizes["down"][1:]] + it_extra_input_channels = iter(extra_input_channels) + + for b, block in enumerate(self.control_model.down_blocks): + for r in range(len(block.resnets)): + increase_block_input_in_encoder_resnet(self.control_model, block_no=b, resnet_idx=r, by=next(it_extra_input_channels)) + + if block.downsamplers: + increase_block_input_in_encoder_downsampler(self.control_model, block_no=b, by=next(it_extra_input_channels)) + + increase_block_input_in_mid_resnet(self.control_model, by=extra_input_channels[-1]) # 2.3 - Make group norms work with modified channel sizes adjust_group_norms(self.control_model) From 87a32546b5b0390359abe7ccf4497248b3db2c25 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 30 Nov 2023 19:30:55 +0100 Subject: [PATCH 71/88] fixed tests & make style / quality --- src/diffusers/models/controlnetxs.py | 44 +++++++++++-------- .../controlnetxs/test_controlnetxs.py | 2 +- .../controlnetxs/test_controlnetxs_sdxl.py | 2 +- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 7bf6d0569595..4a043110ef0a 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -18,6 +18,7 @@ import torch import torch.utils.checkpoint from torch import nn +from torch.nn import functional as F from torch.nn.modules.normalization import GroupNorm from ..configuration_utils import ConfigMixin, register_to_config @@ -57,6 +58,7 @@ class ControlNetXSOutput(BaseOutput): sample: torch.FloatTensor = None + # copied from diffusers.models.controlnet.ControlNetConditioningEmbedding class ControlNetConditioningEmbedding(nn.Module): """ @@ -138,7 +140,7 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): def init_original(cls, base_model: UNet2DConditionModel, is_sdxl=True): """ Create a ControlNetXS model with the same parameters as in the original paper (https://github.com/vislearn/ControlNet-XS). - + Parameters: base_model (`UNet2DConditionModel`): Base unet model. Needs to be either StableDiffusion or StableDiffusion-XL. @@ -166,7 +168,7 @@ def init_original(cls, base_model: UNet2DConditionModel, is_sdxl=True): @classmethod def _gather_subblock_sizes(cls, unet: UNet2DConditionModel, base_or_control: str): - """To create correctly sized connections between base and control model, we need to know + """To create correctly sized connections between base and control model, we need to know the input and output channels of each subblock. 
Parameters: @@ -294,10 +296,14 @@ def __init__( for b, block in enumerate(self.control_model.down_blocks): for r in range(len(block.resnets)): - increase_block_input_in_encoder_resnet(self.control_model, block_no=b, resnet_idx=r, by=next(it_extra_input_channels)) - + increase_block_input_in_encoder_resnet( + self.control_model, block_no=b, resnet_idx=r, by=next(it_extra_input_channels) + ) + if block.downsamplers: - increase_block_input_in_encoder_downsampler(self.control_model, block_no=b, by=next(it_extra_input_channels)) + increase_block_input_in_encoder_downsampler( + self.control_model, block_no=b, by=next(it_extra_input_channels) + ) increase_block_input_in_mid_resnet(self.control_model, by=extra_input_channels[-1]) @@ -323,7 +329,9 @@ def __init__( self._make_zero_conv(self.ch_inout_ctrl["down"][i][1], self.ch_inout_base["down"][i][1]) ) - self.middle_block_out = self._make_zero_conv(self.ch_inout_ctrl["mid"][-1][1], self.ch_inout_base["mid"][-1][1]) + self.middle_block_out = self._make_zero_conv( + self.ch_inout_ctrl["mid"][-1][1], self.ch_inout_base["mid"][-1][1] + ) self.up_zero_convs_out.append( self._make_zero_conv(self.ch_inout_ctrl["down"][-1][1], self.ch_inout_base["mid"][-1][1]) @@ -451,17 +459,17 @@ def get_time_emb_dim(unet: UNet2DConditionModel): # (ii) it's not used for the time embedding (as time embedding of control model is never used), and # (iii) it's not set further below anyway to_keep = [ - 'attention_head_dim', - 'cross_attention_dim', - 'down_block_types', - 'sample_size', - 'transformer_layers_per_block', - 'up_block_types', - 'upcast_attention', - 'use_linear_projection', - 'num_attention_heads' + "attention_head_dim", + "cross_attention_dim", + "down_block_types", + "sample_size", + "transformer_layers_per_block", + "up_block_types", + "upcast_attention", + "use_linear_projection", + "num_attention_heads", ] - kwargs = {k:v for k,v in dict(unet.config).items() if k in to_keep} + kwargs = {k: v for k, v in dict(unet.config).items() if k in to_keep} kwargs.update(block_out_channels=block_out_channels) if num_attention_heads is not None: kwargs.update(attention_head_dim=attention_head_dim) @@ -761,9 +769,9 @@ def _check_if_vae_compatible(self, vae: AutoencoderKL): class SubBlock(nn.ModuleList): - """A SubBlock is the largest piece of either base or control model, that is executed independently of the other model respectively. + """A SubBlock is the largest piece of either base or control model, that is executed independently of the other model respectively. Before each subblock, information is concatted from base to control. And after each subblock, information is added from control to base. 
- """ + """ def __init__(self, ms, *args, **kwargs): if not is_iterable(ms): diff --git a/tests/pipelines/controlnetxs/test_controlnetxs.py b/tests/pipelines/controlnetxs/test_controlnetxs.py index 721956cf23b9..fe1ca819d7fd 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs.py @@ -134,7 +134,7 @@ def get_dummy_components(self, time_cond_proj_dim=None): time_embedding_mix=0.95, learn_embedding=True, size_ratio=0.5, - conditioning_block_sizes=(16, 32), + conditioning_embedding_out_channels=(16, 32), dim_attention_heads=2, ) torch.manual_seed(0) diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py index 5248a031bb9a..dbdc532a6f3b 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py @@ -86,7 +86,7 @@ def get_dummy_components(self): time_embedding_mix=0.95, learn_embedding=True, size_ratio=0.5, - conditioning_block_sizes=(16, 32), + conditioning_embedding_out_channels=(16, 32), ) torch.manual_seed(0) scheduler = EulerDiscreteScheduler( From d8e723be71a12c8f9668f15d22ca1b81085d5593 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Thu, 30 Nov 2023 23:43:38 +0100 Subject: [PATCH 72/88] Updated docs; deleted pip & cursor files --- .cursorignore | 1 - Pipfile | 11 ----------- docs/source/en/api/pipelines/controlnetxs.md | 2 ++ docs/source/en/api/pipelines/controlnetxs_sdxl.md | 2 ++ 4 files changed, 4 insertions(+), 12 deletions(-) delete mode 100644 .cursorignore delete mode 100644 Pipfile diff --git a/.cursorignore b/.cursorignore deleted file mode 100644 index dd449725e188..000000000000 --- a/.cursorignore +++ /dev/null @@ -1 +0,0 @@ -*.md diff --git a/Pipfile b/Pipfile deleted file mode 100644 index 0757494bb360..000000000000 --- a/Pipfile +++ /dev/null @@ -1,11 +0,0 @@ -[[source]] -url = "https://pypi.org/simple" -verify_ssl = true -name = "pypi" - -[packages] - -[dev-packages] - -[requires] -python_version = "3.11" diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/docs/source/en/api/pipelines/controlnetxs.md index 6abf439246c1..aa2a69ca5876 100644 --- a/docs/source/en/api/pipelines/controlnetxs.md +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -16,6 +16,8 @@ ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/Contr As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. +Using ControlNet-XS instead of regular ControlNet will produce images of roughly the same quality, but 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb) with StableDiffusion-XL) and with ~45% less memory usage. + Here's the overview from the [project page](https://vislearn.github.io/ControlNet-XS/): *With increasing computing capabilities, current model architectures appear to follow the trend of simply upscaling all components without validating the necessity for doing so. In this project we investigate the size and architectural design of ControlNet [Zhang et al., 2023] for controlling the image generation process with stable diffusion-based models. 
We show that a new architecture with as little as 1% of the parameters of the base model achieves state-of-the art results, considerably better than ControlNet in terms of FID score. Hence we call it ControlNet-XS. We provide the code for controlling StableDiffusion-XL [Podell et al., 2023] (Model B, 48M Parameters) and StableDiffusion 2.1 [Rombach et al. 2022] (Model B, 14M Parameters), all under openrail license.* diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md index 0a9d2a506dc9..a6861334fb4d 100644 --- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -16,6 +16,8 @@ ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/Contr As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. +Using ControlNet-XS instead of regular ControlNet will produce images of roughly the same quality, but 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb)) and with ~45% less memory usage. + Here's the overview from the [project page](https://vislearn.github.io/ControlNet-XS/): *With increasing computing capabilities, current model architectures appear to follow the trend of simply upscaling all components without validating the necessity for doing so. In this project we investigate the size and architectural design of ControlNet [Zhang et al., 2023] for controlling the image generation process with stable diffusion-based models. We show that a new architecture with as little as 1% of the parameters of the base model achieves state-of-the art results, considerably better than ControlNet in terms of FID score. Hence we call it ControlNet-XS. We provide the code for controlling StableDiffusion-XL [Podell et al., 2023] (Model B, 48M Parameters) and StableDiffusion 2.1 [Rombach et al. 
2022] (Model B, 14M Parameters), all under openrail license.* From 7b006467074324a16dacc2142d0b0f65ec66d7bb Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 1 Dec 2023 00:02:14 +0100 Subject: [PATCH 73/88] Rolled back minimal change to resnet --- src/diffusers/models/resnet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 555dfac92451..139019eb87c3 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -764,7 +764,9 @@ def forward( self.conv_shortcut(input_tensor, scale) if not USE_PEFT_BACKEND else self.conv_shortcut(input_tensor) ) - return (input_tensor + hidden_states) / self.output_scale_factor + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + + return output_tensor # unet_rl.py From d8cc418adb872bd2236fd8bd8709afb1fa1f5dcf Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 1 Dec 2023 00:03:44 +0100 Subject: [PATCH 74/88] Update resnet.py --- src/diffusers/models/resnet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 139019eb87c3..73fbda3bf835 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -756,6 +756,7 @@ def forward( hidden_states = hidden_states * (1 + scale) + shift hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.dropout(hidden_states) hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) From aacf54e6eac2739712c0b5c3b9d21d0c71a00f4e Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 1 Dec 2023 00:04:34 +0100 Subject: [PATCH 75/88] Update resnet.py --- src/diffusers/models/resnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 73fbda3bf835..7a48d343a531 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -756,7 +756,7 @@ def forward( hidden_states = hidden_states * (1 + scale) + shift hidden_states = self.nonlinearity(hidden_states) - + hidden_states = self.dropout(hidden_states) hidden_states = self.conv2(hidden_states, scale) if not USE_PEFT_BACKEND else self.conv2(hidden_states) From 5997cf9f89c434e7fc8ad09d87c6b202cedd8d8e Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:18:24 +0100 Subject: [PATCH 76/88] Update src/diffusers/models/controlnetxs.py Co-authored-by: Patrick von Platen --- src/diffusers/models/controlnetxs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 4a043110ef0a..24dd06292692 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -261,7 +261,7 @@ def __init__( cross_attention_dim: Union[int, Tuple[int]] = 1280, transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, attention_head_dim: Union[int, Tuple[int]] = 8, - num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + num_attention_heads: Optional[Union[int, Tuple[int]]] = 8, use_linear_projection: bool = False, upcast_attention: bool = False, ): From 40e099a98b300d2aca2f9600a8a4bce103ddfa4d Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Fri, 1 Dec 2023 17:18:42 +0100 
Subject: [PATCH 77/88] Update src/diffusers/models/controlnetxs.py Co-authored-by: Patrick von Platen --- src/diffusers/models/controlnetxs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 24dd06292692..8275f2c49115 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -260,7 +260,6 @@ def __init__( norm_num_groups: Optional[int] = 32, cross_attention_dim: Union[int, Tuple[int]] = 1280, transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, - attention_head_dim: Union[int, Tuple[int]] = 8, num_attention_heads: Optional[Union[int, Tuple[int]]] = 8, use_linear_projection: bool = False, upcast_attention: bool = False, From b789646a49afa6188a72350efc01384d70a68400 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:34:12 +0100 Subject: [PATCH 78/88] Incorporated review feedback --- .../en/api/pipelines/controlnetxs_sdxl.md | 2 - src/diffusers/models/controlnetxs.py | 53 +++++++++---------- .../controlnetxs/test_controlnetxs.py | 2 +- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md index a6861334fb4d..13dc3206620e 100644 --- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -30,8 +30,6 @@ This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ -If you don't see a checkpoint you're interested in, you can train your own SDXL ControlNet with our [training script](../../../../../examples/controlnet/README_sdxl). - Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 8275f2c49115..9083c19e469a 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -127,7 +127,7 @@ class ControlNetXSModel(ModelMixin, ConfigMixin): time_embedding_dim (`int`, defaults to 1280): Dimension of output from time embedding. Needs to be same as in the base model. learn_embedding (`bool`, defaults to `False`): - Wether to use time embedding of the control model. If yes, the time embedding is a linear interpolation of + Whether to use time embedding of the control model. If yes, the time embedding is a linear interpolation of the time embeddings of the control and base model with interpolation parameter `time_embedding_mix**3`. time_embedding_mix (`float`, defaults to 1.0): Linear interpolation parameter used if `learn_embedding` is `True`. A value of 1.0 means only the @@ -147,14 +147,25 @@ def init_original(cls, base_model: UNet2DConditionModel, is_sdxl=True): is_sdxl (`bool`, defaults to `True`): Wether passed `base_model` is a StableDiffusion-XL model. """ + + def get_dim_attn_heads(base_model: UNet2DConditionModel, size_ratio: float, num_attn_heads: int): + """ + Currently, diffusers can only set the dimension of attention heads (see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why). + The original ControlNet-XS model, however, define the number of attention heads. 
+ That's why compute the dimensions needed to get the correct number of attention heads. + """ + block_out_channels = [int(size_ratio * c) for c in base_model.config.block_out_channels] + dim_attn_heads = [math.ceil(c / num_attn_heads) for c in block_out_channels] + return dim_attn_heads + if is_sdxl: return ControlNetXSModel.from_unet( base_model, time_embedding_mix=0.95, learn_embedding=True, size_ratio=0.1, - dim_attention_heads=64, conditioning_embedding_out_channels=(16, 32, 96, 256), + num_attention_heads=get_dim_attn_heads(base_model, 0.1, 64), ) else: return ControlNetXSModel.from_unet( @@ -162,8 +173,8 @@ def init_original(cls, base_model: UNet2DConditionModel, is_sdxl=True): time_embedding_mix=1.0, learn_embedding=True, size_ratio=0.0125, - dim_attention_heads=8, conditioning_embedding_out_channels=(16, 32, 96, 256), + num_attention_heads=get_dim_attn_heads(base_model, 0.0125, 8), ) @classmethod @@ -261,7 +272,6 @@ def __init__( cross_attention_dim: Union[int, Tuple[int]] = 1280, transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, num_attention_heads: Optional[Union[int, Tuple[int]]] = 8, - use_linear_projection: bool = False, upcast_attention: bool = False, ): super().__init__() @@ -275,9 +285,8 @@ def __init__( norm_num_groups=norm_num_groups, cross_attention_dim=cross_attention_dim, transformer_layers_per_block=transformer_layers_per_block, - attention_head_dim=attention_head_dim, - num_attention_heads=num_attention_heads, - use_linear_projection=use_linear_projection, + attention_head_dim=num_attention_heads, + use_linear_projection=True, upcast_attention=upcast_attention, time_embedding_dim=time_embedding_dim, ) @@ -363,8 +372,7 @@ def from_unet( time_embedding_mix: float = 1.0, block_out_channels: Optional[Tuple[int]] = None, size_ratio: Optional[float] = None, - num_attention_heads: Optional[Union[int, Tuple[int]]] = None, - dim_attention_heads: Optional[int] = None, + num_attention_heads: Optional[Union[int, Tuple[int]]] = 8, norm_num_groups: Optional[int] = None, ): r""" @@ -386,14 +394,15 @@ def from_unet( time_embedding_mix (`float`, defaults to 1.0): Linear interpolation parameter used if `learn_embedding` is `True`. block_out_channels (`Tuple[int]`, *optional*): - Down blocks output channels in control model. Either this or `block_out_channels` must be given. + Down blocks output channels in control model. Either this or `size_ratio` must be given. size_ratio (float, *optional*): When given, block_out_channels is set to a relative fraction of the base model's block_out_channels. - Either this or `size_ratio` must be given. + Either this or `block_out_channels` must be given. + num_attention_heads (`Union[int, Tuple[int]]`, *optional*): + The dimension of the attention heads. The naming seems a bit confusing and it is, see https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why. norm_num_groups (int, *optional*, defaults to `None`): The number of groups to use for the normalization of the control unet. If `None`, `int(unet.config.norm_num_groups * size_ratio)` is taken. - """ # Check input @@ -404,16 +413,10 @@ def from_unet( "Pass exactly one of `block_out_channels` (for absolute sizing) or `control_model_ratio` (for relative sizing)." 
) - if num_attention_heads is not None and dim_attention_heads is not None: - raise ValueError("Pass only one of `num_attention_heads` or `dim_attention_heads`.") - # Create model if block_out_channels is None: block_out_channels = [int(size_ratio * c) for c in unet.config.block_out_channels] - if dim_attention_heads is not None: - num_attention_heads = [math.ceil(c / dim_attention_heads) for c in block_out_channels] - # Check that attention heads and group norms match channel sizes # - attention heads def attn_heads_match_channel_sizes(attn_heads, channel_sizes): @@ -422,10 +425,10 @@ def attn_heads_match_channel_sizes(attn_heads, channel_sizes): else: return all(c % attn_heads == 0 for c in channel_sizes) - attention_head_dim = num_attention_heads or unet.config.attention_head_dim - if not attn_heads_match_channel_sizes(attention_head_dim, block_out_channels): + num_attention_heads = num_attention_heads or unet.config.attention_head_dim + if not attn_heads_match_channel_sizes(num_attention_heads, block_out_channels): raise ValueError( - f"The number of attention heads ({attention_head_dim}) must divide `block_out_channels` ({block_out_channels}). If you didn't set `num_attention_heads` or `attention_head_dim` the default settings don't match your model. Set one of them manually." + f"The dimension of attention heads ({num_attention_heads}) must divide `block_out_channels` ({block_out_channels}). If you didn't set `num_attention_heads` the default settings don't match your model. Set `num_attention_heads` manually." ) # - group norms @@ -444,7 +447,7 @@ def group_norms_match_channel_sizes(num_groups, channel_sizes): ) else: raise ValueError( - f"`block_out_channels` ({block_out_channels}) don't match the base models `norm_num_groups` ({unet.config.norm_num_groups}). Setting `norm_num_groups` to `min(norm_num_groups)` ({norm_num_groups}) didn't fix this. Pass `norm_num_groups` explicitly so it divides all block_out_channels." + f"`block_out_channels` ({block_out_channels}) don't match the base models `norm_num_groups` ({unet.config.norm_num_groups}). Setting `norm_num_groups` to `min(block_out_channels)` ({norm_num_groups}) didn't fix this. Pass `norm_num_groups` explicitly so it divides all block_out_channels." 
) def get_time_emb_input_dim(unet: UNet2DConditionModel): @@ -458,20 +461,16 @@ def get_time_emb_dim(unet: UNet2DConditionModel): # (ii) it's not used for the time embedding (as time embedding of control model is never used), and # (iii) it's not set further below anyway to_keep = [ - "attention_head_dim", "cross_attention_dim", "down_block_types", "sample_size", "transformer_layers_per_block", "up_block_types", "upcast_attention", - "use_linear_projection", - "num_attention_heads", ] kwargs = {k: v for k, v in dict(unet.config).items() if k in to_keep} kwargs.update(block_out_channels=block_out_channels) - if num_attention_heads is not None: - kwargs.update(attention_head_dim=attention_head_dim) + kwargs.update(num_attention_heads=num_attention_heads) kwargs.update(norm_num_groups=norm_num_groups) # Add controlnetxs-specific params diff --git a/tests/pipelines/controlnetxs/test_controlnetxs.py b/tests/pipelines/controlnetxs/test_controlnetxs.py index fe1ca819d7fd..e3212e9e301c 100644 --- a/tests/pipelines/controlnetxs/test_controlnetxs.py +++ b/tests/pipelines/controlnetxs/test_controlnetxs.py @@ -135,7 +135,7 @@ def get_dummy_components(self, time_cond_proj_dim=None): learn_embedding=True, size_ratio=0.5, conditioning_embedding_out_channels=(16, 32), - dim_attention_heads=2, + num_attention_heads=2, ) torch.manual_seed(0) scheduler = DDIMScheduler( From 06bfe12cc0864225d1a8e033c61805b3f2455bc1 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:56:09 +0100 Subject: [PATCH 79/88] Update docs/source/en/api/pipelines/controlnetxs_sdxl.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/api/pipelines/controlnetxs_sdxl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md index 13dc3206620e..25544f1cef50 100644 --- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md +++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # ControlNet-XS with Stable Diffusion XL -ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produces good results. +ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results. As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. 
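The patches up to this point settle the public surface of the model: `init_original` reproduces the paper's configuration, `from_unet` builds a control model sized relative to a given base UNet, and the dedicated pipelines consume the result like a regular ControlNet. Below is a minimal usage sketch, not part of the patch series itself: it assumes `ControlNetXSModel` and `StableDiffusionControlNetXSPipeline` end up importable from the top-level `diffusers` namespace (the exports are not shown in this excerpt) and reuses the checkpoint ids from the pipeline docstring example.

```py
# Illustrative sketch only (not part of the patch series). Assumes the two ControlNet-XS
# classes are exported from `diffusers`; checkpoint ids are reused from the docstring example.
import torch
from diffusers import ControlNetXSModel, StableDiffusionControlNetXSPipeline, UNet2DConditionModel

# Option A: create an untrained control model with the paper's settings for SD 2.1.
# With size_ratio=0.0125 the control encoder's block_out_channels shrink from
# (320, 640, 1280, 1280) to (4, 8, 16, 16); `init_original` also picks matching
# attention-head dimensions internally.
base_unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="unet")
controlnet = ControlNetXSModel.init_original(base_unet, is_sdxl=False)

# Option B: load a trained checkpoint and run it through the dedicated pipeline.
controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

# `canny_image` would be a pre-computed canny edge map (PIL.Image), as in the docstring example.
# image = pipe("aerial view of a house", image=canny_image, controlnet_conditioning_scale=0.5).images[0]
```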
From 4e9448aa7b14cfc0ad23920cd6c619d11aefecbf Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:56:20 +0100 Subject: [PATCH 80/88] Update docs/source/en/api/pipelines/controlnetxs.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/api/pipelines/controlnetxs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/docs/source/en/api/pipelines/controlnetxs.md index aa2a69ca5876..ac0bf7806103 100644 --- a/docs/source/en/api/pipelines/controlnetxs.md +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # ControlNet-XS with Stable Diffusion -ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produces good results. +ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results. As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. From 348bedbad49b4f9db9e79e88926d279142d0baff Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:56:34 +0100 Subject: [PATCH 81/88] Update docs/source/en/api/pipelines/controlnetxs.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/api/pipelines/controlnetxs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/docs/source/en/api/pipelines/controlnetxs.md index ac0bf7806103..9f3aedbeab21 100644 --- a/docs/source/en/api/pipelines/controlnetxs.md +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results. -As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. +Like the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. 
It is a more flexible and accurate way to control the image generation process. Using ControlNet-XS instead of regular ControlNet will produce images of roughly the same quality, but 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb) with StableDiffusion-XL) and with ~45% less memory usage. From 167b7674f1e25b4ae5cf4896c5dc4f5bffe8b3f5 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:56:44 +0100 Subject: [PATCH 82/88] Update docs/source/en/api/pipelines/controlnetxs.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/api/pipelines/controlnetxs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/docs/source/en/api/pipelines/controlnetxs.md index 9f3aedbeab21..3aa4e89cece5 100644 --- a/docs/source/en/api/pipelines/controlnetxs.md +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -16,7 +16,7 @@ ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/Contr Like the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process. -Using ControlNet-XS instead of regular ControlNet will produce images of roughly the same quality, but 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb) with StableDiffusion-XL) and with ~45% less memory usage. +ControlNet-XS generates images with comparable quality to a regular ControlNet, but it is 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb) with StableDiffusion-XL) and uses ~45% less memory. Here's the overview from the [project page](https://vislearn.github.io/ControlNet-XS/): From 40744c9e966ea59e52d86a3dcb0edebe957d8c3f Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:56:54 +0100 Subject: [PATCH 83/88] Update src/diffusers/models/controlnetxs.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/diffusers/models/controlnetxs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 9083c19e469a..0bf9d300892e 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -145,7 +145,7 @@ def init_original(cls, base_model: UNet2DConditionModel, is_sdxl=True): base_model (`UNet2DConditionModel`): Base unet model. Needs to be either StableDiffusion or StableDiffusion-XL. is_sdxl (`bool`, defaults to `True`): - Wether passed `base_model` is a StableDiffusion-XL model. + Whether passed `base_model` is a StableDiffusion-XL model. 
""" def get_dim_attn_heads(base_model: UNet2DConditionModel, size_ratio: float, num_attn_heads: int): From bab91cbbb6544b4ad8f5d9974de5ac8f8102bfad Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:57:05 +0100 Subject: [PATCH 84/88] Update src/diffusers/models/controlnetxs.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/diffusers/models/controlnetxs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/controlnetxs.py b/src/diffusers/models/controlnetxs.py index 0bf9d300892e..41f2d8af01b1 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/src/diffusers/models/controlnetxs.py @@ -143,7 +143,7 @@ def init_original(cls, base_model: UNet2DConditionModel, is_sdxl=True): Parameters: base_model (`UNet2DConditionModel`): - Base unet model. Needs to be either StableDiffusion or StableDiffusion-XL. + Base UNet model. Needs to be either StableDiffusion or StableDiffusion-XL. is_sdxl (`bool`, defaults to `True`): Whether passed `base_model` is a StableDiffusion-XL model. """ From bf976d0e25902844fd989a74aefbff375efeb24d Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:57:17 +0100 Subject: [PATCH 85/88] Update src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py index c46ab9925180..1cc18e879baa 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py @@ -699,7 +699,7 @@ def __call__( Args: prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. - image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition to provide guidance to the `unet` for generation. If the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be From dfa97f081e13ebf809682f926ac59c3974c6a257 Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Mon, 4 Dec 2023 17:57:31 +0100 Subject: [PATCH 86/88] Update docs/source/en/api/pipelines/controlnetxs.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/api/pipelines/controlnetxs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/docs/source/en/api/pipelines/controlnetxs.md index 3aa4e89cece5..2d4ae7b8ce46 100644 --- a/docs/source/en/api/pipelines/controlnetxs.md +++ b/docs/source/en/api/pipelines/controlnetxs.md @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. 
 -->
 
-# ControlNet-XS with Stable Diffusion
+# ControlNet-XS
 
 ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.

From eb8dd2b99d232d6de2ef77a40ebc10d93c3768cf Mon Sep 17 00:00:00 2001
From: UmerHA <40663591+UmerHA@users.noreply.github.com>
Date: Mon, 4 Dec 2023 17:57:44 +0100
Subject: [PATCH 87/88] Update src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 .../pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
index a04cdbb05dda..59aee5d97d37 100644
--- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py
@@ -774,7 +774,7 @@ def __call__(
             prompt_2 (`str` or `List[str]`, *optional*):
                 The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                 used in both text-encoders.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,:
+            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`):
                 The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
                 specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be

From 58ea5abe75f300c24e41e183f70fac208984de27 Mon Sep 17 00:00:00 2001
From: UmerHA <40663591+UmerHA@users.noreply.github.com>
Date: Mon, 4 Dec 2023 18:16:39 +0100
Subject: [PATCH 88/88] Incorporated doc feedback

---
 docs/source/en/api/pipelines/controlnetxs_sdxl.md | 4 ++--
 docs/source/en/api/pipelines/overview.md          | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/docs/source/en/api/pipelines/controlnetxs_sdxl.md
index 25544f1cef50..31075c0ef96a 100644
--- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md
+++ b/docs/source/en/api/pipelines/controlnetxs_sdxl.md
@@ -14,9 +14,9 @@ specific language governing permissions and limitations under the License.
 
 ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
 
-As with the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
+Like the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
 
-Using ControlNet-XS instead of regular ControlNet will produce images of roughly the same quality, but 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb)) and with ~45% less memory usage.
+ControlNet-XS generates images with comparable quality to a regular ControlNet, but it is 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb)) and uses ~45% less memory.
 
 Here's the overview from the [project page](https://vislearn.github.io/ControlNet-XS/):

diff --git a/docs/source/en/api/pipelines/overview.md b/docs/source/en/api/pipelines/overview.md
index 7dab22469dc2..761663e5fdf4 100644
--- a/docs/source/en/api/pipelines/overview.md
+++ b/docs/source/en/api/pipelines/overview.md
@@ -40,6 +40,8 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Consistency Models](consistency_models) | unconditional image generation |
 | [ControlNet](controlnet) | text2image, image2image, inpainting |
 | [ControlNet with Stable Diffusion XL](controlnet_sdxl) | text2image |
+| [ControlNet-XS](controlnetxs) | text2image |
+| [ControlNet-XS with Stable Diffusion XL](controlnetxs_sdxl) | text2image |
 | [Cycle Diffusion](cycle_diffusion) | image2image |
 | [Dance Diffusion](dance_diffusion) | unconditional audio generation |
 | [DDIM](ddim) | unconditional image generation |
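
Taken together, the pieces touched by these last patches — `init_original(base_model, is_sdxl=True)` on the ControlNet-XS model class, the pipeline `__call__` that accepts a control `image`, and the docs describing control images — suggest an end-to-end usage along the lines sketched below. This is a minimal, untested sketch: the class names `ControlNetXSModel` and `StableDiffusionXLControlNetXSPipeline`, their availability as top-level `diffusers` imports, and the constructor arguments are assumptions, not something these patches pin down.

```py
# Minimal sketch (assumed API). A control model is derived from an SDXL base UNet via
# `init_original(base_model, is_sdxl=True)` and plugged into a ControlNet-XS pipeline,
# which is then called with a prompt and a control image.
from diffusers import (
    ControlNetXSModel,                      # assumed class name
    StableDiffusionXLControlNetXSPipeline,  # assumed class name
    StableDiffusionXLPipeline,
)
from diffusers.utils import load_image

# Load an SDXL base model; its UNet is the `base_model` expected by `init_original`.
base = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")

# Freshly initialized (untrained) ControlNet-XS control model built around the base UNet.
controlnet = ControlNetXSModel.init_original(base.unet, is_sdxl=True)

pipe = StableDiffusionXLControlNetXSPipeline(
    vae=base.vae,
    text_encoder=base.text_encoder,
    text_encoder_2=base.text_encoder_2,
    tokenizer=base.tokenizer,
    tokenizer_2=base.tokenizer_2,
    unet=base.unet,
    controlnet=controlnet,
    scheduler=base.scheduler,
).to("cuda")

# `image` is the ControlNet input condition from the `__call__` docstring: a PIL image,
# tensor, numpy array, or (nested) lists of those. Here, a canny edge map.
canny_image = load_image("canny_edge_map.png")  # hypothetical local control image
result = pipe(
    "an aerial photo of a futuristic city",
    image=canny_image,
    num_inference_steps=30,
).images[0]
result.save("controlnet_xs_output.png")
```

In practice the control model would either be trained or loaded from pretrained ControlNet-XS weights before the control image can meaningfully steer the output; a model produced by `init_original` alone only reproduces the architecture described in its docstring.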