
Commit b0e0da2

yiyixuxu authored and committed
first draft
refactor rotary embedding + move dit block to dit model file
1 parent 5b00b4b commit b0e0da2

File tree

12 files changed: +748 −916 lines changed

src/diffusers/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@
     "ConsistencyDecoderVAE",
     "ControlNetModel",
     "ControlNetXSAdapter",
-    "HunyuanDiT2DModel",
+    "HunyuanDiT2DModel",
     "I2VGenXLUNet",
     "Kandinsky3UNet",
     "ModelMixin",

src/diffusers/models/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -37,10 +37,10 @@
 _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"]
 _import_structure["embeddings"] = ["ImageProjection"]
 _import_structure["modeling_utils"] = ["ModelMixin"]
+_import_structure["transformers.hunyuan_transformer_2d"] = ["HunyuanDiT2DModel"]
 _import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
 _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
 _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
-_import_structure["transformers.hunyuan_transformer_2d"] = ["HunyuanDiT2DModel"]
 _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
 _import_structure["unets.unet_1d"] = ["UNet1DModel"]
 _import_structure["unets.unet_2d"] = ["UNet2DModel"]

@@ -75,8 +75,8 @@
 from .embeddings import ImageProjection
 from .modeling_utils import ModelMixin
 from .transformers import (
-    HunyuanDiT2DModel,
     DualTransformer2DModel,
+    HunyuanDiT2DModel,
     PriorTransformer,
     T5FilmDecoder,
     Transformer2DModel,
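
With the two registrations above in place, the class should be importable both from the package root and from its module path. A minimal, illustrative check (assuming this branch is installed; the assertion only verifies that the lazy-import table and the direct module path resolve to the same object):

from diffusers import HunyuanDiT2DModel  # resolved lazily via _import_structure
from diffusers.models.transformers.hunyuan_transformer_2d import HunyuanDiT2DModel as HunyuanDiT2DModelDirect

assert HunyuanDiT2DModel is HunyuanDiT2DModelDirect  # both paths point at the same class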

src/diffusers/models/attention.py

Lines changed: 0 additions & 208 deletions
@@ -84,214 +84,6 @@ def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
 
         return x
 
-### TODO: XCLiu: some ugly helper functions, please clean later
-### ==== begin ====
-def modulate(x, shift, scale):
-    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-
-class FP32_Layernorm(nn.LayerNorm):
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        origin_dtype = inputs.dtype
-        return F.layer_norm(inputs.float(), self.normalized_shape, self.weight.float(), self.bias.float(),
-                            self.eps).to(origin_dtype)
-
-
-class FP32_SiLU(nn.SiLU):
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        return torch.nn.functional.silu(inputs.float(), inplace=False).to(inputs.dtype)
-
-from typing import Tuple, Union, Optional
-
-class HunyuanDiTAttentionPool(nn.Module):
-    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
-        super().__init__()
-        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim ** 0.5)
-        self.k_proj = nn.Linear(embed_dim, embed_dim)
-        self.q_proj = nn.Linear(embed_dim, embed_dim)
-        self.v_proj = nn.Linear(embed_dim, embed_dim)
-        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
-        self.num_heads = num_heads
-
-    def forward(self, x):
-        x = x.permute(1, 0, 2)  # NLC -> LNC
-        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (L+1)NC
-        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (L+1)NC
-        x, _ = F.multi_head_attention_forward(
-            query=x[:1], key=x, value=x,
-            embed_dim_to_check=x.shape[-1],
-            num_heads=self.num_heads,
-            q_proj_weight=self.q_proj.weight,
-            k_proj_weight=self.k_proj.weight,
-            v_proj_weight=self.v_proj.weight,
-            in_proj_weight=None,
-            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
-            bias_k=None,
-            bias_v=None,
-            add_zero_attn=False,
-            dropout_p=0,
-            out_proj_weight=self.c_proj.weight,
-            out_proj_bias=self.c_proj.bias,
-            use_separate_proj_weight=True,
-            training=self.training,
-            need_weights=False
-        )
-        return x.squeeze(0)
-### ==== end ====
-
-
-@maybe_allow_in_graph
-class HunyuanDiTBlock(nn.Module):
-    r"""
-    HunyuanDiT Transformer block. Allow skip connection and QKNorm
-    Parameters:
-        dim (`int`): The number of channels in the input and output.
-        num_attention_heads (`int`): The number of heads to use for multi-head attention.
-        attention_head_dim (`int`): The number of channels in each head.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        num_embeds_ada_norm (:
-            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
-        attention_bias (:
-            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
-        only_cross_attention (`bool`, *optional*):
-            Whether to use only cross-attention layers. In this case two cross attention layers are used.
-        double_self_attention (`bool`, *optional*):
-            Whether to use two self-attention layers. In this case no cross attention layers are used.
-        upcast_attention (`bool`, *optional*):
-            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
-        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
-            Whether to use learnable elementwise affine parameters for normalization.
-        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
-            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
-        final_dropout (`bool` *optional*, defaults to False):
-            Whether to apply a final dropout after the last feed-forward layer.
-        attention_type (`str`, *optional*, defaults to `"default"`):
-            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
-        positional_embeddings (`str`, *optional*, defaults to `None`):
-            The type of positional embeddings to apply to.
-        num_positional_embeddings (`int`, *optional*, defaults to `None`):
-            The maximum number of positional embeddings to apply.
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        text_dim: int = 1024,
-        dropout=0.0,
-        activation_fn: str = "geglu",
-        norm_elementwise_affine: bool = True,
-        norm_eps: float = 1e-6,
-        final_dropout: bool = False,
-        ff_inner_dim: Optional[int] = None,
-        ff_bias: bool = True,
-        skip: bool = False,
-        qk_norm: bool = True,
-    ):
-        super().__init__()
-
-        # Define 3 blocks. Each block has its own normalization layer.
-        # NOTE: when new version comes, chech norm2 and norm 3
-        # 1. Self-Attn
-        self.norm1 = FP32_Layernorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
-
-        from .attention_processor import HunyuanAttnProcessor2_0
-        self.attn1 = Attention(
-            query_dim=dim,
-            cross_attention_dim=dim,
-            dim_head=dim // num_attention_heads,
-            heads=num_attention_heads,
-            qk_norm="layer_norm" if qk_norm else None,
-            eps=1e-6,
-            bias=True,
-            processor=HunyuanAttnProcessor2_0(),
-        )
-
-        # 2. Cross-Attn
-        self.norm2 = FP32_Layernorm(dim, norm_eps, norm_elementwise_affine)
-
-        self.attn2 = Attention(
-            query_dim=dim,
-            cross_attention_dim=text_dim,
-            dim_head=dim // num_attention_heads,
-            heads=num_attention_heads,
-            qk_norm="layer_norm" if qk_norm else None,
-            eps=1e-6,
-            bias=True,
-            processor=HunyuanAttnProcessor2_0(),
-        )
-        # 3. Feed-forward
-        self.norm3 = FP32_Layernorm(dim, norm_eps, norm_elementwise_affine)
-
-        self.ff = FeedForward(
-            dim,
-            dropout=dropout,  ### 0.0
-            activation_fn=activation_fn,  ### approx GeLU
-            final_dropout=final_dropout,  ### 0.0
-            inner_dim=ff_inner_dim,  ### int(dim * mlp_ratio)
-            bias=ff_bias,
-        )
-
-        # 4. Skip Connection
-        if skip:
-            self.skip_norm = FP32_Layernorm(2 * dim, norm_eps, elementwise_affine=True)
-            self.skip_linear = nn.Linear(2 * dim, dim)
-        else:
-            self.skip_linear = None
-
-        # 5. SDXL-style modulation with add
-        self.default_modulation = nn.Sequential(
-            FP32_SiLU(),
-            nn.Linear(dim, dim, bias=True)
-        )
-
-        # let chunk size default to None
-        self._chunk_size = None
-        self._chunk_dim = 0
-
-    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
-        # Sets chunk feed-forward
-        self._chunk_size = chunk_size
-        self._chunk_dim = dim
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        timestep: Optional[torch.LongTensor] = None,
-        freq_cis_img=None,
-        skip=None
-    ) -> torch.Tensor:
-
-        # Notice that normalization is always applied before the real computation in the following blocks.
-        # 0. Long Skip Connection
-        if self.skip_linear is not None:
-            cat = torch.cat([hidden_states, skip], dim=-1)
-            cat = self.skip_norm(cat)
-            hidden_states = self.skip_linear(cat)
-
-        # 1. Self-Attention
-        norm_hidden_states = self.norm1(hidden_states)  ### checked: self.norm1 is correct
-        shift_msa = self.default_modulation(timestep).unsqueeze(dim=1)
-        attn_output = self.attn1(
-            norm_hidden_states + shift_msa,
-            temb=freq_cis_img,
-        )
-        hidden_states = hidden_states + attn_output
-
-        # 2. Cross-Attention
-        hidden_states = hidden_states + self.attn2(
-            self.norm2(hidden_states),
-            encoder_hidden_states=encoder_hidden_states,
-            temb=freq_cis_img,
-        )
-
-        # FFN Layer  ### TODO: switch norm2 and norm3 in the state dict
-        mlp_inputs = self.norm3(hidden_states)
-        hidden_states = hidden_states + self.ff(mlp_inputs)
-
-        return hidden_states
 
 @maybe_allow_in_graph
 class BasicTransformerBlock(nn.Module):
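
For readers following the move, the removed block boils down to the data flow below. This is a condensed, illustrative sketch of the forward pass shown above, not the relocated implementation itself (per the commit message, the real block now lives next to HunyuanDiT2DModel in the transformer model file); `block` stands for an instance of the block above.

import torch


def hunyuan_dit_block_flow(block, hidden_states, encoder_hidden_states, timestep_emb, rope, skip=None):
    # 0. Long skip connection: concatenate the skip branch, normalize, and project back to `dim`.
    if block.skip_linear is not None:
        hidden_states = block.skip_linear(block.skip_norm(torch.cat([hidden_states, skip], dim=-1)))

    # 1. Self-attention: additive (SDXL-style) timestep modulation on the normed tokens, RoPE passed via `temb`.
    shift_msa = block.default_modulation(timestep_emb).unsqueeze(1)
    hidden_states = hidden_states + block.attn1(block.norm1(hidden_states) + shift_msa, temb=rope)

    # 2. Cross-attention against the text encoder states.
    hidden_states = hidden_states + block.attn2(
        block.norm2(hidden_states), encoder_hidden_states=encoder_hidden_states, temb=rope
    )

    # 3. Feed-forward.
    return hidden_states + block.ff(block.norm3(hidden_states))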

src/diffusers/models/attention_processor.py

Lines changed: 9 additions & 94 deletions
@@ -161,7 +161,7 @@ def __init__(
             self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
         else:
             self.spatial_norm = None
-
+
         if qk_norm is None:
             self.norm_q = None
             self.norm_k = None
@@ -1435,6 +1435,7 @@
 
         return hidden_states
 
+
 class HunyuanAttnProcessor2_0:
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
@@ -1451,7 +1452,9 @@ def __call__(
         encoder_hidden_states: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         temb: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        from .embeddings import apply_rotary_emb
 
         residual = hidden_states
         if attn.spatial_norm is not None:
@@ -1478,10 +1481,8 @@ def __call__(
 
         query = attn.to_q(hidden_states)
 
-        apply_rotary_emb_on_key = False
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
-            apply_rotary_emb_on_key = True
         elif attn.norm_cross:
             encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
 
@@ -1502,16 +1503,10 @@ def __call__(
             key = attn.norm_k(key)
 
         # Apply RoPE if needed
-        if temb is not None:
-            if apply_rotary_emb_on_key:
-                qq, kk = apply_rotary_emb(query, key, temb, head_first=True)
-                assert qq.shape == query.shape and kk.shape == key.shape, \
-                    f'qq: {qq.shape}, q: {query.shape}, kk: {kk.shape}, key: {key.shape}'
-                query, key = qq, kk
-            else:
-                qq, _ = apply_rotary_emb(query, None, temb, head_first=True)
-                assert qq.shape == query.shape, f'qq: {qq.shape}, query: {query.shape}'
-                query = qq
+        if image_rotary_emb is not None:
+            query = apply_rotary_emb(query, image_rotary_emb)
+            if not attn.is_cross_attention:
+                key = apply_rotary_emb(key, image_rotary_emb)
 
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
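
The hunk above swaps the bundled query/key helper for an `apply_rotary_emb` imported from `.embeddings` and a single `image_rotary_emb` argument. That helper is not part of this diff; a minimal sketch of what it is assumed to do, following the real-valued `(cos, sin)` branch of the implementation removed at the end of this file, could look like this (the name `apply_rotary_emb_sketch` is illustrative, not the library's function):

from typing import Tuple

import torch


def apply_rotary_emb_sketch(x: torch.Tensor, freqs_cis: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
    # x: [batch, heads, seq, head_dim]; freqs_cis: (cos, sin), each [seq, head_dim].
    cos, sin = freqs_cis
    cos = cos[None, None].to(x.device)  # -> [1, 1, seq, head_dim] for broadcasting
    sin = sin[None, None].to(x.device)

    # "rotate half": treat (even, odd) channel pairs as 2D vectors and rotate each pair by its angle.
    x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(-2)

    return (x.float() * cos + x_rotated * sin).type_as(x)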
@@ -1537,6 +1532,7 @@ def __call__(
 
         return hidden_states
 
+
 class FusedAttnProcessor2_0:
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). It uses
@@ -2808,84 +2804,3 @@ def __call__(
     LoRAXFormersAttnProcessor,
     LoRAAttnAddedKVProcessor,
 ]
-
-from typing import Tuple
-
-def reshape_for_broadcast(freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], x: torch.Tensor, head_first=False):
-    """
-    Reshape frequency tensor for broadcasting it with another tensor.
-    This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
-    for the purpose of broadcasting the frequency tensor during element-wise operations.
-    Args:
-        freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
-        x (torch.Tensor): Target tensor for broadcasting compatibility.
-        head_first (bool): head dimension first (except batch dim) or not.
-    Returns:
-        torch.Tensor: Reshaped frequency tensor.
-    Raises:
-        AssertionError: If the frequency tensor doesn't match the expected shape.
-        AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
-    """
-    ndim = x.ndim
-    assert 0 <= 1 < ndim
-
-    if isinstance(freqs_cis, tuple):
-        # freqs_cis: (cos, sin) in real space
-        if head_first:
-            assert freqs_cis[0].shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
-            shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-        else:
-            assert freqs_cis[0].shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
-            shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-        return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
-    else:
-        # freqs_cis: values in complex space
-        if head_first:
-            assert freqs_cis.shape == (x.shape[-2], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
-            shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-        else:
-            assert freqs_cis.shape == (x.shape[1], x.shape[-1]), f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
-            shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
-        return freqs_cis.view(*shape)
-
-
-def rotate_half(x):
-    x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
-    return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-
-def apply_rotary_emb(
-    xq: torch.Tensor,
-    xk: Optional[torch.Tensor],
-    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
-    head_first: bool = False,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """
-    Apply rotary embeddings to input tensors using the given frequency tensor.
-    This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
-    frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
-    is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
-    returned as real tensors.
-    Args:
-        xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
-        xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D]
-        freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Precomputed frequency tensor for complex exponentials.
-        head_first (bool): head dimension first (except batch dim) or not.
-    Returns:
-        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
-    """
-    xk_out = None
-    if isinstance(freqs_cis, tuple):
-        cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first)  # [S, D]
-        cos, sin = cos.to(xq.device), sin.to(xq.device)
-        xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
-        if xk is not None:
-            xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
-    else:
-        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))  # [B, S, H, D//2]
-        freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(xq.device)  # [S, D//2] --> [1, S, 1, D//2]
-        xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
-        if xk is not None:
-            xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))  # [B, S, H, D//2]
-            xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
-
-    return xq_out, xk_out
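
For completeness, a toy end-to-end example of the `(cos, sin)` convention implied by the new code path. All sizes and the frequency construction here are illustrative, not the model's actual rotary schedule; `rotate` repeats the rotate-half convention from the sketch after the RoPE hunk above so this snippet runs on its own.

import torch


def rotate(x, cos, sin):
    # same rotate-half convention as the apply_rotary_emb sketch above
    x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
    x_rot = torch.stack([-x_imag, x_real], dim=-1).flatten(-2)
    return (x.float() * cos[None, None] + x_rot * sin[None, None]).type_as(x)


batch, heads, head_dim = 2, 8, 64
height = width = 4
seq = height * width  # image tokens on a flattened 2D grid

# Illustrative 2D frequencies: half the head dim rotates with the row index, half with the column index.
dim_quarter = head_dim // 4
inv_freq = 1.0 / (10000 ** (torch.arange(dim_quarter).float() / dim_quarter))
rows = torch.arange(height).repeat_interleave(width).float()
cols = torch.arange(width).repeat(height).float()
angles = torch.cat([torch.outer(rows, inv_freq), torch.outer(cols, inv_freq)], dim=-1)  # [seq, head_dim // 2]
angles = angles.repeat_interleave(2, dim=-1)  # each (even, odd) channel pair shares one angle -> [seq, head_dim]

image_rotary_emb = (angles.cos(), angles.sin())

query = torch.randn(batch, heads, seq, head_dim)
rotated = rotate(query, *image_rotary_emb)
assert rotated.shape == query.shape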
