 from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ..attention import AttentionMixin
-from ..attention_processor import Attention, AttentionProcessor
+from ..attention_processor import Attention
 from ..embeddings import get_timestep_embedding
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
@@ -78,6 +78,7 @@ def apply_rope(xq: Tensor, freqs_cis: Tensor) -> Tensor:
     xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
     return xq_out.reshape(*xq.shape).type_as(xq)

+
 class PhotonAttnProcessor2_0:
     r"""
     Processor for implementing Photon-style attention with multi-source tokens and RoPE. Properly integrates with
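
For reference, a minimal sketch of the rotation performed by the `apply_rope` lines in the hunk above. The pairing reshape and the `freqs_cis` layout (stacked 2x2 rotation matrices, as in the Flux-style RoPE this file references) are assumptions; only the two lines shown in the hunk are taken verbatim from the source.

import torch

def rope_rotation_sketch(xq: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    # Assumed shapes: xq is (..., dim); freqs_cis is (..., dim // 2, 2, 2), where each
    # trailing 2x2 block is [[cos, -sin], [sin, cos]] for one rotary frequency.
    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)  # group channels into (even, odd) pairs
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq)  # restore original shape and dtype
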
@@ -133,6 +134,8 @@ def __call__(
         attn_output = attn.to_out[1](attn_output)  # dropout if present

         return attn_output
+
+
 # copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
 class EmbedND(nn.Module):
     r"""
@@ -299,9 +302,8 @@ class PhotonBlock(nn.Module):
             Produces scale/shift/gating parameters for modulated layers.

     Methods:
-        attn_forward(img, txt, pe, modulation, spatial_conditioning=None, attention_mask=None):
-            Compute cross-attention between image and text tokens, with optional spatial conditioning and attention
-            masking.
+        attn_forward(img, txt, pe, modulation, attention_mask=None):
+            Compute cross-attention between image and text tokens, with optional attention masking.

     Parameters:
         img (`torch.Tensor`):
@@ -312,8 +314,6 @@ class PhotonBlock(nn.Module):
             Rotary positional embeddings to apply to queries and keys.
         modulation (`ModulationOut`):
             Scale and shift parameters for modulating image tokens.
-        spatial_conditioning (`torch.Tensor`, *optional*):
-            Extra conditioning tokens of shape `(B, L_cond, hidden_size)`.
         attention_mask (`torch.Tensor`, *optional*):
             Boolean mask of shape `(B, L_txt)` where 0 marks padding.

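
As an aside, a minimal sketch of how a boolean text mask of shape `(B, L_txt)` (0 marking padding) is commonly expanded for scaled dot-product attention when image tokens attend to text tokens. The exact masking inside PhotonBlock may differ; the tensor names are illustrative only.

import torch
import torch.nn.functional as F

def masked_cross_attention_sketch(q_img, k_txt, v_txt, text_mask):
    # q_img: (B, H, L_img, D); k_txt / v_txt: (B, H, L_txt, D); text_mask: (B, L_txt) bool
    attn_mask = text_mask[:, None, None, :]  # broadcast to (B, 1, 1, L_txt) over heads and queries
    return F.scaled_dot_product_attention(q_img, k_txt, v_txt, attn_mask=attn_mask)
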
@@ -372,7 +372,6 @@ def _attn_forward(
         txt: Tensor,
         pe: Tensor,
         modulation: ModulationOut,
-        spatial_conditioning: None | Tensor = None,
         attention_mask: None | Tensor = None,
     ) -> Tensor:
         # image tokens proj and norm
@@ -444,7 +443,6 @@ def forward(
         txt: Tensor,
         vec: Tensor,
         pe: Tensor,
-        spatial_conditioning: Tensor | None = None,
         attention_mask: Tensor | None = None,
         **_: dict[str, Any],
     ) -> Tensor:
@@ -461,9 +459,6 @@ def forward(
                 broadcastable).
             pe (`torch.Tensor`):
                 Rotary positional embeddings applied inside attention.
-            spatial_conditioning (`torch.Tensor`, *optional*):
-                Extra conditioning tokens of shape `(B, L_cond, hidden_size)`. Used only if spatial conditioning is
-                enabled in the block.
             attention_mask (`torch.Tensor`, *optional*):
                 Boolean mask for text tokens of shape `(B, L_txt)`, where `0` marks padding.
             **_:
@@ -481,7 +476,6 @@ def forward(
             txt,
             pe,
             mod_attn,
-            spatial_conditioning=spatial_conditioning,
             attention_mask=attention_mask,
         )
         img = img + mod_mlp.gate * self._ffn_forward(img, mod_mlp)
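
A minimal sketch of the gated-residual pattern used in the two lines above, where a `ModulationOut`-style triple (shift, scale, gate) derived from the conditioning vector modulates the normalized tokens and gates each sublayer's contribution. The field names and the placement of the normalization are assumptions, not taken from this file.

import torch

def modulated_residual_sketch(x, sublayer, norm, shift, scale, gate):
    # x: (B, L, D); shift / scale / gate broadcast over the sequence, e.g. (B, 1, D)
    h = norm(x) * (1 + scale) + shift  # scale/shift modulation of the normalized tokens
    return x + gate * sublayer(h)      # gate controls how strongly the sublayer output is added
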
@@ -698,14 +692,6 @@ def __init__(

         self.gradient_checkpointing = False

-    def _process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]:
-        txt = self.txt_in(txt)
-        img = img2seq(image_latent, self.patch_size)
-        bs, _, h, w = image_latent.shape
-        img_ids = get_image_ids(bs, h, w, patch_size=self.patch_size, device=image_latent.device)
-        pe = self.pe_embedder(img_ids)
-        return img, txt, pe
-
     def _compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor:
         return self.time_in(
             get_timestep_embedding(
@@ -717,43 +703,6 @@ def _compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> T
             ).to(dtype)
         )

-    def _forward_transformers(
-        self,
-        image_latent: Tensor,
-        cross_attn_conditioning: Tensor,
-        timestep: Optional[Tensor] = None,
-        time_embedding: Optional[Tensor] = None,
-        attention_mask: Optional[Tensor] = None,
-        **block_kwargs: Any,
-    ) -> Tensor:
-        img = self.img_in(image_latent)
-
-        if time_embedding is not None:
-            vec = time_embedding
-        else:
-            if timestep is None:
-                raise ValueError("Please provide either a timestep or a timestep_embedding")
-            vec = self._compute_timestep_embedding(timestep, dtype=img.dtype)
-
-        for block in self.blocks:
-            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                img = self._gradient_checkpointing_func(
-                    block.__call__,
-                    img,
-                    cross_attn_conditioning,
-                    vec,
-                    block_kwargs.get("pe"),
-                    block_kwargs.get("spatial_conditioning"),
-                    attention_mask,
-                )
-            else:
-                img = block(
-                    img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs
-                )
-
-        img = self.final_layer(img, vec)
-        return img
-
     def forward(
         self,
         image_latent: Tensor,
@@ -797,6 +746,7 @@ def forward(
             lora_scale = attention_kwargs.pop("scale", 1.0)
         else:
             lora_scale = 1.0
+
         if USE_PEFT_BACKEND:
             # weight the lora layers by setting `lora_scale` for each PEFT layer
             scale_lora_layers(self, lora_scale)
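
A minimal sketch of the scale/unscale bracket this forward pass wraps around the transformer when the PEFT backend is active. The caller-side `attention_kwargs={"scale": ...}` convention and the helper name below are assumptions for illustration; `scale_lora_layers` and `unscale_lora_layers` are the diffusers utilities imported at the top of the file.

from diffusers.utils import USE_PEFT_BACKEND, scale_lora_layers, unscale_lora_layers

def lora_scaled_call_sketch(model, lora_scale, run_model):
    # Temporarily weight every LoRA layer by `lora_scale`, run the model, then restore the weights.
    if USE_PEFT_BACKEND:
        scale_lora_layers(model, lora_scale)
    try:
        return run_model()
    finally:
        if USE_PEFT_BACKEND:
            unscale_lora_layers(model, lora_scale)
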
@@ -805,12 +755,50 @@ def forward(
                 logger.warning(
                     "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                 )
-        img_seq, txt, pe = self._process_inputs(image_latent, cross_attn_conditioning)
-        img_seq = self._forward_transformers(img_seq, txt, timestep, pe=pe, attention_mask=cross_attn_mask)
-        output = seq2img(img_seq, self.patch_size, image_latent.shape)
+
+        # Process text conditioning
+        txt = self.txt_in(cross_attn_conditioning)
+
+        # Convert image to sequence and embed
+        img = img2seq(image_latent, self.patch_size)
+        img = self.img_in(img)
+
+        # Generate positional embeddings
+        bs, _, h, w = image_latent.shape
+        img_ids = get_image_ids(bs, h, w, patch_size=self.patch_size, device=image_latent.device)
+        pe = self.pe_embedder(img_ids)
+
+        # Compute time embedding
+        vec = self._compute_timestep_embedding(timestep, dtype=img.dtype)
+
+        # Apply transformer blocks
+        for block in self.blocks:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                img = self._gradient_checkpointing_func(
+                    block.__call__,
+                    img,
+                    txt,
+                    vec,
+                    pe,
+                    cross_attn_mask,
+                )
+            else:
+                img = block(
+                    img=img,
+                    txt=txt,
+                    vec=vec,
+                    pe=pe,
+                    attention_mask=cross_attn_mask,
+                )
+
+        # Final layer and convert back to image
+        img = self.final_layer(img, vec)
+        output = seq2img(img, self.patch_size, image_latent.shape)
+
         if USE_PEFT_BACKEND:
             # remove `lora_scale` from each PEFT layer
             unscale_lora_layers(self, lora_scale)
+
         if not return_dict:
             return (output,)
         return Transformer2DModelOutput(sample=output)
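
For readers unfamiliar with the checkpointing branch above, a minimal sketch of what it effectively does per block. `_gradient_checkpointing_func` is wired up by the diffusers ModelMixin machinery; calling torch.utils.checkpoint directly and the `use_reentrant=False` flag are assumptions made only to show the recompute-on-backward behavior explicitly.

import torch
from torch.utils.checkpoint import checkpoint

def run_blocks_sketch(blocks, img, txt, vec, pe, cross_attn_mask, gradient_checkpointing):
    for block in blocks:
        if torch.is_grad_enabled() and gradient_checkpointing:
            # Drop intermediate activations now and recompute them during backward to save memory.
            img = checkpoint(block.__call__, img, txt, vec, pe, cross_attn_mask, use_reentrant=False)
        else:
            img = block(img=img, txt=txt, vec=vec, pe=pe, attention_mask=cross_attn_mask)
    return img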