quic
diff --git a/QEfficient/diffusers/models/pytorch_transforms.py b/QEfficient/diffusers/models/pytorch_transforms.py
Lines changed: 0 additions & 10 deletions
diff --git a/QEfficient/diffusers/pipelines/flux/pipeline_flux.py b/QEfficient/diffusers/pipelines/flux/pipeline_flux.py
Lines changed: 36 additions & 45 deletions
diff --git a/QEfficient/diffusers/pipelines/pipeline_module.py b/QEfficient/diffusers/pipelines/pipeline_module.py
Lines changed: 10 additions & 46 deletions
@@ -35,7 +35,6 @@
     QEffFluxAttnProcessor,
     QEffFluxSingleTransformerBlock,
     QEffFluxTransformer2DModel,
-    QEffFluxTransformer2DModelOF,
     QEffFluxTransformerBlock,
 )
 
@@ -81,12 +80,3 @@ class NormalizationTransform(ModuleMappingTransform):
     def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
         model, transformed = super().apply(model)
         return model, transformed
-
-
-class OnnxFunctionTransform(ModuleMappingTransform):
-    _module_mapping = {QEffFluxTransformer2DModel, QEffFluxTransformer2DModelOF}
-
-    @classmethod
-    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
-        model, transformed = super().apply(model)
-        return model, transformed
@@ -7,7 +7,7 @@
 
 import os
 import time
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Callable, Dict, List, Optional, Union
 
 import numpy as np
 import torch
@@ -24,6 +24,7 @@
 from QEfficient.diffusers.pipelines.pipeline_utils import (
     ModulePerf,
     QEffPipelineOutput,
+    calculate_compressed_latent_dimension,
     compile_modules_parallel,
     compile_modules_sequential,
     config_manager,
@@ -47,21 +48,21 @@ class QEFFFluxPipeline(FluxPipeline):
 
     _hf_auto_class = FluxPipeline
 
-    def __init__(self, model, use_onnx_function: bool, *args, **kwargs):
+    def __init__(self, model, use_onnx_subfunctions: bool, *args, **kwargs):
         """
         Initialize the QEfficient Flux pipeline.
 
         Args:
             model: Pre-loaded FluxPipeline model
-            use_onnx_function (bool): Whether to export transformer blocks as ONNX functions
+            use_onnx_subfunctions (bool): Whether to export transformer blocks as ONNX functions
             **kwargs: Additional arguments including height and width
         """
+
         # Wrap model components with QEfficient optimized versions
         self.text_encoder = QEffTextEncoder(model.text_encoder)
         self.text_encoder_2 = QEffTextEncoder(model.text_encoder_2)
-        self.transformer = QEffFluxTransformerModel(model.transformer, use_onnx_function=use_onnx_function)
+        self.transformer = QEffFluxTransformerModel(model.transformer, use_onnx_subfunctions=use_onnx_subfunctions)
         self.vae_decode = QEffVAE(model, "decoder")
-        self.use_onnx_function = use_onnx_function
 
         # Store all modules in a dictionary for easy iteration during export/compile
         self.modules = {
@@ -78,10 +79,6 @@ def __init__(self, model, use_onnx_function: bool, *args, **kwargs):
         self.tokenizer_max_length = model.tokenizer_max_length
         self.scheduler = model.scheduler
 
-        # Set default image dimensions
-        self.height = kwargs.get("height", 256)
-        self.width = kwargs.get("width", 256)
-
         # Override VAE forward method to use decode directly
         self.vae_decode.model.forward = lambda latent_sample, return_dict: self.vae_decode.model.decode(
             latent_sample, return_dict
@@ -102,10 +99,6 @@ def __init__(self, model, use_onnx_function: bool, *args, **kwargs):
 
         # Calculate latent dimensions based on image size and VAE scale factor
         self.default_sample_size = 128
-        self.latent_height = self.height // self.vae_scale_factor
-        self.latent_width = self.width // self.vae_scale_factor
-        # cl = compressed latent dimension (divided by 4 for Flux's 2x2 packing)
-        self.cl = (self.latent_height * self.latent_width) // 4
 
         # Sync max position embeddings between text encoders
         self.text_encoder_2.model.config.max_position_embeddings = (
@@ -116,17 +109,15 @@ def __init__(self, model, use_onnx_function: bool, *args, **kwargs):
     def from_pretrained(
         cls,
         pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
-        use_onnx_function: bool = False,
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
+        use_onnx_subfunctions: bool = False,
         **kwargs,
     ):
         """
         Load a pretrained Flux model and wrap it with QEfficient optimizations.
 
         Args:
             pretrained_model_name_or_path (str or os.PathLike): HuggingFace model ID or local path
-            use_onnx_function (bool): Whether to export transformer blocks as ONNX functions
+            use_onnx_subfunctions (bool): Whether to export transformer blocks as ONNX functions
             height (int): Target image height (default: 512)
             width (int): Target image width (default: 512)
             **kwargs: Additional arguments passed to FluxPipeline.from_pretrained
@@ -144,10 +135,8 @@ def from_pretrained(
 
         return cls(
             model=model,
-            use_onnx_function=use_onnx_function,
+            use_onnx_subfunctions=use_onnx_subfunctions,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            height=height,
-            width=width,
             **kwargs,
         )
 
@@ -168,20 +157,12 @@ def export(self, export_dir: Optional[str] = None) -> str:
             # Get ONNX export configuration for this module
             example_inputs, dynamic_axes, output_names = module_obj.get_onnx_config()
 
-            export_kwargs = {}
-            # Special handling for transformer: export blocks as functions if enabled
-            if module_name == "transformer" and self.use_onnx_function:
-                export_kwargs = {
-                    "export_modules_as_functions": self.transformer.model._block_classes,
-                }
-
             # Export the module to ONNX
             module_obj.export(
                 inputs=example_inputs,
                 output_names=output_names,
                 dynamic_axes=dynamic_axes,
                 export_dir=export_dir,
-                export_kwargs=export_kwargs,
             )
 
     @staticmethod
@@ -194,7 +175,9 @@ def get_default_config_path() -> str:
         """
         return os.path.join(os.path.dirname(__file__), "flux_config.json")
 
-    def compile(self, compile_config: Optional[str] = None, parallel: bool = False) -> None:
+    def compile(
+        self, compile_config: Optional[str] = None, parallel: bool = False, height: int = 512, width: int = 512
+    ) -> None:
         """
         Compile ONNX models for deployment on Qualcomm AI hardware.
 
@@ -204,7 +187,7 @@ def compile(self, compile_config: Optional[str] = None, parallel: bool = False)
         Args:
             compile_config (str, optional): Path to JSON configuration file.
                                            If None, uses default configuration.
-            parallel (bool): If True, compile modules in parallel using ProcessPoolExecutor.
+            parallel (bool): If True, compile modules in parallel using ThreadPoolExecutor.
                            If False, compile sequentially (default: False).
         """
         # Ensure all modules are exported to ONNX before compilation
@@ -223,12 +206,15 @@ def compile(self, compile_config: Optional[str] = None, parallel: bool = False)
         if self.custom_config is None:
             config_manager(self, config_source=compile_config)
 
+        # Calculate compressed latent dimension using utility function
+        cl, latent_height, latent_width = calculate_compressed_latent_dimension(height, width, self.vae_scale_factor)
+
         # Prepare dynamic specialization updates based on image dimensions
         specialization_updates = {
-            "transformer": {"cl": self.cl},
+            "transformer": {"cl": cl},
             "vae_decoder": {
-                "latent_height": self.latent_height,
-                "latent_width": self.latent_width,
+                "latent_height": latent_height,
+                "latent_width": latent_width,
             },
         }
 
@@ -448,6 +434,8 @@ def encode_prompt(
 
     def __call__(
         self,
+        height: int = 512,
+        width: int = 512,
         prompt: Union[str, List[str]] = None,
         prompt_2: Optional[Union[str, List[str]]] = None,
         negative_prompt: Union[str, List[str]] = None,
@@ -464,8 +452,6 @@ def __call__(
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
@@ -513,19 +499,21 @@ def __call__(
         """
         device = "cpu"
 
+        if height is None or width is None:
+            logger.warning("Height or width is None. Setting default values of 512 for both dimensions.")
+
         # Step 1: Load configuration and compile models if needed
         if custom_config_path is not None:
             config_manager(self, custom_config_path)
             set_module_device_ids(self)
 
-        self.compile(compile_config=custom_config_path, parallel=parallel_compile)
-
+        self.compile(compile_config=custom_config_path, parallel=parallel_compile, height=height, width=width)
         # Validate all inputs
         self.check_inputs(
             prompt,
             prompt_2,
-            self.height,
-            self.width,
+            height,
+            width,
             negative_prompt=negative_prompt,
             negative_prompt_2=negative_prompt_2,
             prompt_embeds=prompt_embeds,
@@ -587,23 +575,26 @@ def __call__(
         latents, latent_image_ids = self.prepare_latents(
             batch_size * num_images_per_prompt,
             num_channels_latents,
-            self.height,
-            self.width,
+            height,
+            width,
             prompt_embeds.dtype,
             device,
             generator,
             latents,
         )
 
-        # Step 6: Initialize transformer inference session
+        # Step 6: Calculate compressed latent dimension for transformer buffer allocation
+        cl, _, _ = calculate_compressed_latent_dimension(height, width, self.vae_scale_factor)
+
+        # Initialize transformer inference session
         if self.transformer.qpc_session is None:
             self.transformer.qpc_session = QAICInferenceSession(
                 str(self.transformer.qpc_path), device_ids=self.transformer.device_ids
             )
 
         # Allocate output buffer for transformer
         output_buffer = {
-            "output": np.random.rand(batch_size, self.cl, self.transformer.model.config.in_channels).astype(np.float32),
+            "output": np.random.rand(batch_size, cl, self.transformer.model.config.in_channels).astype(np.float32),
         }
         self.transformer.qpc_session.set_buffers(output_buffer)
 
@@ -693,7 +684,7 @@ def __call__(
             image = latents
         else:
             # Unpack and denormalize latents
-            latents = self._unpack_latents(latents, self.height, self.width, self.vae_scale_factor)
+            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
             latents = (latents / self.vae_decode.model.scaling_factor) + self.vae_decode.model.shift_factor
 
             # Initialize VAE decoder inference session
@@ -703,7 +694,7 @@ def __call__(
                 )
 
             # Allocate output buffer for VAE decoder
-            output_buffer = {"sample": np.random.rand(batch_size, 3, self.height, self.width).astype(np.int32)}
+            output_buffer = {"sample": np.random.rand(batch_size, 3, height, width).astype(np.int32)}
             self.vae_decode.qpc_session.set_buffers(output_buffer)
 
             # Run VAE decoder inference and measure time
 
@@ -17,7 +17,10 @@
     AttentionTransform,
     CustomOpsTransform,
     NormalizationTransform,
-    OnnxFunctionTransform,
+)
+from QEfficient.diffusers.models.transformers.transformer_flux import (
+    QEffFluxSingleTransformerBlock,
+    QEffFluxTransformerBlock,
 )
 from QEfficient.transformers.models.pytorch_transforms import (
     T5ModelTransform,
@@ -377,28 +380,20 @@ class QEffFluxTransformerModel(QEFFBaseModel):
     _pytorch_transforms = [AttentionTransform, NormalizationTransform, CustomOpsTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
-    def __init__(self, model: nn.Module, use_onnx_function: bool) -> None:
+    def __init__(self, model: nn.Module, use_onnx_subfunctions: bool) -> None:
         """
         Initialize the Flux transformer wrapper.
 
         Args:
             model (nn.Module): The Flux transformer model to wrap
-            use_onnx_function (bool): Whether to export transformer blocks as ONNX functions
+            use_onnx_subfunctions (bool): Whether to export transformer blocks as ONNX functions
                                      for better modularity and potential optimization
         """
-
-        # Optionally apply ONNX function transform for modular export
-
-        if use_onnx_function:
-            model, _ = OnnxFunctionTransform.apply(model)
-
         super().__init__(model)
 
-        if use_onnx_function:
-            self._pytorch_transforms.append(OnnxFunctionTransform)
-
         # Ensure model is on CPU to avoid meta device issues
         self.model = model.to("cpu")
+        self.use_onnx_subfunctions = use_onnx_subfunctions
 
     def get_onnx_config(
         self, batch_size: int = 1, seq_length: int = 256, cl: int = 4096
@@ -423,17 +418,12 @@ def get_onnx_config(
         example_inputs = {
             # Latent representation of the image
             "hidden_states": torch.randn(batch_size, cl, self.model.config.in_channels, dtype=torch.float32),
-            # Text embeddings from T5 encoder
             "encoder_hidden_states": torch.randn(
                 batch_size, seq_length, self.model.config.joint_attention_dim, dtype=torch.float32
             ),
-            # Pooled text embeddings from CLIP encoder
             "pooled_projections": torch.randn(batch_size, self.model.config.pooled_projection_dim, dtype=torch.float32),
-            # Diffusion timestep (normalized to [0, 1])
             "timestep": torch.tensor([1.0], dtype=torch.float32),
-            # Position IDs for image patches
             "img_ids": torch.randn(cl, 3, dtype=torch.float32),
-            # Position IDs for text tokens
             "txt_ids": torch.randn(seq_length, 3, dtype=torch.float32),
             # AdaLN embeddings for dual transformer blocks
             # Shape: [num_layers, 12 chunks (6 for norm1 + 6 for norm1_context), hidden_dim]
@@ -490,6 +480,9 @@ def export(
         Returns:
             str: Path to the exported ONNX model
         """
+        if self.use_onnx_subfunctions:
+            export_kwargs = {"export_modules_as_functions": {QEffFluxTransformerBlock, QEffFluxSingleTransformerBlock}}
+
         return self._export(
             example_inputs=inputs,
             output_names=output_names,
@@ -498,35 +491,6 @@ def export(
             export_kwargs=export_kwargs,
         )
 
-    def get_specializations(self, batch_size: int, seq_len: int, cl: int) -> List[Dict]:
-        """
-        Generate specialization configuration for compilation.
-
-        Specializations define fixed values for certain dimensions to enable
-        compiler optimizations specific to the target use case.
-
-        Args:
-            batch_size (int): Batch size for inference
-            seq_len (int): Text sequence length
-            cl (int): Compressed latent dimension
-
-        Returns:
-            List[Dict]: Specialization configurations for the compiler
-        """
-        specializations = [
-            {
-                "batch_size": batch_size,
-                "stats-batchsize": batch_size,
-                "num_layers": self.model.config.num_layers,
-                "num_single_layers": self.model.config.num_single_layers,
-                "seq_len": seq_len,
-                "cl": cl,
-                "steps": 1,
-            }
-        ]
-
-        return specializations
-
     def compile(self, specializations: List[Dict], **compiler_options) -> None:
         """
         Compile the ONNX model for Qualcomm AI hardware.