diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index 4d4845c5a0a3..0311df1d7f7a 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -131,8 +131,7 @@
         >>> prompt = "A robot, 4k photo"
         >>> image = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... ).resize((1024, 1024))
         >>> controlnet_conditioning_scale = 0.5  # recommended for good generalization
         >>> depth_image = get_depth_map(image)
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py
index cd5a734cc311..763f41373a73 100644
--- a/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py
+++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py
@@ -88,9 +88,7 @@ def __init__(self, *args, **kwargs):
         >>> prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
         >>> video = load_video(
         ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
-        ... )[
-        ...     :21
-        ... ]  # This example uses only the first 21 frames
+        ... )[:21]  # This example uses only the first 21 frames
         >>> video = pipe(video=video, prompt=prompt).frames[0]
         >>> export_to_video(video, "output.mp4", fps=30)
diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
index 7286bcbee17b..a5952daad420 100644
--- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
+++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py
@@ -98,7 +98,7 @@
         negative_prompt = "low quality, bad quality"

         original_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         )

         mask = np.zeros((768, 768), dtype=np.float32)
diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py
index f5e41d499dc3..6469b970e8b5 100644
--- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py
+++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py
@@ -60,8 +60,7 @@
         >>> pipe.to("cuda")

         >>> init_image = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/frog.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/frog.png"
         ... )

         >>> image = pipe(
diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py
index 731fce499859..521e6fd8f493 100644
--- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py
+++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py
@@ -66,8 +66,7 @@
         >>> pipe.to("cuda")

         >>> init_image = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... )

         >>> mask = np.zeros((768, 768), dtype=np.float32)
diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py
index 10ea8005c90d..eca3d2317392 100644
--- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py
+++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py
@@ -88,8 +88,7 @@
         >>> pipe_prior.to("cuda")

         >>> img1 = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... )

         >>> img2 = load_image(
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
index fc2083247bb0..2c44408f299d 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py
@@ -92,7 +92,7 @@
         negative_prompt = "low quality, bad quality"

         original_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         )

         mask = np.zeros((768, 768), dtype=np.float32)
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py
index c5faae82796b..c14790d61d19 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py
@@ -71,8 +71,7 @@
         >>> img = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... ).resize((768, 768))

         >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda")
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py
index 54154c6ec1f2..662b81c311d8 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py
@@ -72,8 +72,7 @@
         >>> pipe = pipe.to("cuda")

         >>> img = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... ).resize((768, 768))
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py
index 3b2509098fd1..0d7e118cee23 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py
@@ -56,8 +56,7 @@
         >>> pipe.to("cuda")

         >>> init_image = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/frog.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/frog.png"
         ... )

         >>> image = pipe(
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py
index a61673293e1f..741c5622cd1e 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py
@@ -62,8 +62,7 @@
         >>> pipe.to("cuda")

         >>> init_image = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... )

         >>> mask = np.zeros((768, 768), dtype=np.float32)
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py
index bc67847831a5..aae892f57136 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py
@@ -60,8 +60,7 @@
         ... )
         >>> pipe_prior.to("cuda")
         >>> img1 = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... )
         >>> img2 = load_image(
         ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py
index b586d166118b..7e3bee808d0c 100644
--- a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py
@@ -35,8 +35,7 @@
         >>> prompt = "red cat, 4k photo"
         >>> img = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... )
         >>> image_emb, nagative_image_emb = pipe_prior(prompt, image=img, strength=0.2).to_tuple()
@@ -73,8 +72,7 @@
         >>> pipe_prior.to("cuda")
         >>> img1 = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... )
         >>> img2 = load_image(
diff --git a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
index a6df1b22c8b9..de956f2c0f2c 100644
--- a/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/pag/pipeline_pag_controlnet_sd_xl_img2img.py
@@ -132,8 +132,7 @@
         >>> prompt = "A robot, 4k photo"
         >>> image = load_image(
-        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-        ...     "/kandinsky/cat.png"
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
         ... ).resize((1024, 1024))
         >>> controlnet_conditioning_scale = 0.5  # recommended for good generalization
         >>> depth_image = get_depth_map(image)
diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
index ec203edf166c..2347e1f76574 100644
--- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
+++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py
@@ -226,6 +226,25 @@ def _extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor
         return split_result

+    def _sanitize_images(self, image):
+        """
+        Recursively unwraps tuples or lists to find valid PIL Images or Tensors. Solves the issue where a trailing
+        comma after `load_image(...)` creates nested tuples like `((Image,),)`.
+        """
+        if isinstance(image, (list, tuple)):
+            # Unwrap any nested tuples/lists around each entry
+            unwrapped_images = []
+            for img in image:
+                while isinstance(img, (list, tuple)):
+                    img = img[0]
+                unwrapped_images.append(img)
+            return unwrapped_images
+
+        # Handle a single input that might be wrapped
+        while isinstance(image, (list, tuple)):
+            image = image[0]
+        return [image]
+
     def _get_qwen_prompt_embeds(
         self,
         prompt: Union[str, List[str]] = None,
@@ -236,22 +255,32 @@ def _get_qwen_prompt_embeds(
         device = device or self._execution_device
         dtype = dtype or self.text_encoder.dtype

+        # Ensure prompt is a list
         prompt = [prompt] if isinstance(prompt, str) else prompt
+
+        # Ensure image is a list matching the prompt length.
+        # This is critical for the processor to map prompt index 0 -> image index 0.
+        if image is not None:
+            if not isinstance(image, list):
+                image = [image]
+
         img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
-        if isinstance(image, list):
+
+        # Logic to handle multiple images per SINGLE prompt
+        if isinstance(image, list) and len(image) > len(prompt):
             base_img_prompt = ""
             for i, img in enumerate(image):
                 base_img_prompt += img_prompt_template.format(i + 1)
-        elif image is not None:
-            base_img_prompt = img_prompt_template.format(1)
         else:
-            base_img_prompt = ""
+            base_img_prompt = img_prompt_template.format(1)

         template = self.prompt_template_encode
-        drop_idx = self.prompt_template_encode_start_idx
+
+        # Format each prompt with the image placeholder prefix
         txt = [template.format(base_img_prompt + e) for e in prompt]

+        # Ensure the processor receives lists for both text and images
         model_inputs = self.processor(
             text=txt,
             images=image,
@@ -283,7 +312,6 @@ def _get_qwen_prompt_embeds(

         return prompt_embeds, encoder_attention_mask

-    # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.encode_prompt
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
@@ -294,34 +322,59 @@ def encode_prompt(
         prompt_embeds_mask: Optional[torch.Tensor] = None,
         max_sequence_length: int = 1024,
     ):
-        r"""
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            image (`torch.Tensor`, *optional*):
-                image to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-        """
         device = device or self._execution_device

-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
+        # [FIX]: Robust Batch Handling Loop
+        if isinstance(prompt, list) and len(prompt) > 1:
+            prompt_embeds_list = []
+            mask_list = []
+
+            # Normalize images to a list matching the prompt length
+            if isinstance(image, list):
+                current_images = image
+            else:
+                current_images = [image] * len(prompt)
+
+            for i, single_prompt in enumerate(prompt):
+                single_image = current_images[i] if i < len(current_images) else current_images[0]
+
+                # Pass single items, so the processor sees
+                # text=["..."] and images=[img] (1-to-1).
+                pe, pem = self._get_qwen_prompt_embeds(single_prompt, image=single_image, device=device)
+                prompt_embeds_list.append(pe)
+                mask_list.append(pem)
+
+            # Pad embeddings to the maximum length in the batch
+            max_len = max([p.shape[1] for p in prompt_embeds_list])
+
+            padded_embeds = []
+            padded_masks = []

-        if prompt_embeds is None:
-            prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, image, device)
+            for pe, pem in zip(prompt_embeds_list, mask_list):
+                cur_len = pe.shape[1]
+                pad_len = max_len - cur_len

-        _, seq_len, _ = prompt_embeds.shape
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-        prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len)
+                if pad_len > 0:
+                    pe = torch.nn.functional.pad(pe, (0, 0, 0, pad_len))
+                    pem = torch.nn.functional.pad(pem, (0, pad_len))
+
+                padded_embeds.append(pe)
+                padded_masks.append(pem)
+
+            prompt_embeds = torch.cat(padded_embeds, dim=0)
+            prompt_embeds_mask = torch.cat(padded_masks, dim=0)
+
+        else:
+            # Standard path for single prompt
+            prompt = [prompt] if isinstance(prompt, str) else prompt
+
+            if prompt_embeds is None:
+                prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, image, device)
+
+            if num_images_per_prompt > 1:
+                prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+                if prompt_embeds_mask is not None:
+                    prompt_embeds_mask = prompt_embeds_mask.repeat_interleave(num_images_per_prompt, dim=0)

         return prompt_embeds, prompt_embeds_mask

@@ -627,7 +680,27 @@ def __call__(
                 [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
                 returning a tuple, the first element is a list with the generated images.
         """
-        image_size = image[-1].size if isinstance(image, list) else image.size
+        # [Fix] Robustly determine image size (Handles Lists, Tensors, and PIL)
+        if image is not None:
+            image = self._sanitize_images(image)
+
+        if isinstance(image, list):
+            # Grab the first valid image to determine dimensions
+            check_img = image[0]
+            # Handle potential nested lists (e.g. if the batching logic gets complex)
+            while isinstance(check_img, (list, tuple)):
+                check_img = check_img[0]
+
+            if isinstance(check_img, torch.Tensor):
+                # Tensor shape is usually (C, H, W) or (B, C, H, W) -> take the last two dims
+                image_size = (check_img.shape[-1], check_img.shape[-2])
+            else:
+                image_size = check_img.size
+        elif isinstance(image, torch.Tensor):
+            image_size = (image.shape[-1], image.shape[-2])
+        else:
+            image_size = image.size
+
         calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_size[0] / image_size[1])
         height = height or calculated_height
         width = width or calculated_width
@@ -666,22 +739,72 @@ def __call__(
         device = self._execution_device

         # 3. Preprocess image
         if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
-            if not isinstance(image, list):
-                image = [image]
+            image = self._sanitize_images(image)
+
             condition_image_sizes = []
             condition_images = []
             vae_image_sizes = []
             vae_images = []
+
+            # We first calculate what size each image WANTS to be (preserving aspect ratio)
+            ideal_sizes = []
             for img in image:
-                image_width, image_height = img.size
-                condition_width, condition_height = calculate_dimensions(
-                    CONDITION_IMAGE_SIZE, image_width / image_height
-                )
-                vae_width, vae_height = calculate_dimensions(VAE_IMAGE_SIZE, image_width / image_height)
-                condition_image_sizes.append((condition_width, condition_height))
-                vae_image_sizes.append((vae_width, vae_height))
-                condition_images.append(self.image_processor.resize(img, condition_height, condition_width))
-                vae_images.append(self.image_processor.preprocess(img, vae_height, vae_width).unsqueeze(2))
+                w, h = img.size
+                # vw, vh = calculate_dimensions(VAE_IMAGE_SIZE, w / h)
+                vw = (
+                    round(w / 32) * 32
+                )  # uncomment the line above and change w -> vw, h -> vh here to upscale everything to 1024
+                vh = round(h / 32) * 32
+                ideal_sizes.append((vw, vh))
+
+            # If set(ideal_sizes) has length 1, all images share the same target size
+            all_same_size = len(set(ideal_sizes)) == 1
+
+            # Default target is 1024 (Standard Qwen)
+            force_tgt_w, force_tgt_h = 1024, 1024
+
+            for idx, img in enumerate(image):
+                w, h = img.size
+
+                # Condition Image (Always keeps aspect ratio)
+                cw, ch = calculate_dimensions(CONDITION_IMAGE_SIZE, w / h)
+                condition_image_sizes.append((cw, ch))
+                condition_images.append(self.image_processor.resize(img, ch, cw))
+
+                # VAE SIZE LOGIC
+                if height is not None and width is not None:
+                    # Priority 1: User specified strict size; Force it
+                    vw, vh = width, height
+
+                elif all_same_size:
+                    # Priority 2: Batch is uniform; Keep "Ideal" (Preserves Aspect Ratio)
+                    vw, vh = ideal_sizes[idx]
+
+                else:
+                    # Priority 3: Batch is mixed (Portrait + Landscape); Force Square
+                    vw, vh = force_tgt_w, force_tgt_h
+
+                # Ensure divisible by 32
+                vw = round(vw / 32) * 32
+                vh = round(vh / 32) * 32
+
+                vae_image_sizes.append((vw, vh))
+
+                # If Mixed Batch: stretch them to 1024x1024
+                # If Uniform Batch: preserve aspect ratio
+                vae_images.append(self.image_processor.preprocess(img, vh, vw).unsqueeze(2))
+
+            # Batching Logic
+            if isinstance(prompt, list) and len(prompt) > 1 and len(vae_images) == len(prompt):
+                batch_tensor = torch.cat(vae_images, dim=0)
+                vae_images = [batch_tensor]
+
+            # Update global metadata for pipeline
+            if height is None or width is None:
+                if all_same_size:
+                    width, height = vae_image_sizes[0]
+                else:
+                    width, height = force_tgt_w, force_tgt_h

         has_neg_prompt = negative_prompt is not None or (
             negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
@@ -730,15 +853,16 @@ def __call__(
             generator,
             latents,
         )
-        img_shapes = [
-            [
+        img_shapes = []
+        for i in range(batch_size):
+            # Safe access to size
+            vw, vh = vae_image_sizes[i] if i < len(vae_image_sizes) else vae_image_sizes[0]
+
+            shape_entry = [
                 (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
-                *[
-                    (1, vae_height // self.vae_scale_factor // 2, vae_width // self.vae_scale_factor // 2)
-                    for vae_width, vae_height in vae_image_sizes
-                ],
+                (1, vh // self.vae_scale_factor // 2, vw // self.vae_scale_factor // 2),
             ]
-        ] * batch_size
+            img_shapes.append(shape_entry)

         # 5. Prepare timesteps
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
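The per-prompt padding that the new `encode_prompt` branch relies on can be checked in isolation. A minimal sketch, assuming two embeddings of different sequence lengths and an illustrative hidden size of 3584 (the real sizes depend on the text encoder and prompts):

    import torch
    import torch.nn.functional as F

    # Two per-prompt embeddings of different lengths, mimicking what the
    # per-prompt loop over _get_qwen_prompt_embeds produces (shapes illustrative).
    pe_a, mask_a = torch.randn(1, 18, 3584), torch.ones(1, 18, dtype=torch.long)
    pe_b, mask_b = torch.randn(1, 25, 3584), torch.ones(1, 25, dtype=torch.long)

    max_len = max(pe_a.shape[1], pe_b.shape[1])
    padded_embeds, padded_masks = [], []
    for pe, pem in ((pe_a, mask_a), (pe_b, mask_b)):
        pad_len = max_len - pe.shape[1]
        padded_embeds.append(F.pad(pe, (0, 0, 0, pad_len)))  # right-pad the sequence dimension
        padded_masks.append(F.pad(pem, (0, pad_len)))  # padded positions get mask value 0
    prompt_embeds = torch.cat(padded_embeds, dim=0)  # -> (2, 25, 3584)
    prompt_embeds_mask = torch.cat(padded_masks, dim=0)  # -> (2, 25)

The attention mask is padded with zeros, so the extra positions are ignored downstream even though the concatenated embeddings share one sequence length.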
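A minimal usage sketch of the batched call path this patch is meant to enable; the checkpoint name, prompts, and generation settings below are assumptions for illustration only and are not part of the patch:

    import torch
    from diffusers import QwenImageEditPlusPipeline
    from diffusers.utils import load_image

    # Checkpoint name is an assumption for illustration only.
    pipe = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16)
    pipe.to("cuda")

    cat = load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/cat.png"
    )
    frog = load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky/frog.png"
    )

    # One prompt per image: each pair is encoded separately, the embeddings are
    # padded to a common length, and mixed-size batches fall back to the
    # 1024x1024 square target from the VAE sizing logic above.
    images = pipe(
        image=[cat, frog],
        prompt=["turn the cat into a robot", "turn the frog into a robot"],
        num_inference_steps=30,
        generator=torch.Generator(device="cuda").manual_seed(0),
    ).images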