Fix AnimateDiff-based tests

a-r-r-o-w · a-r-r-o-w · commit 4e9d60ac170f · 2024-02-11T03:45:41.000+05:30
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
@@ -905,7 +905,7 @@ def __call__(
         if self.do_classifier_free_guidance:
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
 
-        if ip_adapter_image is not None:
+        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
             image_embeds = self.prepare_ip_adapter_image_embeds(
                 ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
             )
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
@@ -62,7 +62,10 @@ def create_ip_adapter_state_dict(model):
     key_id = 1
 
     for name in model.attn_processors.keys():
-        cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
+        cross_attention_dim = (
+            None if name.endswith("attn1.processor") or "motion_module" in name else model.config.cross_attention_dim
+        )
+
         if name.startswith("mid_block"):
             hidden_size = model.config.block_out_channels[-1]
         elif name.startswith("up_blocks"):
@@ -71,6 +74,7 @@ def create_ip_adapter_state_dict(model):
         elif name.startswith("down_blocks"):
             block_id = int(name[len("down_blocks.")])
             hidden_size = model.config.block_out_channels[block_id]
+
         if cross_attention_dim is not None:
             sd = IPAdapterAttnProcessor(
                 hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
@@ -8,7 +8,7 @@
 import tempfile
 import unittest
 import uuid
-from typing import Callable, Union
+from typing import Any, Callable, Dict, Union
 
 import numpy as np
 import PIL.Image
@@ -85,46 +85,51 @@ def test_pipeline_signature(self):
     def _get_dummy_image_embeds(self, cross_attention_dim: int = 32):
         return torch.zeros((2, 1, cross_attention_dim), device=torch_device)
 
+    def _modify_inputs_for_ip_adapter_test(self, inputs: Dict[str, Any]):
+        inputs["output_type"] = "np"
+        inputs["return_dict"] = False
+        return inputs
+
     def test_ip_adapter(self, expected_max_diff: float = 1e-4):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components).to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         cross_attention_dim = pipe.unet.config.get("cross_attention_dim", 32)
 
         # forward pass without ip adapter
-        inputs = self.get_dummy_inputs(torch_device)
-        output_without_adapter = pipe(**inputs, return_dict=False)[0]
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
+        output_without_adapter = pipe(**inputs)[0]
 
         adapter_state_dict_1 = create_ip_adapter_state_dict(pipe.unet)
         adapter_state_dict_2 = create_ip_adapter_state_dict(pipe.unet)
 
         pipe.unet._load_ip_adapter_weights(adapter_state_dict_1)
 
         # forward pass with single ip adapter, but scale=0 which should have no effect
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
         inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         pipe.set_ip_adapter_scale(0.0)
-        output_without_adapter_scale = pipe(**inputs, return_dict=False)[0]
+        output_without_adapter_scale = pipe(**inputs)[0]
 
         # forward pass with single ip adapter, but with scale of adapter weights
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
         inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
         pipe.set_ip_adapter_scale(1.0)
-        output_with_adapter_scale = pipe(**inputs, return_dict=False)[0]
+        output_with_adapter_scale = pipe(**inputs)[0]
 
         pipe.unet._load_ip_adapter_weights([adapter_state_dict_1, adapter_state_dict_2])
 
         # forward pass with multi ip adapter, but scale=0 which should have no effect
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
         inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         pipe.set_ip_adapter_scale([0.0, 0.0])
-        output_without_multi_adapter_scale = pipe(**inputs, return_dict=False)[0]
+        output_without_multi_adapter_scale = pipe(**inputs)[0]
 
         # forward pass with multi ip adapter, but with scale of adapter weights
-        inputs = self.get_dummy_inputs(torch_device)
+        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
         inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
         pipe.set_ip_adapter_scale([0.5, 0.5])
-        output_with_multi_adapter_scale = pipe(**inputs, return_dict=False)[0]
+        output_with_multi_adapter_scale = pipe(**inputs)[0]
 
         max_diff_without_adapter_scale = np.abs(output_without_adapter_scale - output_without_adapter).max()
         max_diff_with_adapter_scale = np.abs(output_with_adapter_scale - output_without_adapter).max()

Original file line number	Diff line number	Diff line change
`@@ -905,7 +905,7 @@ def __call__(`
`905`	`905`	`if self.do_classifier_free_guidance:`
`906`	`906`	`prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])`
`907`	`907`
`908`		`- if ip_adapter_image is not None:`
	`908`	`+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:`
`909`	`909`	`image_embeds = self.prepare_ip_adapter_image_embeds(`
`910`	`910`	`ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt`
`911`	`911`	`)`