diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py
index c61a1ee45b89..0db240a2855d 100644
--- a/tests/pipelines/animatediff/test_animatediff.py
+++ b/tests/pipelines/animatediff/test_animatediff.py
@@ -172,6 +172,12 @@ def test_ip_adapter_single(self):
         )
         return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.4051, 0.4495, 0.4480, 0.5845, 0.4172, 0.6066, 0.4205, 0.3786, 0.5323])
+        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
     def test_inference_batch_single_identical(
         self,
         batch_size=2,
diff --git a/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py b/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py
index fe4c9daf4917..99a238caf53a 100644
--- a/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py
+++ b/tests/pipelines/controlnet/test_controlnet_blip_diffusion.py
@@ -28,7 +28,7 @@
     PNDMScheduler,
     UNet2DConditionModel,
 )
-from diffusers.utils.testing_utils import enable_full_determinism
+from diffusers.utils.testing_utils import enable_full_determinism, torch_device
 from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
 from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
 from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
@@ -196,6 +196,12 @@ def get_dummy_inputs(self, device, seed=0):
         }
         return inputs
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.4803, 0.3865, 0.1422, 0.6119, 0.2283, 0.6365, 0.5453, 0.5205, 0.3581])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
     def test_blipdiffusion_controlnet(self):
         device = "cpu"
         components = self.get_dummy_components()
diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py
index 970247d249c8..a7423bebd939 100644
--- a/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py
+++ b/tests/pipelines/controlnet/test_controlnet_inpaint_sdxl.py
@@ -57,6 +57,7 @@ class ControlNetPipelineSDXLFastTests(
     image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
 
     def get_dummy_components(self):
+        torch.manual_seed(0)
         unet = UNet2DConditionModel(
             block_out_channels=(32, 64),
             layers_per_block=2,
@@ -74,6 +75,7 @@ def get_dummy_components(self):
             projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
             cross_attention_dim=64,
         )
+        torch.manual_seed(0)
         controlnet = ControlNetModel(
             block_out_channels=(32, 64),
             layers_per_block=2,
@@ -123,6 +125,7 @@ def get_dummy_components(self):
         text_encoder = CLIPTextModel(text_encoder_config)
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
+        torch.manual_seed(0)
         text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
         tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
@@ -182,6 +185,12 @@ def get_dummy_inputs(self, device, seed=0, img_res=64):
     def test_attention_slicing_forward_pass(self):
         return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5490, 0.5053, 0.4676, 0.5816, 0.5364, 0.4830, 0.5937, 0.5719, 0.4318])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
         reason="XFormers attention is only available with CUDA and `xformers` installed",
@@ -289,9 +298,7 @@ def test_controlnet_sdxl_guess(self):
         output = sd_pipe(**inputs)
         image_slice = output.images[0, -3:, -3:, -1]
 
-        expected_slice = np.array(
-            [0.5381963, 0.4836803, 0.45821992, 0.5577731, 0.51210403, 0.4794795, 0.59282357, 0.5647199, 0.43100584]
-        )
+        expected_slice = np.array([0.549, 0.5053, 0.4676, 0.5816, 0.5364, 0.483, 0.5937, 0.5719, 0.4318])
 
         # make sure that it's equal
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
diff --git a/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py b/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
index 0e8ebc978e15..8c817df32e0c 100644
--- a/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
+++ b/tests/pipelines/kandinsky3/test_kandinsky3_img2img.py
@@ -36,6 +36,7 @@
     load_image,
     require_torch_gpu,
     slow,
+    torch_device,
 )
 
 from ..pipeline_params import (
@@ -152,6 +153,12 @@ def get_dummy_inputs(self, device, seed=0):
         }
         return inputs
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5762, 0.6112, 0.4150, 0.6018, 0.6167, 0.4626, 0.5426, 0.5641, 0.6536])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
     def test_kandinsky3_img2img(self):
         device = "cpu"
 
diff --git a/tests/pipelines/pia/test_pia.py b/tests/pipelines/pia/test_pia.py
index 3a89452585fb..4150903ac0b9 100644
--- a/tests/pipelines/pia/test_pia.py
+++ b/tests/pipelines/pia/test_pia.py
@@ -175,6 +175,12 @@ def test_ip_adapter_single(self):
         )
         return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.3740, 0.4284, 0.4038, 0.5417, 0.4405, 0.5521, 0.4273, 0.4124, 0.4997])
+        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
     @unittest.skip("Attention slicing is not enabled in this pipeline")
     def test_attention_slicing_forward_pass(self):
         pass
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
index 22548cd0eff2..2d6182ce472a 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
@@ -32,6 +32,7 @@
     numpy_cosine_similarity_distance,
     require_torch_gpu,
     skip_mps,
+    torch_device,
 )
 
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -153,6 +154,12 @@ def get_dummy_inputs(self, device, seed=0):
         }
         return inputs
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.6391, 0.6290, 0.4860, 0.5134, 0.5550, 0.4577, 0.5033, 0.5023, 0.4538])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice, expected_max_difference=3e-3)
+
     def test_inference(self):
         device = "cpu"
 
@@ -182,9 +189,6 @@ def test_inference_batch_consistent(self):
     def test_inference_batch_single_identical(self):
         self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=7e-4)
 
-    def test_dict_tuple_outputs_equivalent(self):
-        super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
-
     def test_pt_np_pil_outputs_equivalent(self):
         super().test_pt_np_pil_outputs_equivalent(expected_max_diff=5e-4)
 
diff --git a/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py b/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py
index f9f8b044a916..748702541b1e 100644
--- a/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py
+++ b/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py
@@ -35,7 +35,7 @@
 )
 from diffusers.pipelines.stable_diffusion import CLIPImageProjection
 from diffusers.utils import load_image
-from diffusers.utils.testing_utils import enable_full_determinism
+from diffusers.utils.testing_utils import enable_full_determinism, torch_device
 
 from ..pipeline_params import (
     TEXT_TO_IMAGE_BATCH_PARAMS,
@@ -160,6 +160,12 @@ def get_dummy_inputs(self, device, seed=0):
         }
         return inputs
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5052, 0.5546, 0.4567, 0.4770, 0.5195, 0.4085, 0.5026, 0.4909, 0.4495])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
     def test_stable_diffusion_gligen_text_image_default_case(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 35574faa3185..f0e6818bfc2b 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -1037,7 +1037,7 @@ def _test_inference_batch_single_identical(
         max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
         assert max_diff < expected_max_diff
 
-    def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
+    def test_dict_tuple_outputs_equivalent(self, expected_slice=None, expected_max_difference=1e-4):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
         for component in pipe.components.values():
@@ -1048,10 +1048,21 @@ def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
         pipe.set_progress_bar_config(disable=None)
 
         generator_device = "cpu"
-        output = pipe(**self.get_dummy_inputs(generator_device))[0]
+        if expected_slice is None:
+            output = pipe(**self.get_dummy_inputs(generator_device))[0]
+        else:
+            output = expected_slice
+
         output_tuple = pipe(**self.get_dummy_inputs(generator_device), return_dict=False)[0]
 
-        max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
+        if expected_slice is None:
+            max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
+        else:
+            if output_tuple.ndim != 5:
+                max_diff = np.abs(to_np(output) - to_np(output_tuple)[0, -3:, -3:, -1].flatten()).max()
+            else:
+                max_diff = np.abs(to_np(output) - to_np(output_tuple)[0, -3:, -3:, -1, -1].flatten()).max()
+
         self.assertLess(max_diff, expected_max_difference)
 
     def test_components_function(self):
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
index 2c170850862f..79e3a7f9b736 100644
--- a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -133,6 +133,12 @@ def get_dummy_inputs(self, device, seed=0):
         }
         return inputs
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.4903, 0.5649, 0.5504, 0.5179, 0.4821, 0.5466, 0.4131, 0.5052, 0.5077])
+        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
     def test_text_to_video_default_case(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         components = self.get_dummy_components()
diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py
index 561b82aafbda..2e0ba1cfb8eb 100644
--- a/tests/pipelines/unidiffuser/test_unidiffuser.py
+++ b/tests/pipelines/unidiffuser/test_unidiffuser.py
@@ -206,6 +206,12 @@ def get_dummy_inputs_with_latents(self, device, seed=0):
         }
         return inputs
 
+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.7489, 0.3722, 0.4475, 0.5630, 0.5923, 0.4992, 0.3936, 0.5844, 0.4975])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
     def test_unidiffuser_default_joint_v0(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
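
For context, a minimal standalone sketch of the slicing convention the updated base test relies on: the hard-coded expected_slice values above are nine-element flattened corner slices of the tuple output, with the ndim check distinguishing 4-D image outputs (batch, height, width, channels) from 5-D video outputs (batch, frames, height, width, channels). The corner_slice helper name and dummy shapes below are illustrative only, not part of the patch:

import numpy as np

def corner_slice(output_tuple: np.ndarray) -> np.ndarray:
    # Mirrors the indexing added to test_dict_tuple_outputs_equivalent:
    if output_tuple.ndim != 5:
        # image: last 3 rows, last 3 columns, last channel -> (3, 3) block
        return output_tuple[0, -3:, -3:, -1].flatten()
    # video: last 3 frames, last 3 rows, last column, last channel -> (3, 3) block
    return output_tuple[0, -3:, -3:, -1, -1].flatten()

image_output = np.random.rand(1, 64, 64, 3)     # dummy image pipeline output
video_output = np.random.rand(1, 8, 64, 64, 3)  # dummy video pipeline output
assert corner_slice(image_output).shape == (9,)  # comparable to expected_slice
assert corner_slice(video_output).shape == (9,)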