Skip to content
Merged
36 changes: 16 additions & 20 deletions tests/pipelines/stable_diffusion/test_stable_diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ class StableDiffusionPipelineFastTests(
callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS

def get_dummy_components(self, time_cond_proj_dim=None):
cross_attention_dim = 8

torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(4, 8),
Expand All @@ -134,7 +136,7 @@ def get_dummy_components(self, time_cond_proj_dim=None):
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
cross_attention_dim=cross_attention_dim,
norm_num_groups=2,
)
scheduler = DDIMScheduler(
Expand All @@ -158,11 +160,11 @@ def get_dummy_components(self, time_cond_proj_dim=None):
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=64,
hidden_size=cross_attention_dim,
intermediate_size=16,
layer_norm_eps=1e-05,
num_attention_heads=8,
num_hidden_layers=3,
num_attention_heads=2,
num_hidden_layers=2,
pad_token_id=1,
vocab_size=1000,
)
Expand Down Expand Up @@ -210,7 +212,7 @@ def test_stable_diffusion_ddim(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.3203, 0.4555, 0.4711, 0.3505, 0.3973, 0.4650, 0.5137, 0.3392, 0.4045])
expected_slice = np.array([0.1763, 0.4776, 0.4986, 0.2566, 0.3802, 0.4596, 0.5363, 0.3277, 0.3949])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand All @@ -230,7 +232,7 @@ def test_stable_diffusion_lcm(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.3454, 0.5349, 0.5185, 0.2808, 0.4509, 0.4612, 0.4655, 0.3601, 0.4315])
expected_slice = np.array([0.2368, 0.4900, 0.5019, 0.2723, 0.4473, 0.4578, 0.4551, 0.3532, 0.4133])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand All @@ -252,7 +254,7 @@ def test_stable_diffusion_lcm_custom_timesteps(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.3454, 0.5349, 0.5185, 0.2808, 0.4509, 0.4612, 0.4655, 0.3601, 0.4315])
expected_slice = np.array([0.2368, 0.4900, 0.5019, 0.2723, 0.4473, 0.4578, 0.4551, 0.3532, 0.4133])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand Down Expand Up @@ -371,12 +373,6 @@ def test_stable_diffusion_prompt_embeds_with_plain_negative_prompt_list(self):

assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

def test_ip_adapter_single(self):
expected_pipe_slice = None
if torch_device == "cpu":
expected_pipe_slice = np.array([0.3203, 0.4555, 0.4711, 0.3505, 0.3973, 0.4650, 0.5137, 0.3392, 0.4045])
return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
Comment on lines -374 to -378
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no point in keeping this override: all it does is pin an expected output slice for the CPU run, and because we are reducing the dummy model sizes dramatically, that pinned slice won't contribute much. Let me know if that doesn't make sense.


def test_stable_diffusion_ddim_factor_8(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator

Expand All @@ -392,7 +388,7 @@ def test_stable_diffusion_ddim_factor_8(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 136, 136, 3)
expected_slice = np.array([0.4346, 0.5621, 0.5016, 0.3926, 0.4533, 0.4134, 0.5625, 0.5632, 0.5265])
expected_slice = np.array([0.4720, 0.5426, 0.5160, 0.3961, 0.4696, 0.4296, 0.5738, 0.5888, 0.5481])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand All @@ -410,7 +406,7 @@ def test_stable_diffusion_pndm(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.3411, 0.5032, 0.4704, 0.3135, 0.4323, 0.4740, 0.5150, 0.3498, 0.4022])
expected_slice = np.array([0.1941, 0.4748, 0.4880, 0.2222, 0.4221, 0.4545, 0.5604, 0.3488, 0.3902])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand Down Expand Up @@ -450,7 +446,7 @@ def test_stable_diffusion_k_lms(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.3149, 0.5246, 0.4796, 0.3218, 0.4469, 0.4729, 0.5151, 0.3597, 0.3954])
expected_slice = np.array([0.2681, 0.4785, 0.4857, 0.2426, 0.4473, 0.4481, 0.5610, 0.3676, 0.3855])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand All @@ -469,7 +465,7 @@ def test_stable_diffusion_k_euler_ancestral(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.3151, 0.5243, 0.4794, 0.3217, 0.4468, 0.4728, 0.5152, 0.3598, 0.3954])
expected_slice = np.array([0.2682, 0.4782, 0.4855, 0.2424, 0.4472, 0.4479, 0.5612, 0.3676, 0.3854])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand All @@ -488,7 +484,7 @@ def test_stable_diffusion_k_euler(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.3149, 0.5246, 0.4796, 0.3218, 0.4469, 0.4729, 0.5151, 0.3597, 0.3954])
expected_slice = np.array([0.2681, 0.4785, 0.4857, 0.2426, 0.4473, 0.4481, 0.5610, 0.3676, 0.3855])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand Down Expand Up @@ -560,7 +556,7 @@ def test_stable_diffusion_negative_prompt(self):
image_slice = image[0, -3:, -3:, -1]

assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.3458, 0.5120, 0.4800, 0.3116, 0.4348, 0.4802, 0.5237, 0.3467, 0.3991])
expected_slice = np.array([0.1907, 0.4709, 0.4858, 0.2224, 0.4223, 0.4539, 0.5606, 0.3489, 0.3900])

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

Expand Down