diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index b9ad36f1a2bf..41f426532754 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -212,6 +212,12 @@ def __call__(
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
+        # preprocess mask
+        if not isinstance(mask_image, torch.FloatTensor):
+            mask_image = preprocess_mask(mask_image)
+        mask_image = mask_image.to(self.device)
+        mask = torch.cat([mask_image] * batch_size)
+
         # preprocess image
         if not isinstance(init_image, torch.FloatTensor):
             init_image = preprocess_image(init_image)
@@ -221,18 +227,24 @@ def __call__(
         init_latent_dist = self.vae.encode(init_image).latent_dist
         init_latents = init_latent_dist.sample(generator=generator)
 
+        # adding noise to the masked areas depending on strength; blend with the
+        # un-tiled `mask_image` (not the batch-tiled `mask`) so broadcasting does not
+        # grow the batch dimension before the single batch_size expansion below
+        rand_latents = torch.randn(
+            init_latents.shape,
+            generator=generator,
+            device=self.device,
+        )
+        init_latents_noised = init_latents * mask_image + rand_latents * (1 - mask_image)
+        init_latents = init_latents * (1 - strength) + init_latents_noised * strength
+
+        # multiply by scale_factor
         init_latents = 0.18215 * init_latents
 
         # Expand init_latents for batch_size
         init_latents = torch.cat([init_latents] * batch_size)
         init_latents_orig = init_latents
 
-        # preprocess mask
-        if not isinstance(mask_image, torch.FloatTensor):
-            mask_image = preprocess_mask(mask_image)
-        mask_image = mask_image.to(self.device)
-        mask = torch.cat([mask_image] * batch_size)
-
         # check sizes
         if not mask.shape == init_latents.shape:
             raise ValueError("The mask and init_image should be the same size!")
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 102a55a93e4b..389083d33808 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -699,7 +699,9 @@ def test_stable_diffusion_inpaint(self):
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.4731, 0.5346, 0.4531, 0.6251, 0.5446, 0.4057, 0.5527, 0.5896, 0.5153])
+        expected_slice = np.array(
+            [0.4893303, 0.5381786, 0.46649122, 0.62859786, 0.53987336, 0.39735478, 0.5483682, 0.59601367, 0.5178648]
+        )
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 