From 76fc47c17e7f15ec4658cf493e7d13f69253a298 Mon Sep 17 00:00:00 2001 From: fboulnois Date: Mon, 21 Nov 2022 12:29:50 -0500 Subject: [PATCH 1/6] feat: switch core pipelines to use image arg --- .../pipeline_alt_diffusion_img2img.py | 30 +++++++++---------- ...peline_latent_diffusion_superresolution.py | 22 +++++++------- .../pipeline_cycle_diffusion.py | 30 +++++++++---------- .../pipeline_onnx_stable_diffusion_img2img.py | 26 ++++++++-------- ...ne_onnx_stable_diffusion_inpaint_legacy.py | 22 +++++++------- .../pipeline_stable_diffusion_img2img.py | 30 +++++++++---------- ...ipeline_stable_diffusion_inpaint_legacy.py | 20 ++++++------- 7 files changed, 90 insertions(+), 90 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 45df93fab02e..9635e3187460 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -435,9 +435,9 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps - def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): - init_image = init_image.to(device=device, dtype=dtype) - init_latent_dist = self.vae.encode(init_image).latent_dist + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + image = image.to(device=device, dtype=dtype) + init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = 0.18215 * init_latents @@ -445,16 +445,16 @@ def prepare_latents(self, init_image, timestep, batch_size, num_images_per_promp # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many init images as text prompts to suppress this warning." + " your script to pass as many initial images as text prompts to suppress this warning." ) - deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) additional_image_per_prompt = batch_size // init_latents.shape[0] init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0) elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( - f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) else: init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0) @@ -472,7 +472,7 @@ def prepare_latents(self, init_image, timestep, batch_size, num_images_per_promp def __call__( self, prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -491,15 +491,15 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. This parameter will be modulated by `strength`. @@ -557,8 +557,8 @@ def __call__( ) # 4. Preprocess image - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess(image) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -567,7 +567,7 @@ def __call__( # 6. Prepare latent variables latents = self.prepare_latents( - init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator + image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index b296a4953f97..55333d4239a0 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -66,7 +66,7 @@ def __init__( @torch.no_grad() def __call__( self, - init_image: Union[torch.Tensor, PIL.Image.Image], + image: Union[torch.Tensor, PIL.Image.Image], batch_size: Optional[int] = 1, num_inference_steps: Optional[int] = 100, eta: Optional[float] = 0.0, @@ -77,7 +77,7 @@ def __call__( ) -> Union[Tuple, ImagePipelineOutput]: r""" Args: - init_image (`torch.Tensor` or `PIL.Image.Image`): + image (`torch.Tensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. batch_size (`int`, *optional*, defaults to 1): @@ -103,19 +103,19 @@ def __call__( generated images. """ - if isinstance(init_image, PIL.Image.Image): + if isinstance(image, PIL.Image.Image): batch_size = 1 - elif isinstance(init_image, torch.Tensor): - batch_size = init_image.shape[0] + elif isinstance(image, torch.Tensor): + batch_size = image.shape[0] else: raise ValueError( - f"`init_image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(init_image)}" + f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}" ) - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess(image) - height, width = init_image.shape[-2:] + height, width = image.shape[-2:] # in_channels should be 6: 3 for latents, 3 for low resolution image latents_shape = (batch_size, self.unet.in_channels // 2, height, width) @@ -128,7 +128,7 @@ def __call__( else: latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype) - init_image = init_image.to(device=self.device, dtype=latents_dtype) + image = image.to(device=self.device, dtype=latents_dtype) # set timesteps and move to the correct device self.scheduler.set_timesteps(num_inference_steps, device=self.device) @@ -148,7 +148,7 @@ def __call__( for t in self.progress_bar(timesteps_tensor): # concat latents and low resolution image in the channel dimension. - latents_input = torch.cat([latents, init_image], dim=1) + latents_input = torch.cat([latents, image], dim=1) latents_input = self.scheduler.scale_model_input(latents_input, t) # predict the noise residual noise_pred = self.unet(latents_input, t).sample diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 424f53d3f8ef..50599951eebe 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -477,9 +477,9 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps - def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): - init_image = init_image.to(device=device, dtype=dtype) - init_latent_dist = self.vae.encode(init_image).latent_dist + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + image = image.to(device=device, dtype=dtype) + init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = 0.18215 * init_latents @@ -487,16 +487,16 @@ def prepare_latents(self, init_image, timestep, batch_size, num_images_per_promp # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many init images as text prompts to suppress this warning." + " your script to pass as many initial images as text prompts to suppress this warning." ) - deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) additional_image_per_prompt = batch_size // init_latents.shape[0] init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0) elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( - f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) else: init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0) @@ -516,7 +516,7 @@ def __call__( self, prompt: Union[str, List[str]], source_prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -535,15 +535,15 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. This parameter will be modulated by `strength`. @@ -602,8 +602,8 @@ def __call__( ) # 4. Preprocess image - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess(image) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -612,7 +612,7 @@ def __call__( # 6. Prepare latent variables latents, clean_latents = self.prepare_latents( - init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator + image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator ) source_latents = latents diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 1a878535c11d..1ad90f8a2d1d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -229,7 +229,7 @@ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guida def __call__( self, prompt: Union[str, List[str]], - init_image: Union[np.ndarray, PIL.Image.Image], + image: Union[np.ndarray, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -248,15 +248,15 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - init_image (`np.ndarray` or `PIL.Image.Image`): + image (`np.ndarray` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. This parameter will be modulated by `strength`. @@ -320,8 +320,8 @@ def __call__( # set timesteps self.scheduler.set_timesteps(num_inference_steps) - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess(image) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -333,9 +333,9 @@ def __call__( ) latents_dtype = text_embeddings.dtype - init_image = init_image.astype(latents_dtype) + image = image.astype(latents_dtype) # encode the init image into latents and scale the latents - init_latents = self.vae_encoder(sample=init_image)[0] + init_latents = self.vae_encoder(sample=image)[0] init_latents = 0.18215 * init_latents if isinstance(prompt, str): @@ -344,16 +344,16 @@ def __call__( # expand init_latents for batch_size deprecation_message = ( f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many init images as text prompts to suppress this warning." + " your script to pass as many initial images as text prompts to suppress this warning." ) - deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) additional_image_per_prompt = len(prompt) // init_latents.shape[0] init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0: raise ValueError( - f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." ) else: init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py index 2f990651a43a..d50b4197fcb4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py @@ -228,7 +228,7 @@ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guida def __call__( self, prompt: Union[str, List[str]], - init_image: Union[np.ndarray, PIL.Image.Image], + image: Union[np.ndarray, PIL.Image.Image], mask_image: Union[np.ndarray, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, @@ -248,20 +248,20 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - init_image (`nd.ndarray` or `PIL.Image.Image`): + image (`nd.ndarray` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. mask_image (`nd.ndarray` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.uu strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. This parameter will be modulated by `strength`. @@ -325,8 +325,8 @@ def __call__( # set timesteps self.scheduler.set_timesteps(num_inference_steps) - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess(image) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -338,10 +338,10 @@ def __call__( ) latents_dtype = text_embeddings.dtype - init_image = init_image.astype(latents_dtype) + image = image.astype(latents_dtype) # encode the init image into latents and scale the latents - init_latents = self.vae_encoder(sample=init_image)[0] + init_latents = self.vae_encoder(sample=image)[0] init_latents = 0.18215 * init_latents # Expand init_latents for batch_size and num_images_per_prompt @@ -356,7 +356,7 @@ def __call__( # check sizes if not mask.shape == init_latents.shape: - raise ValueError("The mask and init_image should be the same size!") + raise ValueError("The mask and image should be the same size!") # get the original timestep using init_timestep offset = self.scheduler.config.get("steps_offset", 0) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 4d645cc1f34c..8372b373cdae 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -444,9 +444,9 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps - def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): - init_image = init_image.to(device=device, dtype=dtype) - init_latent_dist = self.vae.encode(init_image).latent_dist + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + image = image.to(device=device, dtype=dtype) + init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = 0.18215 * init_latents @@ -454,16 +454,16 @@ def prepare_latents(self, init_image, timestep, batch_size, num_images_per_promp # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many init images as text prompts to suppress this warning." + " your script to pass as many initial images as text prompts to suppress this warning." ) - deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) additional_image_per_prompt = batch_size // init_latents.shape[0] init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0) elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( - f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) else: init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0) @@ -481,7 +481,7 @@ def prepare_latents(self, init_image, timestep, batch_size, num_images_per_promp def __call__( self, prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -500,15 +500,15 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. This parameter will be modulated by `strength`. @@ -566,8 +566,8 @@ def __call__( ) # 4. Preprocess image - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess(image) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -576,7 +576,7 @@ def __call__( # 6. Prepare latent variables latents = self.prepare_latents( - init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator + image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index b7356dc6db0c..80d0f4437dd4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -459,9 +459,9 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps - def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator): - init_image = init_image.to(device=self.device, dtype=dtype) - init_latent_dist = self.vae.encode(init_image).latent_dist + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator): + image = image.to(device=self.device, dtype=dtype) + init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = 0.18215 * init_latents @@ -479,7 +479,7 @@ def prepare_latents(self, init_image, timestep, batch_size, num_images_per_promp def __call__( self, prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], mask_image: Union[torch.FloatTensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, @@ -499,18 +499,18 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. mask_image (`torch.FloatTensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more + in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. num_inference_steps (`int`, *optional*, defaults to 50): The reference number of denoising steps. More denoising steps usually lead to a higher quality image at @@ -569,8 +569,8 @@ def __call__( ) # 4. Preprocess image and mask - if not isinstance(init_image, torch.FloatTensor): - init_image = preprocess_image(init_image) + if not isinstance(image, torch.FloatTensor): + image = preprocess_image(image) if not isinstance(mask_image, torch.FloatTensor): mask_image = preprocess_mask(mask_image, self.vae_scale_factor) @@ -583,7 +583,7 @@ def __call__( # 6. Prepare latent variables # encode the init image into latents and scale the latents latents, init_latents_orig, noise = self.prepare_latents( - init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator + image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator ) # 7. Prepare mask latent From d1bd3422e21143d85a0896d3327542d0ccff0b53 Mon Sep 17 00:00:00 2001 From: fboulnois Date: Mon, 21 Nov 2022 12:48:24 -0500 Subject: [PATCH 2/6] test: update tests for core pipelines --- .../test_alt_diffusion_img2img.py | 8 ++--- .../test_latent_diffusion_superresolution.py | 4 +-- .../stable_diffusion/test_cycle_diffusion.py | 8 ++--- .../test_onnx_stable_diffusion_img2img.py | 4 +-- ...st_onnx_stable_diffusion_inpaint_legacy.py | 2 +- .../test_stable_diffusion_img2img.py | 32 +++++++++---------- .../test_stable_diffusion_inpaint_legacy.py | 20 ++++++------ tests/test_pipelines.py | 4 +-- 8 files changed, 41 insertions(+), 41 deletions(-) diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index 0dab14b31716..434e55f946b6 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -141,7 +141,7 @@ def test_stable_diffusion_img2img_default_case(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ) image = output.images @@ -153,7 +153,7 @@ def test_stable_diffusion_img2img_default_case(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, return_dict=False, )[0] @@ -204,7 +204,7 @@ def test_stable_diffusion_img2img_fp16(self): generator=generator, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ).images assert image.shape == (1, 32, 32, 3) @@ -243,7 +243,7 @@ def test_stable_diffusion_img2img_pipeline_default(self): generator = torch.Generator(device=torch_device).manual_seed(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, strength=0.75, guidance_scale=7.5, generator=generator, diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index 6f1f51c7ba75..d7992c2a43ab 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -79,7 +79,7 @@ def test_inference_superresolution(self): init_image = self.dummy_image.to(device) generator = torch.Generator(device=device).manual_seed(0) - image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images + image = ldm(image=init_image, generator=generator, num_inference_steps=2, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] @@ -124,7 +124,7 @@ def test_inference_superresolution(self): ldm.set_progress_bar_config(disable=None) generator = torch.Generator(device=torch_device).manual_seed(0) - image = ldm(init_image, generator=generator, num_inference_steps=20, output_type="numpy").images + image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py index 7a32b74096c4..33157ed9ad30 100644 --- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py @@ -186,7 +186,7 @@ def test_stable_diffusion_cycle(self): source_prompt=source_prompt, generator=generator, num_inference_steps=2, - init_image=init_image, + image=init_image, eta=0.1, strength=0.8, guidance_scale=3, @@ -244,7 +244,7 @@ def test_stable_diffusion_cycle_fp16(self): source_prompt=source_prompt, generator=generator, num_inference_steps=2, - init_image=init_image, + image=init_image, eta=0.1, strength=0.8, guidance_scale=3, @@ -297,7 +297,7 @@ def test_cycle_diffusion_pipeline_fp16(self): output = pipe( prompt=prompt, source_prompt=source_prompt, - init_image=init_image, + image=init_image, num_inference_steps=100, eta=0.1, strength=0.85, @@ -336,7 +336,7 @@ def test_cycle_diffusion_pipeline(self): output = pipe( prompt=prompt, source_prompt=source_prompt, - init_image=init_image, + image=init_image, num_inference_steps=100, eta=0.1, strength=0.85, diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py index 91e4412425b4..c03959fb0c60 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py @@ -72,7 +72,7 @@ def test_inference_default_pndm(self): generator = np.random.RandomState(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, strength=0.75, guidance_scale=7.5, num_inference_steps=10, @@ -110,7 +110,7 @@ def test_inference_k_lms(self): generator = np.random.RandomState(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, strength=0.75, guidance_scale=7.5, num_inference_steps=10, diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py index 577023f7055c..d9b9443d86a2 100644 --- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py +++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py @@ -80,7 +80,7 @@ def test_inference(self): generator = np.random.RandomState(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, mask_image=mask_image, strength=0.75, guidance_scale=7.5, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index d86b259eae9e..bab44100fb80 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -188,7 +188,7 @@ def test_stable_diffusion_img2img_default_case(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ) image = output.images @@ -200,7 +200,7 @@ def test_stable_diffusion_img2img_default_case(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, return_dict=False, )[0] @@ -245,7 +245,7 @@ def test_stable_diffusion_img2img_negative_prompt(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ) image = output.images image_slice = image[0, -3:, -3:, -1] @@ -285,7 +285,7 @@ def test_stable_diffusion_img2img_multiple_init_images(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ) image = output.images @@ -328,7 +328,7 @@ def test_stable_diffusion_img2img_k_lms(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ) image = output.images @@ -339,7 +339,7 @@ def test_stable_diffusion_img2img_k_lms(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, return_dict=False, ) image_from_tuple = output[0] @@ -382,7 +382,7 @@ def test_stable_diffusion_img2img_num_images_per_prompt(self): prompt, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ).images assert images.shape == (1, 32, 32, 3) @@ -393,7 +393,7 @@ def test_stable_diffusion_img2img_num_images_per_prompt(self): [prompt] * batch_size, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ).images assert images.shape == (batch_size, 32, 32, 3) @@ -404,7 +404,7 @@ def test_stable_diffusion_img2img_num_images_per_prompt(self): prompt, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, num_images_per_prompt=num_images_per_prompt, ).images @@ -416,7 +416,7 @@ def test_stable_diffusion_img2img_num_images_per_prompt(self): [prompt] * batch_size, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, num_images_per_prompt=num_images_per_prompt, ).images @@ -458,7 +458,7 @@ def test_stable_diffusion_img2img_fp16(self): generator=generator, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ).images assert image.shape == (1, 32, 32, 3) @@ -497,7 +497,7 @@ def test_stable_diffusion_img2img_pipeline_default(self): generator = torch.Generator(device=torch_device).manual_seed(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, strength=0.75, guidance_scale=7.5, generator=generator, @@ -535,7 +535,7 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self): generator = torch.Generator(device=torch_device).manual_seed(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, strength=0.75, guidance_scale=7.5, generator=generator, @@ -572,7 +572,7 @@ def test_stable_diffusion_img2img_pipeline_ddim(self): generator = torch.Generator(device=torch_device).manual_seed(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, strength=0.75, guidance_scale=7.5, generator=generator, @@ -626,7 +626,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No with torch.autocast(torch_device): pipe( prompt=prompt, - init_image=init_image, + image=init_image, strength=0.75, num_inference_steps=50, guidance_scale=7.5, @@ -663,7 +663,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): generator = torch.Generator(device=torch_device).manual_seed(0) _ = pipe( prompt=prompt, - init_image=init_image, + image=init_image, strength=0.75, guidance_scale=7.5, generator=generator, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py index 4b972c7b7d2a..0ae503e9be3b 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py @@ -191,7 +191,7 @@ def test_stable_diffusion_inpaint_legacy(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, mask_image=mask_image, ) @@ -204,7 +204,7 @@ def test_stable_diffusion_inpaint_legacy(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, mask_image=mask_image, return_dict=False, )[0] @@ -252,7 +252,7 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, mask_image=mask_image, ) @@ -295,7 +295,7 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): prompt, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, mask_image=mask_image, ).images @@ -307,7 +307,7 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): [prompt] * batch_size, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, mask_image=mask_image, ).images @@ -319,7 +319,7 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): prompt, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, mask_image=mask_image, num_images_per_prompt=num_images_per_prompt, ).images @@ -332,7 +332,7 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): [prompt] * batch_size, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, mask_image=mask_image, num_images_per_prompt=num_images_per_prompt, ).images @@ -374,7 +374,7 @@ def test_stable_diffusion_inpaint_legacy_pipeline(self): generator = torch.Generator(device=torch_device).manual_seed(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, mask_image=mask_image, strength=0.75, guidance_scale=7.5, @@ -416,7 +416,7 @@ def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self): generator = torch.Generator(device=torch_device).manual_seed(0) output = pipe( prompt=prompt, - init_image=init_image, + image=init_image, mask_image=mask_image, strength=0.75, guidance_scale=7.5, @@ -474,7 +474,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No with torch.autocast(torch_device): pipe( prompt=prompt, - init_image=init_image, + image=init_image, mask_image=mask_image, strength=0.75, num_inference_steps=50, diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 033f363ff41f..72f2f5b8d247 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -411,7 +411,7 @@ def test_stable_diffusion_components(self): generator=generator, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, mask_image=mask_image, ).images image_img2img = img2img( @@ -419,7 +419,7 @@ def test_stable_diffusion_components(self): generator=generator, num_inference_steps=2, output_type="np", - init_image=init_image, + image=init_image, ).images image_text2img = text2img( [prompt], From 91bd55194dc1c5fca1bc8bf443598da5e6e3fede Mon Sep 17 00:00:00 2001 From: fboulnois Date: Mon, 21 Nov 2022 12:54:37 -0500 Subject: [PATCH 3/6] feat: switch examples to use image arg --- examples/community/imagic_stable_diffusion.py | 26 +++++------ examples/community/lpw_stable_diffusion.py | 46 +++++++++---------- .../community/lpw_stable_diffusion_onnx.py | 46 +++++++++---------- examples/community/stable_diffusion_mega.py | 8 ++-- 4 files changed, 63 insertions(+), 63 deletions(-) diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 65966b4830e8..7414ebe8722b 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -133,7 +133,7 @@ def disable_attention_slicing(self): def train( self, prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], height: Optional[int] = 512, width: Optional[int] = 512, generator: Optional[torch.Generator] = None, @@ -241,14 +241,14 @@ def train( lr=embedding_learning_rate, ) - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess(image) latents_dtype = text_embeddings.dtype - init_image = init_image.to(device=self.device, dtype=latents_dtype) - init_latent_image_dist = self.vae.encode(init_image).latent_dist - init_image_latents = init_latent_image_dist.sample(generator=generator) - init_image_latents = 0.18215 * init_image_latents + image = image.to(device=self.device, dtype=latents_dtype) + init_latent_image_dist = self.vae.encode(image).latent_dist + image_latents = init_latent_image_dist.sample(generator=generator) + image_latents = 0.18215 * image_latents progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process) progress_bar.set_description("Steps") @@ -259,12 +259,12 @@ def train( for _ in range(text_embedding_optimization_steps): with accelerator.accumulate(text_embeddings): # Sample noise that we'll add to the latents - noise = torch.randn(init_image_latents.shape).to(init_image_latents.device) - timesteps = torch.randint(1000, (1,), device=init_image_latents.device) + noise = torch.randn(image_latents.shape).to(image_latents.device) + timesteps = torch.randint(1000, (1,), device=image_latents.device) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) - noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps) + noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps) # Predict the noise residual noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample @@ -301,12 +301,12 @@ def train( for _ in range(model_fine_tuning_optimization_steps): with accelerator.accumulate(self.unet.parameters()): # Sample noise that we'll add to the latents - noise = torch.randn(init_image_latents.shape).to(init_image_latents.device) - timesteps = torch.randint(1000, (1,), device=init_image_latents.device) + noise = torch.randn(image_latents.shape).to(image_latents.device) + timesteps = torch.randint(1000, (1,), device=image_latents.device) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) - noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps) + noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps) # Predict the noise residual noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 0e7dc9e1ed11..8ec6e8563ede 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -555,7 +555,7 @@ def __call__( self, prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, - init_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[torch.FloatTensor, PIL.Image.Image] = None, mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, height: int = 512, width: int = 512, @@ -583,11 +583,11 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. mask_image (`torch.FloatTensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. @@ -605,11 +605,11 @@ def __call__( 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): @@ -714,7 +714,7 @@ def __call__( mask = None noise = None - if init_image is None: + if image is None: # get the initial random noise unless the user supplied it # Unlike in other pipelines, latents need to be generated in the target device @@ -753,11 +753,11 @@ def __call__( # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma else: - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess_image(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess_image(image) # encode the init image into latents and scale the latents - init_image = init_image.to(device=self.device, dtype=latents_dtype) - init_latent_dist = self.vae.encode(init_image).latent_dist + image = image.to(device=self.device, dtype=latents_dtype) + init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = 0.18215 * init_latents init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0) @@ -772,7 +772,7 @@ def __call__( # check sizes if not mask.shape == init_latents.shape: - raise ValueError("The mask and init_image should be the same size!") + raise ValueError("The mask and image should be the same size!") # get the original timestep using init_timestep offset = self.scheduler.config.get("steps_offset", 0) @@ -961,7 +961,7 @@ def text2img( def img2img( self, - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, strength: float = 0.8, @@ -980,7 +980,7 @@ def img2img( r""" Function for image-to-image generation. Args: - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. prompt (`str` or `List[str]`): @@ -989,11 +989,11 @@ def img2img( The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. This parameter will be modulated by `strength`. @@ -1035,7 +1035,7 @@ def img2img( return self.__call__( prompt=prompt, negative_prompt=negative_prompt, - init_image=init_image, + image=image, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, strength=strength, @@ -1052,7 +1052,7 @@ def img2img( def inpaint( self, - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], mask_image: Union[torch.FloatTensor, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, @@ -1072,11 +1072,11 @@ def inpaint( r""" Function for inpaint. Args: - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. mask_image (`torch.FloatTensor` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. @@ -1088,7 +1088,7 @@ def inpaint( strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more + in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. num_inference_steps (`int`, *optional*, defaults to 50): The reference number of denoising steps. More denoising steps usually lead to a higher quality image at @@ -1131,7 +1131,7 @@ def inpaint( return self.__call__( prompt=prompt, negative_prompt=negative_prompt, - init_image=init_image, + image=image, mask_image=mask_image, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py index 577772b9c36a..dfd8fd264f50 100644 --- a/examples/community/lpw_stable_diffusion_onnx.py +++ b/examples/community/lpw_stable_diffusion_onnx.py @@ -441,7 +441,7 @@ def __call__( self, prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, - init_image: Union[np.ndarray, PIL.Image.Image] = None, + image: Union[np.ndarray, PIL.Image.Image] = None, mask_image: Union[np.ndarray, PIL.Image.Image] = None, height: int = 512, width: int = 512, @@ -469,11 +469,11 @@ def __call__( negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). - init_image (`np.ndarray` or `PIL.Image.Image`): + image (`np.ndarray` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. mask_image (`np.ndarray` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. @@ -491,11 +491,11 @@ def __call__( 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): @@ -598,7 +598,7 @@ def __call__( mask = None noise = None - if init_image is None: + if image is None: latents_shape = ( batch_size * num_images_per_prompt, 4, @@ -616,11 +616,11 @@ def __call__( # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma else: - if isinstance(init_image, PIL.Image.Image): - init_image = preprocess_image(init_image) + if isinstance(image, PIL.Image.Image): + image = preprocess_image(image) # encode the init image into latents and scale the latents - init_image = init_image.astype(latents_dtype) - init_latents = self.vae_encoder(sample=init_image)[0] + image = image.astype(latents_dtype) + init_latents = self.vae_encoder(sample=image)[0] init_latents = 0.18215 * init_latents init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt) init_latents_orig = init_latents @@ -635,7 +635,7 @@ def __call__( # check sizes if not mask.shape == init_latents.shape: print(mask.shape, init_latents.shape) - raise ValueError("The mask and init_image should be the same size!") + raise ValueError("The mask and image should be the same size!") # get the original timestep using init_timestep offset = self.scheduler.config.get("steps_offset", 0) @@ -828,7 +828,7 @@ def text2img( def img2img( self, - init_image: Union[np.ndarray, PIL.Image.Image], + image: Union[np.ndarray, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, strength: float = 0.8, @@ -847,7 +847,7 @@ def img2img( r""" Function for image-to-image generation. Args: - init_image (`np.ndarray` or `PIL.Image.Image`): + image (`np.ndarray` or `PIL.Image.Image`): `Image`, or ndarray representing an image batch, that will be used as the starting point for the process. prompt (`str` or `List[str]`): @@ -856,11 +856,11 @@ def img2img( The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. - `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. + `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. This parameter will be modulated by `strength`. @@ -901,7 +901,7 @@ def img2img( return self.__call__( prompt=prompt, negative_prompt=negative_prompt, - init_image=init_image, + image=image, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, strength=strength, @@ -918,7 +918,7 @@ def img2img( def inpaint( self, - init_image: Union[np.ndarray, PIL.Image.Image], + image: Union[np.ndarray, PIL.Image.Image], mask_image: Union[np.ndarray, PIL.Image.Image], prompt: Union[str, List[str]], negative_prompt: Optional[Union[str, List[str]]] = None, @@ -938,11 +938,11 @@ def inpaint( r""" Function for inpaint. Args: - init_image (`np.ndarray` or `PIL.Image.Image`): + image (`np.ndarray` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. This is the image whose masked region will be inpainted. mask_image (`np.ndarray` or `PIL.Image.Image`): - `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. @@ -954,7 +954,7 @@ def inpaint( strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more + in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. num_inference_steps (`int`, *optional*, defaults to 50): The reference number of denoising steps. More denoising steps usually lead to a higher quality image at @@ -996,7 +996,7 @@ def inpaint( return self.__call__( prompt=prompt, negative_prompt=negative_prompt, - init_image=init_image, + image=image, mask_image=mask_image, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py index 67112b282b67..30699b6a1bf3 100644 --- a/examples/community/stable_diffusion_mega.py +++ b/examples/community/stable_diffusion_mega.py @@ -121,7 +121,7 @@ def disable_attention_slicing(self): def inpaint( self, prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], mask_image: Union[torch.FloatTensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, @@ -138,7 +138,7 @@ def inpaint( # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline return StableDiffusionInpaintPipelineLegacy(**self.components)( prompt=prompt, - init_image=init_image, + image=image, mask_image=mask_image, strength=strength, num_inference_steps=num_inference_steps, @@ -156,7 +156,7 @@ def inpaint( def img2img( self, prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -173,7 +173,7 @@ def img2img( # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline return StableDiffusionImg2ImgPipeline(**self.components)( prompt=prompt, - init_image=init_image, + image=image, strength=strength, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, From b065c766018e7bb3183c4216e3d38b8b9b3e43e4 Mon Sep 17 00:00:00 2001 From: fboulnois Date: Mon, 21 Nov 2022 13:01:21 -0500 Subject: [PATCH 4/6] docs: update docs to use image arg --- README.md | 2 +- docs/source/api/pipelines/cycle_diffusion.mdx | 4 ++-- docs/source/api/pipelines/overview.mdx | 2 +- docs/source/using-diffusers/custom_pipeline_examples.mdx | 4 ++-- docs/source/using-diffusers/img2img.mdx | 2 +- examples/community/README.md | 6 +++--- src/diffusers/pipelines/README.md | 2 +- src/diffusers/pipelines/stable_diffusion/README.md | 4 ++-- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index ff523d060c59..ecbf7b802103 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ init_image = init_image.resize((768, 512)) prompt = "A fantasy landscape, trending on artstation" -images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images +images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images images[0].save("fantasy_landscape.png") ``` diff --git a/docs/source/api/pipelines/cycle_diffusion.mdx b/docs/source/api/pipelines/cycle_diffusion.mdx index 8eecd3d62494..b5c078ad9466 100644 --- a/docs/source/api/pipelines/cycle_diffusion.mdx +++ b/docs/source/api/pipelines/cycle_diffusion.mdx @@ -57,7 +57,7 @@ prompt = "An astronaut riding an elephant" image = pipe( prompt=prompt, source_prompt=source_prompt, - init_image=init_image, + image=init_image, num_inference_steps=100, eta=0.1, strength=0.8, @@ -83,7 +83,7 @@ torch.manual_seed(0) image = pipe( prompt=prompt, source_prompt=source_prompt, - init_image=init_image, + image=init_image, num_inference_steps=100, eta=0.1, strength=0.85, diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx index eed8e0d0b020..ee0f27132a89 100644 --- a/docs/source/api/pipelines/overview.mdx +++ b/docs/source/api/pipelines/overview.mdx @@ -149,7 +149,7 @@ init_image = init_image.resize((768, 512)) prompt = "A fantasy landscape, trending on artstation" -images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images +images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images images[0].save("fantasy_landscape.png") ``` diff --git a/docs/source/using-diffusers/custom_pipeline_examples.mdx b/docs/source/using-diffusers/custom_pipeline_examples.mdx index b77e33be77d0..b51af5db91e8 100644 --- a/docs/source/using-diffusers/custom_pipeline_examples.mdx +++ b/docs/source/using-diffusers/custom_pipeline_examples.mdx @@ -177,7 +177,7 @@ init_image = download_image( prompt = "A fantasy landscape, trending on artstation" -images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images +images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images ### Inpainting @@ -187,7 +187,7 @@ init_image = download_image(img_url).resize((512, 512)) mask_image = download_image(mask_url).resize((512, 512)) prompt = "a cat sitting on a bench" -images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images +images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images ``` As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline. diff --git a/docs/source/using-diffusers/img2img.mdx b/docs/source/using-diffusers/img2img.mdx index 911d7bd76ad0..ecd9d73da636 100644 --- a/docs/source/using-diffusers/img2img.mdx +++ b/docs/source/using-diffusers/img2img.mdx @@ -37,7 +37,7 @@ init_image.thumbnail((768, 768)) prompt = "A fantasy landscape, trending on artstation" -images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images +images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images images[0].save("fantasy_landscape.png") ``` diff --git a/examples/community/README.md b/examples/community/README.md index 108f6f95f1b4..d430ac192e5a 100644 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -166,7 +166,7 @@ init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-di prompt = "A fantasy landscape, trending on artstation" -images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images +images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images ### Inpainting @@ -176,7 +176,7 @@ init_image = download_image(img_url).resize((512, 512)) mask_image = download_image(mask_url).resize((512, 512)) prompt = "a cat sitting on a bench" -images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images +images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images ``` As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline. @@ -420,7 +420,7 @@ init_image = Image.open(BytesIO(response.content)).convert("RGB") init_image = init_image.resize((512, 512)) res = pipe.train( prompt, - init_image, + image=init_image, guidance_scale=7.5, num_inference_steps=50, generator=generator) diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md index 6ff40d3549b4..c3202db0270c 100644 --- a/src/diffusers/pipelines/README.md +++ b/src/diffusers/pipelines/README.md @@ -126,7 +126,7 @@ init_image = init_image.resize((768, 512)) prompt = "A fantasy landscape, trending on artstation" -images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images +images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images images[0].save("fantasy_landscape.png") ``` diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md index bc30be4a7b9d..be4c5d942b2e 100644 --- a/src/diffusers/pipelines/stable_diffusion/README.md +++ b/src/diffusers/pipelines/stable_diffusion/README.md @@ -138,7 +138,7 @@ prompt = "An astronaut riding an elephant" image = pipe( prompt=prompt, source_prompt=source_prompt, - init_image=init_image, + image=init_image, num_inference_steps=100, eta=0.1, strength=0.8, @@ -164,7 +164,7 @@ torch.manual_seed(0) image = pipe( prompt=prompt, source_prompt=source_prompt, - init_image=init_image, + image=init_image, num_inference_steps=100, eta=0.1, strength=0.85, From c9ebcbe188d8808fbc56dd338af0a87826c48a5b Mon Sep 17 00:00:00 2001 From: fboulnois Date: Mon, 21 Nov 2022 14:06:42 -0500 Subject: [PATCH 5/6] style: format code using black and doc-builder --- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 8 ++++---- .../pipeline_latent_diffusion_superresolution.py | 4 +--- .../stable_diffusion/pipeline_cycle_diffusion.py | 8 ++++---- .../pipeline_onnx_stable_diffusion_img2img.py | 8 ++++---- .../pipeline_onnx_stable_diffusion_inpaint_legacy.py | 8 ++++---- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 8 ++++---- .../pipeline_stable_diffusion_inpaint_legacy.py | 4 ++-- 7 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 9635e3187460..c31e0c478f04 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -495,10 +495,10 @@ def __call__( `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 55333d4239a0..64a47f858466 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -108,9 +108,7 @@ def __call__( elif isinstance(image, torch.Tensor): batch_size = image.shape[0] else: - raise ValueError( - f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}" - ) + raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}") if isinstance(image, PIL.Image.Image): image = preprocess(image) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 50599951eebe..9243516f8aaf 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -539,10 +539,10 @@ def __call__( `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 1ad90f8a2d1d..60f45d4eeeab 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -252,10 +252,10 @@ def __call__( `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py index d50b4197fcb4..210f1c590b88 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py @@ -257,10 +257,10 @@ def __call__( PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.uu strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 8372b373cdae..8373527cbd4c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -504,10 +504,10 @@ def __call__( `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. - `image` will be used as a starting point, adding more noise to it the larger the `strength`. The - number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added - noise will be maximum and the denoising process will run for the full number of iterations specified in + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 80d0f4437dd4..9ba1eb46df58 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -510,8 +510,8 @@ def __call__( strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` is 1, the denoising process will be run on the masked area for the full number of iterations specified - in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more - noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. + in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to + that region the larger the `strength`. If `strength` is 0, no inpainting will occur. num_inference_steps (`int`, *optional*, defaults to 50): The reference number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. This parameter will be modulated by `strength`, as explained above. From 2656a78b1ec4ff00949dfa76fc6bc50d14d721dd Mon Sep 17 00:00:00 2001 From: fboulnois Date: Mon, 28 Nov 2022 12:21:48 -0500 Subject: [PATCH 6/6] fix: deprecate use of init_image in all pipelines --- examples/community/imagic_stable_diffusion.py | 6 +++++- examples/community/lpw_stable_diffusion.py | 3 +++ examples/community/lpw_stable_diffusion_onnx.py | 5 ++++- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 5 +++++ .../pipeline_latent_diffusion_superresolution.py | 5 ++++- .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py | 5 +++++ .../pipeline_onnx_stable_diffusion_img2img.py | 5 +++++ .../pipeline_onnx_stable_diffusion_inpaint_legacy.py | 5 +++++ .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 5 +++++ .../pipeline_stable_diffusion_inpaint_legacy.py | 5 +++++ 10 files changed, 46 insertions(+), 3 deletions(-) diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 7414ebe8722b..f044a1f568cc 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -17,7 +17,7 @@ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from diffusers.utils import logging +from diffusers.utils import deprecate, logging # TODO: remove and import from diffusers.utils when the new version of diffusers is released from packaging import version @@ -184,6 +184,10 @@ def train( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image + accelerator = Accelerator( gradient_accumulation_steps=1, mixed_precision="fp16", diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 8ec6e8563ede..1ed919d7ba58 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -648,6 +648,9 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image if isinstance(prompt, str): batch_size = 1 diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py index dfd8fd264f50..a6e765abe66b 100644 --- a/examples/community/lpw_stable_diffusion_onnx.py +++ b/examples/community/lpw_stable_diffusion_onnx.py @@ -10,7 +10,7 @@ from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from diffusers.utils import logging +from diffusers.utils import deprecate, logging # TODO: remove and import from diffusers.utils when the new version of diffusers is released from packaging import version @@ -533,6 +533,9 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image if isinstance(prompt, str): batch_size = 1 diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index c31e0c478f04..3970b7265e61 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -484,6 +484,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, + **kwargs, ): r""" Function invoked when calling the pipeline for generation. @@ -540,6 +541,10 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image + # 1. Check inputs self.check_inputs(prompt, strength, callback_steps) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 64a47f858466..09bdca54accf 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -17,7 +17,7 @@ LMSDiscreteScheduler, PNDMScheduler, ) -from ...utils import PIL_INTERPOLATION +from ...utils import PIL_INTERPOLATION, deprecate def preprocess(image): @@ -102,6 +102,9 @@ def __call__( `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image if isinstance(image, PIL.Image.Image): batch_size = 1 diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 9243516f8aaf..ef28e42f4f16 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -528,6 +528,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, + **kwargs, ): r""" Function invoked when calling the pipeline for generation. @@ -584,6 +585,10 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image + # 1. Check inputs self.check_inputs(prompt, strength, callback_steps) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 60f45d4eeeab..2242d21b1d91 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -241,6 +241,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, np.ndarray], None]] = None, callback_steps: Optional[int] = 1, + **kwargs, ): r""" Function invoked when calling the pipeline for generation. @@ -296,6 +297,10 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image + if isinstance(prompt, str): batch_size = 1 elif isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py index 210f1c590b88..84e85e51cca2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py @@ -241,6 +241,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, np.ndarray], None]] = None, callback_steps: Optional[int] = 1, + **kwargs, ): r""" Function invoked when calling the pipeline for generation. @@ -301,6 +302,10 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image + if isinstance(prompt, str): batch_size = 1 elif isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 8373527cbd4c..dccc028372ef 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -493,6 +493,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, + **kwargs, ): r""" Function invoked when calling the pipeline for generation. @@ -549,6 +550,10 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image + # 1. Check inputs self.check_inputs(prompt, strength, callback_steps) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 9ba1eb46df58..01f0a9e35c7e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -492,6 +492,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, + **kwargs, ): r""" Function invoked when calling the pipeline for generation. @@ -552,6 +553,10 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + message = "Please use `image` instead of `init_image`." + init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs) + image = init_image or image + # 1. Check inputs self.check_inputs(prompt, strength, callback_steps)