From 6a3d3be1f69bdf2240869ccd49c14d5ae9e6657d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDilvinas=20Ledas?=
Date: Sun, 18 Sep 2022 15:43:04 +0300
Subject: [PATCH 1/5] * Stable Diffusion img2img using ONNX.

---
 src/diffusers/__init__.py                     |   1 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/stable_diffusion/__init__.py    |   1 +
 .../pipeline_stable_diffusion_img2img_onnx.py | 305 ++++++++++++++++++
 4 files changed, 308 insertions(+)
 create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img_onnx.py

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index ecf4fe5fef95..b447e848097f 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -59,6 +59,7 @@
 if is_torch_available() and is_transformers_available() and is_onnx_available():
     from .pipelines import StableDiffusionOnnxPipeline
+    from .pipelines import StableDiffusionImg2ImgOnnxPipeline
 else:
     from .utils.dummy_torch_and_transformers_and_onnx_objects import *  # noqa F403

diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 3e2aeb4fb2b7..0e5778fc6a0c 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -17,3 +17,4 @@
 if is_transformers_available() and is_onnx_available():
     from .stable_diffusion import StableDiffusionOnnxPipeline
+    from .stable_diffusion import StableDiffusionImg2ImgOnnxPipeline

diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 5ffda93f1721..76d9529fab5b 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -35,3 +35,4 @@ class StableDiffusionPipelineOutput(BaseOutput):
 if is_transformers_available() and is_onnx_available():
     from .pipeline_stable_diffusion_onnx import StableDiffusionOnnxPipeline
+    from .pipeline_stable_diffusion_img2img_onnx import StableDiffusionImg2ImgOnnxPipeline

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img_onnx.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img_onnx.py
new file mode 100644
index 000000000000..39649d0f7283
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img_onnx.py
@@ -0,0 +1,305 @@
+import inspect
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+from ...onnx_utils import OnnxRuntimeModel
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+class StableDiffusionImg2ImgOnnxPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image to image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`].
Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offsensive or harmful. + Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + + def __init__( + self, + vae_decoder: OnnxRuntimeModel, + vae: AutoencoderKL, + text_encoder: OnnxRuntimeModel, + tokenizer: CLIPTokenizer, + unet: OnnxRuntimeModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: OnnxRuntimeModel, + feature_extractor: CLIPFeatureExtractor, + ): + super().__init__() + scheduler = scheduler.set_format("np") + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + warnings.warn( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 istead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file", + DeprecationWarning, + ) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae_decoder=vae_decoder, + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, + `attention_head_dim` must be a multiple of `slice_size`. 
+ """ + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = self.unet.config.attention_head_dim // 2 + self.unet.set_attention_slice(slice_size) + + def disable_attention_slicing(self): + r""" + Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go + back to computing attention in one step. + """ + # set slice_size = `None` to disable `set_attention_slice` + self.enable_attention_slicing(None) + + def __call__( + self, + prompt: Union[str, List[str]], + init_image: Union[torch.FloatTensor, PIL.Image.Image], + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + init_image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1. + `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The + number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added + noise will be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter will be modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. 
+ """ + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + if isinstance(init_image, PIL.Image.Image): + init_image = preprocess(init_image) + + # encode the init image into latents and scale the latents + init_latent_dist = self.vae.encode(init_image.to(self.device)).latent_dist + init_latents = init_latent_dist.sample(generator=None) + init_latents = 0.18215 * init_latents + + # expand init_latents for batch_size + init_latents = np.concatenate([init_latents.detach().numpy()] * batch_size) + + # get the original timestep using init_timestep + offset = self.scheduler.config.get("steps_offset", 0) + init_timestep = int(num_inference_steps * strength) + offset + init_timestep = min(init_timestep, num_inference_steps) + if isinstance(self.scheduler, LMSDiscreteScheduler): + timesteps = num_inference_steps - init_timestep + else: + timesteps = self.scheduler.timesteps[-init_timestep] + + # add noise to latents using the timesteps + noise = np.random.randn(*init_latents.shape).astype(np.float32) + + init_latents = self.scheduler.add_noise(init_latents, noise, timesteps).to(self.device) + + # get prompt text embeddings + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_embeddings = self.text_encoder(input_ids=text_input.input_ids.astype(np.int32))[0] + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + max_length = text_input.input_ids.shape[-1] + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np" + ) + uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps[t_start:])):
+            t_index = t_start + i
+
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+
+            # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                sigma = self.scheduler.sigmas[t_index]
+                # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
+                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+                latent_model_input = latent_model_input.to(self.unet.dtype)
+                t = t.to(self.unet.dtype)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
+            )
+            noise_pred = noise_pred[0]
+
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                latents = self.scheduler.step(noise_pred, t_index, latents, **extra_step_kwargs).prev_sample
+            else:
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae_decoder(latent_sample=latents)[0]
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        # run safety checker
+        safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
+        image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
+
+
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From 0cccbc2a1336ebfd11e95a6541c7afd6c9e43ba5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDilvinas=20Ledas?=
Date: Sun, 18 Sep 2022 16:33:37 +0300
Subject: [PATCH 2/5] * Stable Diffusion inpaint using ONNX.
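
The inpainting mask is prepared in latent space: it is converted to luminance,
downscaled by the VAE factor of 8, tiled across the 4 latent channels, and
inverted so that white regions are repainted while black regions are kept. A
NumPy sketch of the equivalent steps (illustrative only, mirroring
preprocess_mask in this patch; the file name and 512x512 size are placeholders):

    import numpy as np
    import PIL.Image

    mask = PIL.Image.open("mask.png").convert("L").resize((512 // 8, 512 // 8))
    mask = np.array(mask).astype(np.float32) / 255.0  # (64, 64), values in [0, 1]
    mask = np.tile(mask, (4, 1, 1))[None]             # (1, 4, 64, 64) to match the latents
    mask = 1.0 - mask                                 # repaint white, keep black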
--- src/diffusers/__init__.py | 2 +- src/diffusers/pipelines/__init__.py | 1 + .../pipelines/stable_diffusion/__init__.py | 1 + .../pipeline_stable_diffusion_inpaint_onnx.py | 327 ++++++++++++++++++ 4 files changed, 330 insertions(+), 1 deletion(-) create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index b447e848097f..363df8d6bb12 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -59,7 +59,7 @@ if is_torch_available() and is_transformers_available() and is_onnx_available(): from .pipelines import StableDiffusionOnnxPipeline - from .pipelines import StableDiffusionImg2ImgOnnxPipeline + from .pipelines import StableDiffusionInpaintOnnxPipeline else: from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 0e5778fc6a0c..887929cc5ea5 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -18,3 +18,4 @@ if is_transformers_available() and is_onnx_available(): from .stable_diffusion import StableDiffusionOnnxPipeline from .stable_diffusion import StableDiffusionImg2ImgOnnxPipeline + from .stable_diffusion import StableDiffusionInpaintOnnxPipeline diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index 76d9529fab5b..ab7ef48e31ea 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -36,3 +36,4 @@ class StableDiffusionPipelineOutput(BaseOutput): if is_transformers_available() and is_onnx_available(): from .pipeline_stable_diffusion_onnx import StableDiffusionOnnxPipeline from .pipeline_stable_diffusion_img2img_onnx import StableDiffusionImg2ImgOnnxPipeline + from .pipeline_stable_diffusion_inpaint_onnx import StableDiffusionInpaintOnnxPipeline diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py new file mode 100644 index 000000000000..86085a9d6212 --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py @@ -0,0 +1,327 @@ +import inspect +import warnings +from typing import List, Optional, Union + +import numpy as np +import torch + +import PIL +from tqdm.auto import tqdm +from transformers import CLIPFeatureExtractor, CLIPTokenizer + +from ...onnx_utils import OnnxRuntimeModel + +from ...configuration_utils import FrozenDict +from ...models import AutoencoderKL +from ...pipeline_utils import DiffusionPipeline +from ...schedulers import DDIMScheduler, PNDMScheduler +from ...utils import logging +from . 
import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)
+
+
+def preprocess_image(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask):
+    mask = mask.convert("L")
+    w, h = mask.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = np.array(mask).astype(np.float32) / 255.0
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # add a batch dimension; this transpose is an identity (no-op)
+    mask = 1 - mask  # repaint white, keep black
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+class StableDiffusionInpaintOnnxPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae_decoder: OnnxRuntimeModel,
+        vae: AutoencoderKL,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        scheduler = scheduler.set_format("np")
+        logger.info("`StableDiffusionInpaintPipeline` is experimental and will very likely change in the future.")
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            warnings.warn(
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}.
Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file", + DeprecationWarning, + ) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae_decoder=vae_decoder, + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, + `attention_head_dim` must be a multiple of `slice_size`. + """ + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = self.unet.config.attention_head_dim // 2 + self.unet.set_attention_slice(slice_size) + + def disable_attention_slicing(self): + r""" + Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go + back to computing attention in one step. + """ + # set slice_size = `None` to disable `set_attention_slice` + self.enable_attention_slicing(None) + + def __call__( + self, + prompt: Union[str, List[str]], + init_image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image], + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + init_image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. This is the image whose masked region will be inpainted. + mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be + replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a + PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should + contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` + is 1, the denoising process will be run on the masked area for the full number of iterations specified + in `num_inference_steps`. 
`init_image` will be used as a reference for the masked area, adding more + noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur. + num_inference_steps (`int`, *optional*, defaults to 50): + The reference number of denoising steps. More denoising steps usually lead to a higher quality image at + the expense of slower inference. This parameter will be modulated by `strength`, as explained above. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. 
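+
+        Example of the per-step masking this pipeline applies after each scheduler step (an
+        illustrative sketch; shapes assume 512x512 inputs, which map to 64x64 latents):
+
+        ```py
+        import numpy as np
+
+        latents = np.zeros((1, 4, 64, 64), dtype=np.float32)  # current denoised latents
+        init_latents_proper = np.ones_like(latents)           # original image latents, re-noised to step t
+        mask = np.zeros_like(latents)                         # 1.0 = keep original, 0.0 = repaint
+        latents = (init_latents_proper * mask) + (latents * (1 - mask))
+        ```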
+ """ + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + # preprocess image + if not isinstance(init_image, torch.FloatTensor): + init_image = preprocess_image(init_image) + init_image = init_image.to(self.device) + + # encode the init image into latents and scale the latents + init_latent_dist = self.vae.encode(init_image).latent_dist + init_latents = init_latent_dist.sample(generator=None) + + init_latents = 0.18215 * init_latents + + # Expand init_latents for batch_size + init_latents = np.concatenate([init_latents.detach().numpy()] * batch_size) + init_latents_orig = init_latents + + # preprocess mask + if not isinstance(mask_image, torch.FloatTensor): + mask_image = preprocess_mask(mask_image) + mask_image = mask_image.to(self.device) + mask = np.concatenate([mask_image.detach().numpy()] * batch_size) + + # check sizes + if not mask.shape == init_latents.shape: + raise ValueError("The mask and init_image should be the same size!") + + # get the original timestep using init_timestep + offset = self.scheduler.config.get("steps_offset", 0) + init_timestep = int(num_inference_steps * strength) + offset + init_timestep = min(init_timestep, num_inference_steps) + timesteps = self.scheduler.timesteps[-init_timestep] + timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=self.device) + + # add noise to latents using the timesteps + noise = np.random.randn(*init_latents.shape).astype(np.float32) + init_latents = self.scheduler.add_noise(init_latents, noise, timesteps) + + # get prompt text embeddings + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_embeddings = self.text_encoder(input_ids=text_input.input_ids.astype(np.int32))[0] + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + max_length = text_input.input_ids.shape[-1] + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np" + ) + uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + latents = init_latents + t_start = max(num_inference_steps - init_timestep + offset, 0) + for i, t in tqdm(enumerate(self.scheduler.timesteps[t_start:])): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + + # predict the noise residual + noise_pred = self.unet( + sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings + ) + noise_pred = noise_pred[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # masking + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t) + latents = (init_latents_proper * mask) + (latents * (1 - mask)) + + # scale and decode the image latents with vae + latents = 1 / 0.18215 * latents + image = self.vae_decoder(latent_sample=latents)[0] + + image = np.clip(image / 2 + 0.5, 0, 1) + image = image.transpose((0, 2, 3, 1)) + + # run safety checker + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np") + image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image) + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From 0dbf0725fab47b8a9a557b8715cf5b9baf4bd527 Mon Sep 17 00:00:00 2001 From: anton-l Date: Mon, 17 Oct 2022 18:08:11 +0200 Subject: [PATCH 3/5] Export vae_encoder, upgrade img2img, add test --- ...ert_stable_diffusion_checkpoint_to_onnx.py | 7 + src/diffusers/__init__.py | 7 +- src/diffusers/pipelines/__init__.py | 8 +- .../pipelines/stable_diffusion/__init__.py | 6 +- ...x.py => pipeline_onnx_stable_diffusion.py} | 2 + ...pipeline_onnx_stable_diffusion_img2img.py} | 246 +++++++++++------- ...pipeline_onnx_stable_diffusion_inpaint.py} | 7 +- tests/test_pipelines.py | 32 +++ 8 files changed, 207 insertions(+), 108 deletions(-) rename src/diffusers/pipelines/stable_diffusion/{pipeline_stable_diffusion_onnx.py => pipeline_onnx_stable_diffusion.py} (99%) rename src/diffusers/pipelines/stable_diffusion/{pipeline_stable_diffusion_img2img_onnx.py => pipeline_onnx_stable_diffusion_img2img.py} (58%) rename src/diffusers/pipelines/stable_diffusion/{pipeline_stable_diffusion_inpaint_onnx.py => pipeline_onnx_stable_diffusion_inpaint.py} (98%) diff --git a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py index a388c0d078dc..fb11616fc3b5 100644 --- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py +++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py @@ -93,6 +93,7 @@ def convert_models(model_path: str, output_path: str, opset: int): }, opset=opset, ) + del pipeline.text_encoder # UNET unet_path = output_path / "unet" / "model.onnx" @@ -125,6 +126,7 @@ def convert_models(model_path: str, output_path: str, opset: int): 
location="weights.pb", convert_attribute=False, ) + del pipeline.unet # VAE ENCODER vae_encoder = pipeline.vae @@ -157,6 +159,7 @@ def convert_models(model_path: str, output_path: str, opset: int): }, opset=opset, ) + del pipeline.vae # SAFETY CHECKER safety_checker = pipeline.safety_checker @@ -173,8 +176,10 @@ def convert_models(model_path: str, output_path: str, opset: int): }, opset=opset, ) + del pipeline.safety_checker onnx_pipeline = StableDiffusionOnnxPipeline( + vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"), vae_decoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_decoder"), text_encoder=OnnxRuntimeModel.from_pretrained(output_path / "text_encoder"), tokenizer=pipeline.tokenizer, @@ -187,6 +192,8 @@ def convert_models(model_path: str, output_path: str, opset: int): onnx_pipeline.save_pretrained(output_path) print("ONNX pipeline saved to", output_path) + del pipeline + del onnx_pipeline _ = StableDiffusionOnnxPipeline.from_pretrained(output_path, provider="CPUExecutionProvider") print("ONNX pipeline is loadable") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f64c1e538007..f205f548540f 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -58,8 +58,11 @@ from .utils.dummy_torch_and_transformers_objects import * # noqa F403 if is_torch_available() and is_transformers_available() and is_onnx_available(): - from .pipelines import StableDiffusionOnnxPipeline - from .pipelines import StableDiffusionInpaintOnnxPipeline + from .pipelines import ( + StableDiffusionImg2ImgOnnxPipeline, + StableDiffusionInpaintOnnxPipeline, + StableDiffusionOnnxPipeline, + ) else: from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 7df8e3f16308..c131eb7fa05b 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -20,9 +20,11 @@ ) if is_transformers_available() and is_onnx_available(): - from .stable_diffusion import StableDiffusionOnnxPipeline - from .stable_diffusion import StableDiffusionImg2ImgOnnxPipeline - from .stable_diffusion import StableDiffusionInpaintOnnxPipeline + from .stable_diffusion import ( + StableDiffusionImg2ImgOnnxPipeline, + StableDiffusionInpaintOnnxPipeline, + StableDiffusionOnnxPipeline, + ) if is_transformers_available() and is_flax_available(): from .stable_diffusion import FlaxStableDiffusionPipeline diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index c2e9410764c8..9755120ca8aa 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -34,9 +34,9 @@ class StableDiffusionPipelineOutput(BaseOutput): from .safety_checker import StableDiffusionSafetyChecker if is_transformers_available() and is_onnx_available(): - from .pipeline_stable_diffusion_onnx import StableDiffusionOnnxPipeline - from .pipeline_stable_diffusion_img2img_onnx import StableDiffusionImg2ImgOnnxPipeline - from .pipeline_stable_diffusion_inpaint_onnx import StableDiffusionInpaintOnnxPipeline + from .pipeline_onnx_stable_diffusion import StableDiffusionOnnxPipeline + from .pipeline_onnx_stable_diffusion_img2img import StableDiffusionImg2ImgOnnxPipeline + from .pipeline_onnx_stable_diffusion_inpaint import StableDiffusionInpaintOnnxPipeline if is_transformers_available() and is_flax_available(): import flax diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_onnx.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py similarity index 99% rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_onnx.py rename to src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py index acd6446af6fe..ef69f8e6ac3f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_onnx.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py @@ -26,6 +26,7 @@ class StableDiffusionOnnxPipeline(DiffusionPipeline): def __init__( self, + vae_encoder: OnnxRuntimeModel, vae_decoder: OnnxRuntimeModel, text_encoder: OnnxRuntimeModel, tokenizer: CLIPTokenizer, @@ -36,6 +37,7 @@ def __init__( ): super().__init__() self.register_modules( + vae_encoder=vae_encoder, vae_decoder=vae_decoder, text_encoder=text_encoder, tokenizer=tokenizer, diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img_onnx.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py similarity index 58% rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img_onnx.py rename to src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 39649d0f7283..74a76c55b618 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img_onnx.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -1,6 +1,5 @@ import inspect -import warnings -from typing import List, Optional, Union +from typing import Callable, List, Optional, Union import numpy as np import torch @@ -8,14 +7,15 @@ import PIL from transformers import CLIPFeatureExtractor, CLIPTokenizer -from ...onnx_utils import OnnxRuntimeModel - from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL +from ...onnx_utils import OnnxRuntimeModel from ...pipeline_utils import DiffusionPipeline from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ...utils import deprecate, logging from . import StableDiffusionPipelineOutput -from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name def preprocess(image): @@ -24,7 +24,6 @@ def preprocess(image): image = image.resize((w, h), resample=PIL.Image.LANCZOS) image = np.array(image).astype(np.float32) / 255.0 image = image[None].transpose(0, 3, 1, 2) - image = torch.from_numpy(image) return 2.0 * image - 1.0 @@ -50,16 +49,24 @@ class StableDiffusionImg2ImgOnnxPipeline(DiffusionPipeline): A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offsensive or harmful. + Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. feature_extractor ([`CLIPFeatureExtractor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" + vae_encoder: OnnxRuntimeModel + vae_decoder: OnnxRuntimeModel + text_encoder: OnnxRuntimeModel + tokenizer: CLIPTokenizer + unet: OnnxRuntimeModel + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler] + safety_checker: OnnxRuntimeModel + feature_extractor: CLIPFeatureExtractor def __init__( self, + vae_encoder: OnnxRuntimeModel, vae_decoder: OnnxRuntimeModel, - vae: AutoencoderKL, text_encoder: OnnxRuntimeModel, tokenizer: CLIPTokenizer, unet: OnnxRuntimeModel, @@ -68,25 +75,34 @@ def __init__( feature_extractor: CLIPFeatureExtractor, ): super().__init__() - scheduler = scheduler.set_format("np") if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: - warnings.warn( + deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" - f" should be set to 1 istead of {scheduler.config.steps_offset}. Please make sure " + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file", - DeprecationWarning, + " file" ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) + if safety_checker is None: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + self.register_modules( + vae_encoder=vae_encoder, vae_decoder=vae_decoder, - vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, @@ -95,43 +111,21 @@ def __init__( feature_extractor=feature_extractor, ) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, - `attention_head_dim` must be a multiple of `slice_size`. - """ - if slice_size == "auto": - # half the attention head size is usually a good trade-off between - # speed and memory - slice_size = self.unet.config.attention_head_dim // 2 - self.unet.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. 
- """ - # set slice_size = `None` to disable `set_attention_slice` - self.enable_attention_slicing(None) - def __call__( self, prompt: Union[str, List[str]], - init_image: Union[torch.FloatTensor, PIL.Image.Image], + init_image: Union[np.ndarray, PIL.Image.Image], strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, eta: Optional[float] = 0.0, output_type: Optional[str] = "pil", return_dict: bool = True, + callback: Optional[Callable[[int, int, np.ndarray], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, ): r""" Function invoked when calling the pipeline for generation. @@ -139,7 +133,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - init_image (`torch.FloatTensor` or `PIL.Image.Image`): + init_image (`np.ndarray` or `PIL.Image.Image`): `Image`, or tensor representing an image batch, that will be used as the starting point for the process. strength (`float`, *optional*, defaults to 0.8): @@ -157,6 +151,11 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. eta (`float`, *optional*, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. @@ -166,6 +165,12 @@ def __call__( return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: @@ -184,43 +189,40 @@ def __call__( if strength < 0 or strength > 1: raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + # set timesteps self.scheduler.set_timesteps(num_inference_steps) if isinstance(init_image, PIL.Image.Image): init_image = preprocess(init_image) - # encode the init image into latents and scale the latents - init_latent_dist = self.vae.encode(init_image.to(self.device)).latent_dist - init_latents = init_latent_dist.sample(generator=None) - init_latents = 0.18215 * init_latents - - # expand init_latents for batch_size - init_latents = np.concatenate([init_latents.detach().numpy()] * batch_size) - - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - if isinstance(self.scheduler, LMSDiscreteScheduler): - timesteps = num_inference_steps - init_timestep - else: - timesteps = self.scheduler.timesteps[-init_timestep] - - # add noise to latents using the timesteps - noise = np.random.randn(*init_latents.shape).astype(np.float32) - - init_latents = self.scheduler.add_noise(init_latents, noise, timesteps).to(self.device) - # get prompt text embeddings - text_input = self.tokenizer( + text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - truncation=True, return_tensors="np", ) - text_embeddings = self.text_encoder(input_ids=text_input.input_ids.astype(np.int32))[0] + text_input_ids = text_inputs.input_ids + + if text_input_ids.shape[-1] > self.tokenizer.model_max_length: + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] + + # duplicate text embeddings for each generation per prompt + text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -228,17 +230,76 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: - max_length = text_input.input_ids.shape[-1] + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError("The length of `negative_prompt` should be equal to batch_size.") + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] uncond_input = self.tokenizer( - [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np" + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", ) - uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + uncond_input_ids = uncond_input.input_ids + uncond_embeddings = self.text_encoder(input_ids=uncond_input_ids.astype(np.int32))[0] + + # duplicate unconditional embeddings for each generation per prompt + uncond_embeddings = np.repeat(uncond_embeddings, batch_size * num_images_per_prompt, axis=0) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) + # encode the init image into latents and scale the latents + init_latents = self.vae_encoder(sample=init_image)[0] + init_latents = 0.18215 * init_latents + + if isinstance(prompt, str): + prompt = [prompt] + if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many init images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = len(prompt) // init_latents.shape[0] + init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) + elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." + ) + else: + init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) + + # get the original timestep using init_timestep + offset = self.scheduler.config.get("steps_offset", 0) + init_timestep = int(num_inference_steps * strength) + offset + init_timestep = min(init_timestep, num_inference_steps) + + timesteps = self.scheduler.timesteps[-init_timestep] + timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device) + + # add noise to latents using the timesteps + noise = np.random.randn(*init_latents.shape).astype(np.float32) + init_latents = self.scheduler.add_noise(torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps) + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 @@ -251,26 +312,20 @@ def __call__( latents = init_latents t_start = max(num_inference_steps - init_timestep + offset, 0) - for i, t in enumerate(self.progress_bar(self.scheduler.timesteps[t_start:])): - t_index = t_start + i + # Some schedulers like PNDM have timesteps as arrays + # It's more optimized to move all timesteps to correct device beforehand + timesteps = self.scheduler.timesteps[t_start:].to(self.device) + + for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - - # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas - if isinstance(self.scheduler, LMSDiscreteScheduler): - sigma = self.scheduler.sigmas[t_index] - # the model input needs to be scaled to match the continuous ODE formulation in K-LMS - latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5) - latent_model_input = latent_model_input.to(self.unet.dtype) - t = t.to(self.unet.dtype) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings - ) - noise_pred = noise_pred[0] - + )[0] # perform guidance if do_classifier_free_guidance: @@ -278,23 +333,24 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - if isinstance(self.scheduler, LMSDiscreteScheduler): - latents = self.scheduler.step(noise_pred, t_index, latents, **extra_step_kwargs).prev_sample - else: - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = np.array(latents) + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) - # scale and decode the image latents with vae latents = 1 / 0.18215 * latents image = self.vae_decoder(latent_sample=latents)[0] image = np.clip(image / 2 + 0.5, 0, 1) image = image.transpose((0, 2, 3, 1)) - # run safety checker - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np") - image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image) - - + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np") + image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image) + else: + has_nsfw_concept = None if output_type == "pil": image = self.numpy_to_pil(image) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py similarity index 98% rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py rename to src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py index 86085a9d6212..0d8ca2f3a348 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py @@ -9,15 +9,13 @@ from tqdm.auto import tqdm from transformers import 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
similarity index 98%
rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py
rename to src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 86085a9d6212..0d8ca2f3a348 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_onnx.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -9,15 +9,13 @@
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
-from ...onnx_utils import OnnxRuntimeModel
-
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL
+from ...onnx_utils import OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, PNDMScheduler
 from ...utils import logging
 from . import StableDiffusionPipelineOutput
-from .safety_checker import StableDiffusionSafetyChecker
 
 
 logger = logging.get_logger(__name__)
@@ -86,8 +84,7 @@ def __init__(
         feature_extractor: CLIPFeatureExtractor,
     ):
         super().__init__()
-        scheduler = scheduler.set_format("np")
-        logger.info("`StableDiffusionInpaintPipeline` is experimental and will very likely change in the future.")
+        logger.info("`StableDiffusionInpaintOnnxPipeline` is experimental and will very likely change in the future.")
 
         if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
             warnings.warn(
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index a666996e11ff..2d3b47f6cf19 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -41,6 +41,7 @@
     PNDMScheduler,
     ScoreSdeVePipeline,
     ScoreSdeVeScheduler,
+    StableDiffusionImg2ImgOnnxPipeline,
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipeline,
     StableDiffusionOnnxPipeline,
@@ -2025,6 +2026,37 @@ def test_stable_diffusion_onnx(self):
         expected_slice = np.array([0.3602, 0.3688, 0.3652, 0.3895, 0.3782, 0.3747, 0.3927, 0.4241, 0.4327])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
+    @slow
+    def test_stable_diffusion_img2img_onnx(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+
+        pipe = StableDiffusionImg2ImgOnnxPipeline.from_pretrained(
+            "../scripts/sd_onnx", provider="CPUExecutionProvider"
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        np.random.seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=8,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, -3:, -3:, -1]
+
+        assert images.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.0956, 0.0899, 0.1057, 0.0853, 0.0938, 0.1004, 0.1158, 0.1258, 0.1158])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
     @slow
     @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
     def test_stable_diffusion_text2img_intermediate_state(self):
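Outside the test harness, the new img2img pipeline is driven the same way the test above does it. A minimal usage sketch, assuming a local ONNX export of Stable Diffusion at a placeholder path (mirroring the `../scripts/sd_onnx` directory the test loads; the class is renamed to `OnnxStableDiffusionImg2ImgPipeline` in PATCH 4/5 below):

```python
import numpy as np
from PIL import Image

from diffusers import StableDiffusionImg2ImgOnnxPipeline

# "./sd_onnx" is a placeholder for a local ONNX export of Stable Diffusion
pipe = StableDiffusionImg2ImgOnnxPipeline.from_pretrained(
    "./sd_onnx", provider="CPUExecutionProvider"
)

init_image = Image.open("sketch-mountains-input.jpg").convert("RGB").resize((768, 512))

np.random.seed(0)  # the ONNX pipeline draws its noise from numpy, not torch
image = pipe(
    prompt="A fantasy landscape, trending on artstation",
    init_image=init_image,
    strength=0.75,
    guidance_scale=7.5,
    num_inference_steps=50,
).images[0]
image.save("fantasy_landscape.png")
```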
From f41712cb3b065cfef8011e0e9773480a5a9f66bb Mon Sep 17 00:00:00 2001
From: anton-l
Date: Mon, 17 Oct 2022 20:57:40 +0200
Subject: [PATCH 4/5] updated inpainting pipeline + test

---
 src/diffusers/__init__.py                     |   4 +-
 src/diffusers/pipelines/__init__.py           |   4 +-
 .../pipelines/stable_diffusion/__init__.py    |   4 +-
 .../pipeline_onnx_stable_diffusion_img2img.py |   4 +-
 .../pipeline_onnx_stable_diffusion_inpaint.py | 252 +++++++++++-------
 tests/test_pipelines.py                       |  46 +++-
 6 files changed, 206 insertions(+), 108 deletions(-)

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index f205f548540f..0e2c1654476d 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -59,8 +59,8 @@
 
 if is_torch_available() and is_transformers_available() and is_onnx_available():
     from .pipelines import (
-        StableDiffusionImg2ImgOnnxPipeline,
-        StableDiffusionInpaintOnnxPipeline,
+        OnnxStableDiffusionImg2ImgPipeline,
+        OnnxStableDiffusionInpaintPipeline,
         StableDiffusionOnnxPipeline,
     )
 else:
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index c131eb7fa05b..2cde0ce600c7 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -21,8 +21,8 @@
 
 if is_transformers_available() and is_onnx_available():
     from .stable_diffusion import (
-        StableDiffusionImg2ImgOnnxPipeline,
-        StableDiffusionInpaintOnnxPipeline,
+        OnnxStableDiffusionImg2ImgPipeline,
+        OnnxStableDiffusionInpaintPipeline,
         StableDiffusionOnnxPipeline,
     )
 
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 9755120ca8aa..ddb1708d6f5d 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -35,8 +35,8 @@ class StableDiffusionPipelineOutput(BaseOutput):
 
 if is_transformers_available() and is_onnx_available():
     from .pipeline_onnx_stable_diffusion import StableDiffusionOnnxPipeline
-    from .pipeline_onnx_stable_diffusion_img2img import StableDiffusionImg2ImgOnnxPipeline
-    from .pipeline_onnx_stable_diffusion_inpaint import StableDiffusionInpaintOnnxPipeline
+    from .pipeline_onnx_stable_diffusion_img2img import OnnxStableDiffusionImg2ImgPipeline
+    from .pipeline_onnx_stable_diffusion_inpaint import OnnxStableDiffusionInpaintPipeline
 
 if is_transformers_available() and is_flax_available():
     import flax
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 74a76c55b618..c7fcef7aa9a0 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -27,7 +27,7 @@ def preprocess(image):
     return 2.0 * image - 1.0
 
 
-class StableDiffusionImg2ImgOnnxPipeline(DiffusionPipeline):
+class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-guided image to image generation using Stable Diffusion.
 
@@ -334,7 +334,7 @@ def __call__(
 
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-            latents = np.array(latents)
+            latents = latents.numpy()
 
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:
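The `latents.numpy()` change above reflects that the scheduler now works in torch and returns a `torch.Tensor`, while the ONNX sessions consume `np.ndarray`s. A small sketch of that boundary with the conversions written out explicitly (the pipeline itself relies on the scheduler accepting numpy inputs, so treat this as one possible shape, not the pipeline's exact code):

```python
import numpy as np
import torch

def step_in_torch(scheduler, noise_pred: np.ndarray, t, latents: np.ndarray) -> np.ndarray:
    # torch.from_numpy is a zero-copy view, so crossing the numpy/torch
    # boundary around the scheduler step is cheap
    prev = scheduler.step(
        torch.from_numpy(noise_pred), t, torch.from_numpy(latents)
    ).prev_sample
    return prev.numpy()
```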
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 0d8ca2f3a348..694d108a8ff5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -1,6 +1,5 @@
 import inspect
-import warnings
-from typing import List, Optional, Union
+from typing import Callable, List, Optional, Union
 
 import numpy as np
 import torch
@@ -10,12 +9,12 @@
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
-from ...models import AutoencoderKL
 from ...onnx_utils import OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
-from ...schedulers import DDIMScheduler, PNDMScheduler
-from ...utils import logging
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
 from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
 
 
 logger = logging.get_logger(__name__)
@@ -27,7 +26,6 @@ def preprocess_image(image):
     image = image.resize((w, h), resample=PIL.Image.LANCZOS)
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image)
     return 2.0 * image - 1.0
 
 
@@ -40,11 +38,10 @@ def preprocess_mask(mask):
     mask = np.tile(mask, (4, 1, 1))
     mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
     mask = 1 - mask  # repaint white, keep black
-    mask = torch.from_numpy(mask)
     return mask
 
 
-class StableDiffusionInpaintOnnxPipeline(DiffusionPipeline):
+class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
 
@@ -66,43 +63,61 @@ class StableDiffusionInpaintOnnxPipeline(DiffusionPipeline):
             A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offsensive or harmful.
+            Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPFeatureExtractor
 
     def __init__(
         self,
+        vae_encoder: OnnxRuntimeModel,
         vae_decoder: OnnxRuntimeModel,
-        vae: AutoencoderKL,
         text_encoder: OnnxRuntimeModel,
         tokenizer: CLIPTokenizer,
         unet: OnnxRuntimeModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler],
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
         safety_checker: OnnxRuntimeModel,
         feature_extractor: CLIPFeatureExtractor,
     ):
        super().__init__()
-        logger.info("`StableDiffusionInpaintOnnxPipeline` is experimental and will very likely change in the future.")
+        logger.info("`OnnxStableDiffusionInpaintPipeline` is experimental and will very likely change in the future.")
 
         if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
-            warnings.warn(
+            deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 istead of {scheduler.config.steps_offset}. Please make sure "
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
                 "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
                 " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
                 " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file",
-                DeprecationWarning,
+                " file"
             )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
         self.register_modules(
+            vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
-            vae=vae,
             text_encoder=text_encoder,
             tokenizer=tokenizer,
             unet=unet,
@@ -111,44 +126,23 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
-    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
-        r"""
-        Enable sliced attention computation.
-
-        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
-        in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
-        Args:
-            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
-                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
-                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
-                `attention_head_dim` must be a multiple of `slice_size`.
-        """
-        if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
-        self.unet.set_attention_slice(slice_size)
-
-    def disable_attention_slicing(self):
-        r"""
-        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
-        back to computing attention in one step.
-        """
-        # set slice_size = `None` to disable `set_attention_slice`
-        self.enable_attention_slicing(None)
-
+    @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
-        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+        init_image: Union[np.ndarray, PIL.Image.Image],
+        mask_image: Union[np.ndarray, PIL.Image.Image],
         strength: float = 0.8,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
         eta: Optional[float] = 0.0,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
 
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            init_image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
-            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            mask_image (`np.ndarray` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
@@ -178,6 +172,11 @@ def __call__(
                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
@@ -187,6 +186,12 @@ def __call__(
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                 plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
 
         Returns:
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
@@ -205,29 +210,98 @@ def __call__(
         if strength < 0 or strength > 1:
             raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
 
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="np",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            uncond_input_ids = uncond_input.input_ids
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input_ids.astype(np.int32))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = np.repeat(uncond_embeddings, batch_size * num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
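The ordering convention established here matters later in the loop: the unconditional half comes first, so the batched UNet output can be split back into `noise_pred_uncond, noise_pred_text` in the same order. A toy sketch of that layout, with stand-in arrays in place of real CLIP embeddings:

```python
import numpy as np

# stand-ins for CLIP text embeddings: 2 prompts x 77 tokens x 768 dims
uncond_embeddings = np.zeros((2, 77, 768), dtype=np.float32)
text_embeddings = np.ones((2, 77, 768), dtype=np.float32)

# unconditional half first, conditional half second
cfg_batch = np.concatenate([uncond_embeddings, text_embeddings])
assert cfg_batch.shape == (4, 77, 768)

# anything computed from such a batch splits back in the same order
first_half, second_half = np.split(cfg_batch, 2)
assert (first_half == 0).all() and (second_half == 1).all()
```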
         # preprocess image
         if not isinstance(init_image, torch.FloatTensor):
             init_image = preprocess_image(init_image)
-        init_image = init_image.to(self.device)
 
         # encode the init image into latents and scale the latents
-        init_latent_dist = self.vae.encode(init_image).latent_dist
-        init_latents = init_latent_dist.sample(generator=None)
-
+        init_latents = self.vae_encoder(sample=init_image)[0]
         init_latents = 0.18215 * init_latents
 
-        # Expand init_latents for batch_size
-        init_latents = np.concatenate([init_latents.detach().numpy()] * batch_size)
+        # Expand init_latents for batch_size and num_images_per_prompt
+        init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt, axis=0)
         init_latents_orig = init_latents
 
         # preprocess mask
-        if not isinstance(mask_image, torch.FloatTensor):
+        if not isinstance(mask_image, np.ndarray):
             mask_image = preprocess_mask(mask_image)
-        mask_image = mask_image.to(self.device)
-        mask = np.concatenate([mask_image.detach().numpy()] * batch_size)
+        mask = np.concatenate([mask_image] * batch_size * num_images_per_prompt)
 
         # check sizes
         if not mask.shape == init_latents.shape:
@@ -237,39 +311,13 @@ def __call__(
         offset = self.scheduler.config.get("steps_offset", 0)
         init_timestep = int(num_inference_steps * strength) + offset
         init_timestep = min(init_timestep, num_inference_steps)
+
         timesteps = self.scheduler.timesteps[-init_timestep]
-        timesteps = torch.tensor([timesteps] * batch_size, dtype=torch.long, device=self.device)
+        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
 
         # add noise to latents using the timesteps
         noise = np.random.randn(*init_latents.shape).astype(np.float32)
-        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
-
-        # get prompt text embeddings
-        text_input = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="np",
-        )
-        text_embeddings = self.text_encoder(input_ids=text_input.input_ids.astype(np.int32))[0]
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            max_length = text_input.input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
-            )
-            uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+        init_latents = self.scheduler.add_noise(torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps)
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
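Both ONNX pipelines hard-code the Stable Diffusion v1 latent scale of 0.18215: multiply after `vae_encoder`, divide before `vae_decoder`. A sketch of the round trip; the encoder/decoder parameters stand in for the ONNX sessions and their `sample`/`latent_sample` inputs as used above:

```python
import numpy as np

SD_V1_LATENT_SCALE = 0.18215  # constant used by both pipelines above

def encode_image(vae_encoder, image: np.ndarray) -> np.ndarray:
    # image: (1, 3, H, W) in [-1, 1] -> latents: (1, 4, H // 8, W // 8)
    latents = vae_encoder(sample=image)[0]
    return SD_V1_LATENT_SCALE * latents

def decode_latents(vae_decoder, latents: np.ndarray) -> np.ndarray:
    image = vae_decoder(latent_sample=latents / SD_V1_LATENT_SCALE)[0]
    return np.clip(image / 2 + 0.5, 0, 1)  # NCHW in [0, 1]
```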
@@ -281,16 +329,22 @@ def __call__(
             extra_step_kwargs["eta"] = eta
 
         latents = init_latents
+
         t_start = max(num_inference_steps - init_timestep + offset, 0)
-        for i, t in tqdm(enumerate(self.scheduler.timesteps[t_start:])):
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+
+        for i, t in tqdm(enumerate(timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
             # predict the noise residual
             noise_pred = self.unet(
                 sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
-            )
-            noise_pred = noise_pred[0]
+            )[0]
 
             # perform guidance
             if do_classifier_free_guidance:
@@ -299,21 +353,29 @@ def __call__(
 
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
+            latents = latents.numpy()
 
             # masking
-            init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t)
+            init_latents_proper = self.scheduler.add_noise(
+                torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.tensor([t])
+            )
+
             latents = (init_latents_proper * mask) + (latents * (1 - mask))
 
-        # scale and decode the image latents with vae
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
         latents = 1 / 0.18215 * latents
         image = self.vae_decoder(latent_sample=latents)[0]
 
         image = np.clip(image / 2 + 0.5, 0, 1)
         image = image.transpose((0, 2, 3, 1))
 
-        # run safety checker
-        safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
-        image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
+            image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
+        else:
+            has_nsfw_concept = None
 
         if output_type == "pil":
             image = self.numpy_to_pil(image)
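The masking step inside the loop above is what makes this inpainting rather than plain img2img: after every denoising step, the known region is overwritten with an appropriately re-noised copy of the original latents, so only the masked region is actually generated. A toy sketch of the blend, where `mask == 1` marks latents to keep from the original:

```python
import numpy as np

def blend_known_region(latents, init_latents_noised, mask):
    # mask == 1: keep the (re-noised) original latents
    # mask == 0: keep the freshly denoised latents, i.e. the inpainted region
    return init_latents_noised * mask + latents * (1 - mask)

# toy example on 2x2 "latents"
latents = np.full((2, 2), 5.0)       # freshly denoised values
original = np.zeros((2, 2))          # re-noised original values
mask = np.array([[1.0, 0.0], [0.0, 1.0]])
print(blend_known_region(latents, original, mask))  # [[0., 5.], [5., 0.]]
```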
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 2d3b47f6cf19..e7af2d37f582 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -37,11 +37,12 @@
     LDMPipeline,
     LDMTextToImagePipeline,
     LMSDiscreteScheduler,
+    OnnxStableDiffusionImg2ImgPipeline,
+    OnnxStableDiffusionInpaintPipeline,
     PNDMPipeline,
     PNDMScheduler,
     ScoreSdeVePipeline,
     ScoreSdeVeScheduler,
-    StableDiffusionImg2ImgOnnxPipeline,
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipeline,
     StableDiffusionOnnxPipeline,
@@ -2034,8 +2035,8 @@ def test_stable_diffusion_img2img_onnx(self):
         )
         init_image = init_image.resize((768, 512))
 
-        pipe = StableDiffusionImg2ImgOnnxPipeline.from_pretrained(
-            "../scripts/sd_onnx", provider="CPUExecutionProvider"
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
         )
         pipe.set_progress_bar_config(disable=None)
 
@@ -2051,10 +2052,45 @@ def test_stable_diffusion_img2img_onnx(self):
             output_type="np",
         )
         images = output.images
-        image_slice = images[0, -3:, -3:, -1]
+        image_slice = images[0, 255:258, 383:386, -1]
 
         assert images.shape == (1, 512, 768, 3)
-        expected_slice = np.array([0.0956, 0.0899, 0.1057, 0.0853, 0.0938, 0.1004, 0.1158, 0.1258, 0.1158])
+        expected_slice = np.array([[0.4806, 0.5125, 0.5453, 0.4846, 0.4984, 0.4955, 0.4830, 0.4962, 0.4969]])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    @slow
+    def test_stable_diffusion_inpaint_onnx(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+
+        pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A red cat sitting on a park bench"
+
+        np.random.seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            mask_image=mask_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=8,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 255:258, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.3524, 0.3289, 0.3464, 0.3872, 0.4129, 0.3566, 0.3709, 0.4128, 0.3734])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     @slow

From ee850c7834b0fbec325c3399f7b357f4a688d97a Mon Sep 17 00:00:00 2001
From: anton-l
Date: Mon, 17 Oct 2022 20:58:18 +0200
Subject: [PATCH 5/5] style

---
 .../stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 694d108a8ff5..1ff34dd38378 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -14,7 +14,6 @@
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import deprecate, logging
 from . import StableDiffusionPipelineOutput
-from .safety_checker import StableDiffusionSafetyChecker
 
 
 logger = logging.get_logger(__name__)
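With the series applied, the finished inpainting pipeline can be exercised the same way the new test does. A minimal usage sketch, assuming the ONNX weights published under the `onnx` revision of `CompVis/stable-diffusion-v1-4` (the same checkpoint the tests load); the image-download helper is our own, not part of diffusers:

```python
from io import BytesIO

import requests
from PIL import Image

from diffusers import OnnxStableDiffusionInpaintPipeline

def download(url: str) -> Image.Image:
    return Image.open(BytesIO(requests.get(url).content)).convert("RGB")

img_root = (
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint"
)
init_image = download(f"{img_root}/overture-creations-5sI6fQgYIuo.png")
mask_image = download(f"{img_root}/overture-creations-5sI6fQgYIuo_mask.png")

pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
)

image = pipe(
    prompt="A red cat sitting on a park bench",
    init_image=init_image,
    mask_image=mask_image,
    strength=0.75,
    guidance_scale=7.5,
).images[0]
image.save("red_cat_on_bench.png")
```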