diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 29e085fbeb7c..0c05f0ef7ffa 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -266,10 +266,6 @@ title: ControlNet - local: api/pipelines/controlnet_sdxl title: ControlNet with Stable Diffusion XL - - local: api/pipelines/controlnetxs - title: ControlNet-XS - - local: api/pipelines/controlnetxs_sdxl - title: ControlNet-XS with Stable Diffusion XL - local: api/pipelines/dance_diffusion title: Dance Diffusion - local: api/pipelines/ddim diff --git a/docs/source/en/api/pipelines/controlnetxs.md b/examples/research_projects/controlnetxs/README.md similarity index 61% rename from docs/source/en/api/pipelines/controlnetxs.md rename to examples/research_projects/controlnetxs/README.md index 2d4ae7b8ce46..72ed91c01db2 100644 --- a/docs/source/en/api/pipelines/controlnetxs.md +++ b/examples/research_projects/controlnetxs/README.md @@ -1,15 +1,3 @@ -<!--Copyright 2023 The HuggingFace Team. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with -the License. You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on -an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. --> - # ControlNet-XS ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results. @@ -24,16 +12,5 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️ -<Tip> - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - -</Tip> - -## StableDiffusionControlNetXSPipeline -[[autodoc]] StableDiffusionControlNetXSPipeline - - all - - __call__ -## StableDiffusionPipelineOutput -[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput +> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. \ No newline at end of file diff --git a/docs/source/en/api/pipelines/controlnetxs_sdxl.md b/examples/research_projects/controlnetxs/README_sdxl.md similarity index 56% rename from docs/source/en/api/pipelines/controlnetxs_sdxl.md rename to examples/research_projects/controlnetxs/README_sdxl.md index 31075c0ef96a..d401c1e76698 100644 --- a/docs/source/en/api/pipelines/controlnetxs_sdxl.md +++ b/examples/research_projects/controlnetxs/README_sdxl.md @@ -1,15 +1,3 @@ -<!--Copyright 2023 The HuggingFace Team. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with -the License. You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on -an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. --> - # ControlNet-XS with Stable Diffusion XL ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results. @@ -24,22 +12,4 @@ Here's the overview from the [project page](https://vislearn.github.io/ControlNe This model was contributed by [UmerHA](https://twitter.com/UmerHAdil).
❤️ -<Tip warning={true}> - -🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve! - -</Tip> - -<Tip> - -Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. - -</Tip> - -## StableDiffusionXLControlNetXSPipeline -[[autodoc]] StableDiffusionXLControlNetXSPipeline - - all - - __call__ - -## StableDiffusionPipelineOutput -[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput +> 🧠 Make sure to check out the Schedulers [guide](https://huggingface.co/docs/diffusers/main/en/using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. \ No newline at end of file diff --git a/src/diffusers/models/controlnetxs.py b/examples/research_projects/controlnetxs/controlnetxs.py similarity index 98% rename from src/diffusers/models/controlnetxs.py rename to examples/research_projects/controlnetxs/controlnetxs.py index 41fe624b9b59..c6419b44daeb 100644 --- a/src/diffusers/models/controlnetxs.py +++ b/examples/research_projects/controlnetxs/controlnetxs.py @@ -21,13 +21,12 @@ from torch.nn import functional as F from torch.nn.modules.normalization import GroupNorm -from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput, logging -from .attention_processor import USE_PEFT_BACKEND, AttentionProcessor -from .autoencoders import AutoencoderKL -from .lora import LoRACompatibleConv -from .modeling_utils import ModelMixin -from .unet_2d_blocks import ( +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention_processor import USE_PEFT_BACKEND, AttentionProcessor +from diffusers.models.autoencoders import AutoencoderKL +from diffusers.models.lora import LoRACompatibleConv +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.unet_2d_blocks import ( CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, @@ -37,7 +36,8 @@ UpBlock2D, Upsample2D, ) -from .unet_2d_condition import UNet2DConditionModel +from diffusers.models.unet_2d_condition import UNet2DConditionModel +from diffusers.utils import BaseOutput, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/examples/research_projects/controlnetxs/infer_sd_controlnetxs.py b/examples/research_projects/controlnetxs/infer_sd_controlnetxs.py new file mode 100644 index 000000000000..722b282a3251 --- /dev/null +++ b/examples/research_projects/controlnetxs/infer_sd_controlnetxs.py @@ -0,0 +1,58 @@ +# !pip install opencv-python transformers accelerate +import argparse + +import cv2 +import numpy as np +import torch +from controlnetxs import ControlNetXSModel +from PIL import Image +from pipeline_controlnet_xs import StableDiffusionControlNetXSPipeline + +from diffusers.utils import load_image + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--prompt", type=str, default="aerial
view, a futuristic research complex in a bright foggy jungle, hard lighting" ) +parser.add_argument("--negative_prompt", type=str, default="low quality, bad quality, sketches") +parser.add_argument("--controlnet_conditioning_scale", type=float, default=0.7) +parser.add_argument( + "--image_path", + type=str, + default="https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png", +) +parser.add_argument("--num_inference_steps", type=int, default=50) + +args = parser.parse_args() + +prompt = args.prompt +negative_prompt = args.negative_prompt +# download an image +image = load_image(args.image_path) + +# initialize the models and pipeline +controlnet_conditioning_scale = args.controlnet_conditioning_scale +controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16) +pipe = StableDiffusionControlNetXSPipeline.from_pretrained( + "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float16 +) +pipe.enable_model_cpu_offload() + +# get canny image +image = np.array(image) +image = cv2.Canny(image, 100, 200) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image) + +num_inference_steps = args.num_inference_steps + +# generate image +image = pipe( + prompt, + controlnet_conditioning_scale=controlnet_conditioning_scale, + image=canny_image, + num_inference_steps=num_inference_steps, +).images[0] +image.save("cnxs_sd.canny.png") diff --git a/examples/research_projects/controlnetxs/infer_sdxl_controlnetxs.py b/examples/research_projects/controlnetxs/infer_sdxl_controlnetxs.py new file mode 100644 index 000000000000..e5b8cfd88223 --- /dev/null +++ b/examples/research_projects/controlnetxs/infer_sdxl_controlnetxs.py @@ -0,0 +1,57 @@ +# !pip install opencv-python transformers accelerate +import argparse + +import cv2 +import numpy as np +import torch +from controlnetxs import ControlNetXSModel +from PIL import Image +from pipeline_controlnet_xs_sd_xl import StableDiffusionXLControlNetXSPipeline + +from diffusers.utils import load_image + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--prompt", type=str, default="aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" ) +parser.add_argument("--negative_prompt", type=str, default="low quality, bad quality, sketches") +parser.add_argument("--controlnet_conditioning_scale", type=float, default=0.7) +parser.add_argument( + "--image_path", + type=str, + default="https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png", +) +parser.add_argument("--num_inference_steps", type=int, default=50) + +args = parser.parse_args() + +prompt = args.prompt +negative_prompt = args.negative_prompt +# download an image +image = load_image(args.image_path) +# initialize the models and pipeline +controlnet_conditioning_scale = args.controlnet_conditioning_scale +controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-canny", torch_dtype=torch.float16) +pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 +) +pipe.enable_model_cpu_offload() + +# get canny image +image = np.array(image) +image = cv2.Canny(image, 100, 200) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image) + +num_inference_steps = args.num_inference_steps + +# generate image
+image = pipe( + prompt, + controlnet_conditioning_scale=controlnet_conditioning_scale, + image=canny_image, + num_inference_steps=num_inference_steps, +).images[0] +image.save("cnxs_sdxl.canny.png") diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py b/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py similarity index 94% rename from src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py rename to examples/research_projects/controlnetxs/pipeline_controlnet_xs.py index bf3ac5050506..8e95306da584 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs.py +++ b/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py @@ -19,74 +19,30 @@ import PIL.Image import torch import torch.nn.functional as F +from controlnetxs import ControlNetXSModel from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer -from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, ControlNetXSModel, UNet2DConditionModel -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import ( +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( USE_PEFT_BACKEND, deprecate, logging, - replace_example_docstring, scale_lora_layers, unscale_lora_layers, ) -from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput -from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name -EXAMPLE_DOC_STRING = """ - Examples: - ```py - >>> # !pip install opencv-python transformers accelerate - >>> from diffusers import StableDiffusionControlNetXSPipeline, ControlNetXSModel - >>> from diffusers.utils import load_image - >>> import numpy as np - >>> import torch - - >>> import cv2 - >>> from PIL import Image - - >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" - >>> negative_prompt = "low quality, bad quality, sketches" - - >>> # download an image - >>> image = load_image( - ... "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" - ... ) - - >>> # initialize the models and pipeline - >>> controlnet_conditioning_scale = 0.5 - >>> controlnet = ControlNetXSModel.from_pretrained( - ... "UmerHA/ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16 - ... ) - >>> pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - ... "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float16 - ... 
) - >>> pipe.enable_model_cpu_offload() - - >>> # get canny image - >>> image = np.array(image) - >>> image = cv2.Canny(image, 100, 200) - >>> image = image[:, :, None] - >>> image = np.concatenate([image, image, image], axis=2) - >>> canny_image = Image.fromarray(image) - >>> # generate image - >>> image = pipe( - ... prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image - ... ).images[0] - ``` -""" - - class StableDiffusionControlNetXSPipeline( DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin ): @@ -669,7 +625,6 @@ def disable_freeu(self): self.unet.disable_freeu() @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]] = None, diff --git a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py b/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py similarity index 95% rename from src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py rename to examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py index 58f0f544a5ac..be888d7e1145 100644 --- a/src/diffusers/pipelines/controlnet_xs/pipeline_controlnet_xs_sd_xl.py +++ b/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py @@ -21,76 +21,37 @@ import torch.nn.functional as F +from controlnetxs import ControlNetXSModel from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer -from diffusers.utils.import_utils import is_invisible_watermark_available - -from ...image_processor import PipelineImageInput, VaeImageProcessor -from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, ControlNetXSModel, UNet2DConditionModel -from ...models.attention_processor import ( +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.models.attention_processor import ( AttnProcessor2_0, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, ) -from ...models.lora import adjust_lora_scale_text_encoder -from ...schedulers import KarrasDiffusionSchedulers -from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers -from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor -from ..pipeline_utils import DiffusionPipeline -from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + USE_PEFT_BACKEND, + logging, + scale_lora_layers, + unscale_lora_layers, +) +from diffusers.utils.import_utils import is_invisible_watermark_available +from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor if is_invisible_watermark_available(): - from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker logger = logging.get_logger(__name__) # pylint: disable=invalid-name -EXAMPLE_DOC_STRING = """ -
Examples: - ```py - >>> # !pip install opencv-python transformers accelerate - >>> from diffusers import StableDiffusionXLControlNetXSPipeline, ControlNetXSModel, AutoencoderKL - >>> from diffusers.utils import load_image - >>> import numpy as np - >>> import torch - - >>> import cv2 - >>> from PIL import Image - - >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" - >>> negative_prompt = "low quality, bad quality, sketches" - - >>> # download an image - >>> image = load_image( - ... "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" - ... ) - - >>> # initialize the models and pipeline - >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization - >>> controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-canny", torch_dtype=torch.float16) - >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) - >>> pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( - ... "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16 - ... ) - >>> pipe.enable_model_cpu_offload() - - >>> # get canny image - >>> image = np.array(image) - >>> image = cv2.Canny(image, 100, 200) - >>> image = image[:, :, None] - >>> image = np.concatenate([image, image, image], axis=2) - >>> canny_image = Image.fromarray(image) - - >>> # generate image - >>> image = pipe( - ... prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image - ... ).images[0] - ``` -""" - - class StableDiffusionXLControlNetXSPipeline( DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin ): @@ -730,7 +690,6 @@ def disable_freeu(self): self.unet.disable_freeu() @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]] = None, diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 10c5b0f46565..180b210953c1 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -80,7 +80,6 @@ "AutoencoderTiny", "ConsistencyDecoderVAE", "ControlNetModel", - "ControlNetXSModel", "Kandinsky3UNet", "ModelMixin", "MotionAdapter", @@ -256,7 +255,6 @@ "StableDiffusionControlNetImg2ImgPipeline", "StableDiffusionControlNetInpaintPipeline", "StableDiffusionControlNetPipeline", - "StableDiffusionControlNetXSPipeline", "StableDiffusionDepth2ImgPipeline", "StableDiffusionDiffEditPipeline", "StableDiffusionGLIGENPipeline", @@ -280,7 +278,6 @@ "StableDiffusionXLControlNetImg2ImgPipeline", "StableDiffusionXLControlNetInpaintPipeline", "StableDiffusionXLControlNetPipeline", - "StableDiffusionXLControlNetXSPipeline", "StableDiffusionXLImg2ImgPipeline", "StableDiffusionXLInpaintPipeline", "StableDiffusionXLInstructPix2PixPipeline", @@ -462,7 +459,6 @@ AutoencoderTiny, ConsistencyDecoderVAE, ControlNetModel, - ControlNetXSModel, Kandinsky3UNet, ModelMixin, MotionAdapter, @@ -617,7 +613,6 @@ StableDiffusionControlNetImg2ImgPipeline, StableDiffusionControlNetInpaintPipeline, StableDiffusionControlNetPipeline, - StableDiffusionControlNetXSPipeline, StableDiffusionDepth2ImgPipeline, StableDiffusionDiffEditPipeline, StableDiffusionGLIGENPipeline, @@ -641,7 +636,6 @@ StableDiffusionXLControlNetImg2ImgPipeline, StableDiffusionXLControlNetInpaintPipeline, StableDiffusionXLControlNetPipeline, - StableDiffusionXLControlNetXSPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, 
StableDiffusionXLInstructPix2PixPipeline, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 6e7fe72bc949..36dbe14c5053 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -32,7 +32,6 @@ _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"] _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] _import_structure["controlnet"] = ["ControlNetModel"] - _import_structure["controlnetxs"] = ["ControlNetXSModel"] _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"] _import_structure["embeddings"] = ["ImageProjection"] _import_structure["modeling_utils"] = ["ModelMixin"] @@ -67,7 +66,6 @@ ConsistencyDecoderVAE, ) from .controlnet import ControlNetModel - from .controlnetxs import ControlNetXSModel from .dual_transformer_2d import DualTransformer2DModel from .embeddings import ImageProjection from .modeling_utils import ModelMixin diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 3bf67dfc1cdc..2b456f4c3d08 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -128,12 +128,6 @@ "StableDiffusionXLControlNetPipeline", ] ) - _import_structure["controlnet_xs"].extend( - [ - "StableDiffusionControlNetXSPipeline", - "StableDiffusionXLControlNetXSPipeline", - ] - ) _import_structure["deepfloyd_if"] = [ "IFImg2ImgPipeline", "IFImg2ImgSuperResolutionPipeline", @@ -361,10 +355,6 @@ StableDiffusionXLControlNetInpaintPipeline, StableDiffusionXLControlNetPipeline, ) - from .controlnet_xs import ( - StableDiffusionControlNetXSPipeline, - StableDiffusionXLControlNetXSPipeline, - ) from .deepfloyd_if import ( IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, diff --git a/src/diffusers/pipelines/controlnet_xs/__init__.py b/src/diffusers/pipelines/controlnet_xs/__init__.py deleted file mode 100644 index 978278b184f9..000000000000 --- a/src/diffusers/pipelines/controlnet_xs/__init__.py +++ /dev/null @@ -1,68 +0,0 @@ -from typing import TYPE_CHECKING - -from ...utils import ( - DIFFUSERS_SLOW_IMPORT, - OptionalDependencyNotAvailable, - _LazyModule, - get_objects_from_module, - is_flax_available, - is_torch_available, - is_transformers_available, -) - - -_dummy_objects = {} -_import_structure = {} - -try: - if not (is_transformers_available() and is_torch_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils import dummy_torch_and_transformers_objects # noqa F403 - - _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) -else: - _import_structure["pipeline_controlnet_xs"] = ["StableDiffusionControlNetXSPipeline"] - _import_structure["pipeline_controlnet_xs_sd_xl"] = ["StableDiffusionXLControlNetXSPipeline"] -try: - if not (is_transformers_available() and is_flax_available()): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - from ...utils import dummy_flax_and_transformers_objects # noqa F403 - - _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) -else: - pass # _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"] - - -if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: - try: - if not (is_transformers_available() and is_torch_available()): - raise OptionalDependencyNotAvailable() - - except OptionalDependencyNotAvailable: - from ...utils.dummy_torch_and_transformers_objects import * - else: - from 
.pipeline_controlnet_xs import StableDiffusionControlNetXSPipeline - from .pipeline_controlnet_xs_sd_xl import StableDiffusionXLControlNetXSPipeline - - try: - if not (is_transformers_available() and is_flax_available()): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - from ...utils.dummy_flax_and_transformers_objects import * # noqa F403 - else: - pass # from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline - - -else: - import sys - - sys.modules[__name__] = _LazyModule( - __name__, - globals()["__file__"], - _import_structure, - module_spec=__spec__, - ) - for name, value in _dummy_objects.items(): - setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 5bd2f493ce08..d306a3575b1f 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -92,21 +92,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) -class ControlNetXSModel(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - class Kandinsky3UNet(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index ae6c6c916065..2eb9599658d9 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -782,21 +782,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class StableDiffusionControlNetXSPipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class StableDiffusionDepth2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] @@ -1142,21 +1127,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class StableDiffusionXLControlNetXSPipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class StableDiffusionXLImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/controlnetxs/__init__.py b/tests/pipelines/controlnetxs/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/pipelines/controlnetxs/test_controlnetxs.py b/tests/pipelines/controlnetxs/test_controlnetxs.py deleted file mode 100644 index 1f184e5bb14c..000000000000 --- a/tests/pipelines/controlnetxs/test_controlnetxs.py +++ /dev/null @@ -1,311 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import traceback -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - ControlNetXSModel, - DDIMScheduler, - LCMScheduler, - StableDiffusionControlNetXSPipeline, - UNet2DConditionModel, -) -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import ( - enable_full_determinism, - load_image, - load_numpy, - numpy_cosine_similarity_distance, - require_python39_or_higher, - require_torch_2, - require_torch_gpu, - run_test_in_subprocess, - slow, - torch_device, -) -from diffusers.utils.torch_utils import randn_tensor - -from ..pipeline_params import ( - IMAGE_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import ( - PipelineKarrasSchedulerTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, -) - - -enable_full_determinism() - - -# Will be run via run_test_in_subprocess -def _test_stable_diffusion_compile(in_queue, out_queue, timeout): - error = None - try: - _ = in_queue.get(timeout=timeout) - - controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny") - - pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet - ) - pipe.to("cuda") - pipe.set_progress_bar_config(disable=None) - - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - pipe.controlnet.to(memory_format=torch.channels_last) - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ).resize((512, 512)) - - output = pipe(prompt, image, num_inference_steps=10, generator=generator, output_type="np") - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy" - ) - expected_image = np.resize(expected_image, (512, 512, 3)) - - assert np.abs(expected_image - image).max() < 1.0 - - except Exception: - error = f"{traceback.format_exc()}" - - results = {"error": error} - out_queue.put(results, timeout=timeout) - out_queue.join() - - -class ControlNetXSPipelineFastTests( - PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase -): - pipeline_class = StableDiffusionControlNetXSPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - def 
get_dummy_components(self, time_cond_proj_dim=None): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(4, 8), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - norm_num_groups=1, - time_cond_proj_dim=time_cond_proj_dim, - ) - torch.manual_seed(0) - controlnet = ControlNetXSModel.from_unet( - unet=unet, - time_embedding_mix=0.95, - learn_embedding=True, - size_ratio=0.5, - conditioning_embedding_out_channels=(16, 32), - num_attention_heads=2, - ) - torch.manual_seed(0) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[4, 8], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - norm_num_groups=2, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - image = randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - "image": image, - } - - return inputs - - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - def test_controlnet_lcm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components(time_cond_proj_dim=256) - sd_pipe = StableDiffusionControlNetXSPipeline(**components) - sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.52700454, 0.3930534, 0.25509018, 
0.7132304, 0.53696585, 0.46568912, 0.7095368, 0.7059624, 0.4744786] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class ControlNetXSPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_canny(self): - controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-canny") - - pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - original_image = image[-3:, -3:, -1].flatten() - expected_image = np.array([0.1274, 0.1401, 0.147, 0.1185, 0.1555, 0.1492, 0.1565, 0.1474, 0.1701]) - - max_diff = numpy_cosine_similarity_distance(original_image, expected_image) - assert max_diff < 1e-4 - - def test_depth(self): - controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SD2.1-depth") - - pipe = StableDiffusionControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - original_image = image[-3:, -3:, -1].flatten() - expected_image = np.array([0.1098, 0.1025, 0.1211, 0.1129, 0.1165, 0.1262, 0.1185, 0.1261, 0.1703]) - - max_diff = numpy_cosine_similarity_distance(original_image, expected_image) - assert max_diff < 1e-4 - - @require_python39_or_higher - @require_torch_2 - def test_stable_diffusion_compile(self): - run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None) diff --git a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py deleted file mode 100644 index dbdc532a6f3b..000000000000 --- a/tests/pipelines/controlnetxs/test_controlnetxs_sdxl.py +++ /dev/null @@ -1,362 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - ControlNetXSModel, - EulerDiscreteScheduler, - StableDiffusionXLControlNetXSPipeline, - UNet2DConditionModel, -) -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device -from diffusers.utils.torch_utils import randn_tensor - -from ..pipeline_params import ( - IMAGE_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_BATCH_PARAMS, - TEXT_TO_IMAGE_IMAGE_PARAMS, - TEXT_TO_IMAGE_PARAMS, -) -from ..test_pipelines_common import ( - PipelineKarrasSchedulerTesterMixin, - PipelineLatentTesterMixin, - PipelineTesterMixin, - SDXLOptionalComponentsTesterMixin, -) - - -enable_full_determinism() - - -class StableDiffusionXLControlNetXSPipelineFastTests( - PipelineLatentTesterMixin, - PipelineKarrasSchedulerTesterMixin, - PipelineTesterMixin, - SDXLOptionalComponentsTesterMixin, - unittest.TestCase, -): - pipeline_class = StableDiffusionXLControlNetXSPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - addition_embed_type="text_time", - addition_time_embed_dim=8, - transformer_layers_per_block=(1, 2), - projection_class_embeddings_input_dim=80, # 6 * 8 + 32 - cross_attention_dim=64, - ) - torch.manual_seed(0) - controlnet = ControlNetXSModel.from_unet( - unet, - time_embedding_mix=0.95, - learn_embedding=True, - size_ratio=0.5, - conditioning_embedding_out_channels=(16, 32), - ) - torch.manual_seed(0) - scheduler = EulerDiscreteScheduler( - beta_start=0.00085, - beta_end=0.012, - steps_offset=1, - beta_schedule="scaled_linear", - timestep_spacing="leading", - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=32, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) - tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_encoder_2": text_encoder_2, - "tokenizer_2": tokenizer_2, - } - return components - - # copied from test_controlnet_sdxl.py - def get_dummy_inputs(self, device, seed=0): - if 
str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - image = randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "np", - "image": image, - } - - return inputs - - # copied from test_controlnet_sdxl.py - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - - # copied from test_controlnet_sdxl.py - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - - # copied from test_controlnet_sdxl.py - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - # copied from test_controlnet_sdxl.py - def test_save_load_optional_components(self): - self._test_save_load_optional_components() - - # copied from test_controlnet_sdxl.py - @require_torch_gpu - def test_stable_diffusion_xl_offloads(self): - pipes = [] - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components).to(torch_device) - pipes.append(sd_pipe) - - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_model_cpu_offload() - pipes.append(sd_pipe) - - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components) - sd_pipe.enable_sequential_cpu_offload() - pipes.append(sd_pipe) - - image_slices = [] - for pipe in pipes: - pipe.unet.set_default_attn_processor() - - inputs = self.get_dummy_inputs(torch_device) - image = pipe(**inputs).images - - image_slices.append(image[0, -3:, -3:, -1].flatten()) - - assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3 - assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3 - - # copied from test_controlnet_sdxl.py - def test_stable_diffusion_xl_multi_prompts(self): - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components).to(torch_device) - - # forward with single prompt - inputs = self.get_dummy_inputs(torch_device) - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - # forward with same prompt duplicated - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt_2"] = inputs["prompt"] - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - # ensure the results are equal - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - # forward with different prompt - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt_2"] = "different prompt" - output = sd_pipe(**inputs) - image_slice_3 = output.images[0, -3:, -3:, -1] - - # ensure the results are not equal - assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4 - - # manually set a negative_prompt - inputs = self.get_dummy_inputs(torch_device) - inputs["negative_prompt"] = "negative prompt" - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - # forward with same negative_prompt duplicated - 
inputs = self.get_dummy_inputs(torch_device) - inputs["negative_prompt"] = "negative prompt" - inputs["negative_prompt_2"] = inputs["negative_prompt"] - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - # ensure the results are equal - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - # forward with different negative_prompt - inputs = self.get_dummy_inputs(torch_device) - inputs["negative_prompt"] = "negative prompt" - inputs["negative_prompt_2"] = "different negative prompt" - output = sd_pipe(**inputs) - image_slice_3 = output.images[0, -3:, -3:, -1] - - # ensure the results are not equal - assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4 - - # copied from test_stable_diffusion_xl.py - def test_stable_diffusion_xl_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = self.pipeline_class(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - # forward without prompt embeds - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 2 * [inputs["prompt"]] - inputs["num_images_per_prompt"] = 2 - - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - # forward with prompt embeds - inputs = self.get_dummy_inputs(torch_device) - prompt = 2 * [inputs.pop("prompt")] - - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = sd_pipe.encode_prompt(prompt) - - output = sd_pipe( - **inputs, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - ) - image_slice_2 = output.images[0, -3:, -3:, -1] - - # make sure that it's equal - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1.1e-4 - - -@slow -@require_torch_gpu -class ControlNetSDXLPipelineXSSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_canny(self): - controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-canny") - - pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet - ) - pipe.enable_sequential_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images - - assert images[0].shape == (768, 512, 3) - - original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.4359, 0.4335, 0.4609, 0.4515, 0.4669, 0.4494, 0.452, 0.4493, 0.4382]) - assert np.allclose(original_image, expected_image, atol=1e-04) - - def test_depth(self): - controlnet = ControlNetXSModel.from_pretrained("UmerHA/ConrolNetXS-SDXL-depth") - - pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet - ) - pipe.enable_sequential_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) - - images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images - - assert images[0].shape == (512, 512, 3) - - original_image = images[0, -3:, -3:, -1].flatten() - expected_image = np.array([0.4411, 0.3617, 0.2654, 0.266, 0.3449, 0.3898, 0.3745, 0.353, 0.326]) - assert np.allclose(original_image, expected_image, atol=1e-04)