From 837d6f861308245bed388462e9d6595a27caa5a3 Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Thu, 23 Nov 2023 11:32:39 +0100 Subject: [PATCH 1/7] Add support for IPAdapterFull --- .../en/using-diffusers/loading_adapters.md | 61 +++++++++++++++++++ src/diffusers/loaders/unet.py | 29 ++++++++- src/diffusers/models/embeddings.py | 14 ++++- .../test_ip_adapter_stable_diffusion.py | 19 ++++++ 4 files changed, 121 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index c14b38a9dd89..93c8fecb6da8 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -485,6 +485,67 @@ image.save("sdxl_t2i.png") +You can use the IP-Adapter face model to apply specific faces to your images. It is an effective way to maintain consistent characters in your image generations. +Weights are loaded with the same method used for the other IP-Adapters. + +```python +# Load ip-adapter-full-face_sd15.bin +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin") +``` + + + +It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face model. + + +```python +import torch +from diffusers import StableDiffusionPipeline, DDIMScheduler +from diffusers.utils import load_image + +noise_scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1 +) + +pipeline = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, + scheduler=noise_scheduler, +).to("cuda") + +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin") + +pipeline.set_ip_adapter_scale(0.7) + +image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png") + +generator = torch.Generator(device="cpu").manual_seed(33) + +image = pipeline( + prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower", + ip_adapter_image=image, + negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", + num_inference_steps=50, num_images_per_prompt=1, width=512, height=704, + generator=generator, +).images[0] +``` + +
+
+ +
input image
+
+
+ +
output image
+
+
### LCM-Lora diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index 9d559a4b4af8..88a690d52fda 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -21,7 +21,7 @@ import torch.nn.functional as F from torch import nn -from ..models.embeddings import ImageProjection, Resampler +from ..models.embeddings import ImageProjection, MLPProjection, Resampler from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta from ..utils import ( DIFFUSERS_CACHE, @@ -675,6 +675,9 @@ def _load_ip_adapter_weights(self, state_dict): if "proj.weight" in state_dict["image_proj"]: # IP-Adapter num_image_text_embeds = 4 + elif "proj.3.weight" in state_dict["image_proj"]: + # IP-Adapter Full Face + num_image_text_embeds = 257 else: # IP-Adapter Plus num_image_text_embeds = state_dict["image_proj"]["latents"].shape[1] @@ -744,8 +747,32 @@ def _load_ip_adapter_weights(self, state_dict): "norm.bias": state_dict["image_proj"]["norm.bias"], } ) + image_projection.load_state_dict(image_proj_state_dict) + del image_proj_state_dict + elif "proj.3.weight" in state_dict["image_proj"]: + clip_embeddings_dim = state_dict["image_proj"]["proj.0.weight"].shape[0] + cross_attention_dim = state_dict["image_proj"]["proj.3.weight"].shape[0] + + image_projection = MLPProjection( + cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim + ) + image_projection.to(dtype=self.dtype, device=self.device) + + # load image projection layer weights + image_proj_state_dict = {} + image_proj_state_dict.update( + { + "ff.net.0.proj.weight": state_dict["image_proj"]["proj.0.weight"], + "ff.net.0.proj.bias": state_dict["image_proj"]["proj.0.bias"], + "ff.net.2.weight": state_dict["image_proj"]["proj.2.weight"], + "ff.net.2.bias": state_dict["image_proj"]["proj.2.bias"], + "norm.weight": state_dict["image_proj"]["proj.3.weight"], + "norm.bias": state_dict["image_proj"]["proj.3.bias"], + } + ) image_projection.load_state_dict(image_proj_state_dict) + del image_proj_state_dict else: # IP-Adapter Plus diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index bdd2930d20f9..12f564e58c65 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -453,7 +453,7 @@ def __init__( def forward(self, image_embeds: torch.FloatTensor): batch_size = image_embeds.shape[0] - + # image image_embeds = self.image_embeds(image_embeds) image_embeds = image_embeds.reshape(batch_size, self.num_image_text_embeds, -1) @@ -461,6 +461,18 @@ def forward(self, image_embeds: torch.FloatTensor): return image_embeds +class MLPProjection(nn.Module): + def __init__(self, image_embed_dim=1024, cross_attention_dim=1024): + super().__init__() + from .attention import FeedForward + + self.ff = FeedForward(image_embed_dim, cross_attention_dim, mult=1, activation_fn="gelu") + self.norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, image_embeds: torch.FloatTensor): + return self.norm(self.ff(image_embeds)) + + class CombinedTimestepLabelEmbeddings(nn.Module): def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): super().__init__() diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index 7c6349ce2600..ff93ecaf003b 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -182,6 +182,25 @@ def test_inpainting(self): assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + def test_text_to_image_full_face(self): + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") + pipeline = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype + ) + pipeline.to(torch_device) + pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin") + pipeline.set_ip_adapter_scale(0.7) + + inputs = self.get_dummy_inputs() + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() + + expected_slice = np.array( + [0.1706543, 0.1303711, 0.12573242, 0.21777344, 0.14550781, 0.14038086, 0.40820312, 0.41455078, 0.42529297] + ) + + assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + @slow @require_torch_gpu From 9b428b2805d8046672ae069b4f33ef51f56039b1 Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Mon, 27 Nov 2023 15:34:46 +0100 Subject: [PATCH 2/7] Fix copy --- src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 2121e9b81509..b77eddffe593 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -892,7 +892,6 @@ def __call__( ) if self.do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) - # 4. Prepare timesteps timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) From 577fe86fa5b45a478035c283c2c5708f2ca08793 Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Thu, 30 Nov 2023 08:03:08 +0100 Subject: [PATCH 3/7] Move test --- .../test_ip_adapter_stable_diffusion.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index ff93ecaf003b..79a80935231c 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -201,6 +201,25 @@ def test_text_to_image_full_face(self): assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + def test_text_to_image_full_face(self): + image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") + pipeline = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype + ) + pipeline.to(torch_device) + pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin") + pipeline.set_ip_adapter_scale(0.7) + + inputs = self.get_dummy_inputs() + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() + + expected_slice = np.array( + [0.1706543, 0.1303711, 0.12573242, 0.21777344, 0.14550781, 0.14038086, 0.40820312, 0.41455078, 0.42529297] + ) + + assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) + @slow @require_torch_gpu From 95b5cb3358fe4977ddcc5344db81136536ca964a Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Mon, 4 Dec 2023 15:00:17 +0100 Subject: [PATCH 4/7] Fix style and quality --- src/diffusers/models/__init__.py | 2 +- src/diffusers/models/embeddings.py | 2 +- .../alt_diffusion/pipeline_alt_diffusion.py | 1 + .../test_ip_adapter_stable_diffusion.py | 19 ------------------- 4 files changed, 3 insertions(+), 21 deletions(-) diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 49ee3ee6af6b..e3794939e25e 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -33,8 +33,8 @@ _import_structure["consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] _import_structure["controlnet"] = ["ControlNetModel"] _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"] - _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["embeddings"] = ["ImageProjection"] + _import_structure["modeling_utils"] = ["ModelMixin"] _import_structure["prior_transformer"] = ["PriorTransformer"] _import_structure["t5_film_transformer"] = ["T5FilmDecoder"] _import_structure["transformer_2d"] = ["Transformer2DModel"] diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 12f564e58c65..73abc9869230 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -453,7 +453,7 @@ def __init__( def forward(self, image_embeds: torch.FloatTensor): batch_size = image_embeds.shape[0] - + # image image_embeds = self.image_embeds(image_embeds) image_embeds = image_embeds.reshape(batch_size, self.num_image_text_embeds, -1) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index b77eddffe593..2121e9b81509 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -892,6 +892,7 @@ def __call__( ) if self.do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) + # 4. Prepare timesteps timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index 79a80935231c..ff93ecaf003b 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -201,25 +201,6 @@ def test_text_to_image_full_face(self): assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) - def test_text_to_image_full_face(self): - image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") - pipeline = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", image_encoder=image_encoder, safety_checker=None, torch_dtype=self.dtype - ) - pipeline.to(torch_device) - pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-full-face_sd15.bin") - pipeline.set_ip_adapter_scale(0.7) - - inputs = self.get_dummy_inputs() - images = pipeline(**inputs).images - image_slice = images[0, :3, :3, -1].flatten() - - expected_slice = np.array( - [0.1706543, 0.1303711, 0.12573242, 0.21777344, 0.14550781, 0.14038086, 0.40820312, 0.41455078, 0.42529297] - ) - - assert np.allclose(image_slice, expected_slice, atol=1e-4, rtol=1e-4) - @slow @require_torch_gpu From f438c6667805e8a20e8b834aeb8f6758e6745013 Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Tue, 5 Dec 2023 07:32:20 +0100 Subject: [PATCH 5/7] Fix doc link --- docs/source/en/using-diffusers/loading_adapters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 93c8fecb6da8..382c49a1498f 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -542,7 +542,7 @@ image = pipeline(
input image
- +
output image
From 2ad8f4a90f99df77773a4b225ae8c4a68652f6b8 Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Thu, 7 Dec 2023 08:24:39 +0100 Subject: [PATCH 6/7] Fix style + comment --- docs/source/en/using-diffusers/loading_adapters.md | 1 + src/diffusers/loaders/unet.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 382c49a1498f..7082f4bd4f63 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -496,6 +496,7 @@ pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-a It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face model. + ```python diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index 88a690d52fda..6f84cc9b1b78 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -677,7 +677,7 @@ def _load_ip_adapter_weights(self, state_dict): num_image_text_embeds = 4 elif "proj.3.weight" in state_dict["image_proj"]: # IP-Adapter Full Face - num_image_text_embeds = 257 + num_image_text_embeds = 257 # 256 CLIP tokens + 1 CLS token else: # IP-Adapter Plus num_image_text_embeds = state_dict["image_proj"]["latents"].shape[1] From 9d3a590d09620a8f7acc1c7d08b53d16af21b120 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 6 Dec 2023 22:14:38 -1000 Subject: [PATCH 7/7] Update docs/source/en/using-diffusers/loading_adapters.md Co-authored-by: Patrick von Platen --- docs/source/en/using-diffusers/loading_adapters.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 7082f4bd4f63..d9d4a675dd37 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -497,6 +497,7 @@ pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-a It is recommended to use `DDIMScheduler` and `EulerDiscreteScheduler` for face model. + ```python