Commit df1b065

major fixes.

1 parent 31d759e · commit df1b065

7 files changed: +337 -296 lines changed


src/diffusers/loaders/peft.py

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@
     "UNet2DConditionModel": _maybe_expand_lora_scales,
     "UNetMotionModel": _maybe_expand_lora_scales,
     "SD3Transformer2DModel": lambda model_cls, weights: weights,
+    "FluxTransformer2DModel": lambda model_cls, weights: weights,
 }
3738
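The hunk above registers FluxTransformer2DModel in the per-model mapping that decides how user-supplied LoRA scales are expanded: transformer models pass the scales through unchanged, while UNet models go through _maybe_expand_lora_scales. Below is a minimal sketch of how such a mapping is typically consumed; the dict and function names are illustrative, not the actual identifiers in peft.py.

# Illustrative sketch only; not the actual code in src/diffusers/loaders/peft.py.
# The keys mirror the hunk above; the helper name is an assumption.
scale_expansion_fns = {
    "SD3Transformer2DModel": lambda model_cls, weights: weights,
    "FluxTransformer2DModel": lambda model_cls, weights: weights,  # new entry: identity, no per-block expansion
}

def expand_lora_scales(model, weights):
    # Look up the expansion callable by class name; fall back to identity when
    # a model class has no registered expansion function.
    expand_fn = scale_expansion_fns.get(type(model).__name__, lambda cls, w: w)
    return expand_fn(type(model), weights)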

src/diffusers/models/transformers/transformer_flux.py

Lines changed: 0 additions & 1 deletion
@@ -373,7 +373,6 @@ def forward(
         )
         encoder_hidden_states = self.context_embedder(encoder_hidden_states)

-        print(f"{txt_ids.shape=}, {img_ids.shape=}")
         ids = torch.cat((txt_ids, img_ids), dim=1)
         image_rotary_emb = self.pos_embed(ids)

tests/lora/test_lora_layers_flux.py

Lines changed: 37 additions & 1 deletion
@@ -15,8 +15,11 @@
 import sys
 import unittest

+import torch
+from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel
+
 from diffusers import FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers.utils.testing_utils import is_peft_available, require_peft_backend
+from diffusers.utils.testing_utils import floats_tensor, is_peft_available, require_peft_backend


 if is_peft_available():

@@ -32,6 +35,7 @@ class FluxLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
     pipeline_class = FluxPipeline
     scheduler_cls = FlowMatchEulerDiscreteScheduler()
     scheduler_kwargs = {}
+    uses_flow_matching = True
     transformer_kwargs = {
         "patch_size": 1,
         "in_channels": 4,

@@ -57,3 +61,35 @@ class FluxLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
         "shift_factor": 0.0609,
         "scaling_factor": 1.5035,
     }
+    has_two_text_encoders = True
+    tokenizer = CLIPTokenizer.from_pretrained("peft-internal-testing/tiny-clip-text-2")
+    tokenizer_2 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+    text_encoder = CLIPTextModel.from_pretrained("peft-internal-testing/tiny-clip-text-2")
+    text_encoder_2 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+    @property
+    def output_shape(self):
+        return (1, 8, 8, 3)
+
+    def get_dummy_inputs(self, with_generator=True):
+        batch_size = 1
+        sequence_length = 10
+        num_channels = 4
+        sizes = (32, 32)
+
+        generator = torch.manual_seed(0)
+        noise = floats_tensor((batch_size, num_channels) + sizes)
+        input_ids = torch.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator)
+
+        pipeline_inputs = {
+            "prompt": "A painting of a squirrel eating a burger",
+            "num_inference_steps": 4,
+            "guidance_scale": 0.0,
+            "height": 8,
+            "width": 8,
+            "output_type": "np",
+        }
+        if with_generator:
+            pipeline_inputs.update({"generator": generator})
+
+        return noise, input_ids, pipeline_inputs

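For orientation, get_dummy_inputs returns a latent-shaped noise tensor, dummy token ids, and the keyword arguments that drive the pipeline; with height=8, width=8, and output_type="np" the decoded batch should match output_shape == (1, 8, 8, 3). Here is a standalone reproduction of the input side; the pipeline call itself is only sketched, since the tiny FluxPipeline is assembled by the shared mixin, which is not part of this diff.

# Reproduces the fixture above outside the test class; `pipe` is assumed to be
# the tiny FluxPipeline built by the shared LoRA test mixin (not shown here).
import torch
from diffusers.utils.testing_utils import floats_tensor

generator = torch.manual_seed(0)
noise = floats_tensor((1, 4, 32, 32))  # latent-like dummy tensor: (batch, channels, height, width)
input_ids = torch.randint(1, 10, size=(1, 10), generator=generator)

pipeline_inputs = {
    "prompt": "A painting of a squirrel eating a burger",
    "num_inference_steps": 4,
    "guidance_scale": 0.0,
    "height": 8,
    "width": 8,
    "output_type": "np",  # NumPy output so shapes can be compared with output_shape
    "generator": generator,
}
# images = pipe(**pipeline_inputs).images
# assert images.shape == (1, 8, 8, 3)
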
tests/lora/test_lora_layers_sd.py

Lines changed: 7 additions & 0 deletions
@@ -22,6 +22,7 @@
 from huggingface_hub import hf_hub_download
 from huggingface_hub.repocard import RepoCard
 from safetensors.torch import load_file
+from transformers import CLIPTextModel, CLIPTokenizer

 from diffusers import (
     AutoPipelineForImage2Image,

@@ -80,6 +81,12 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
         "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
         "latent_channels": 4,
     }
+    text_encoder = CLIPTextModel.from_pretrained("peft-internal-testing/tiny-clip-text-2")
+    tokenizer = CLIPTokenizer.from_pretrained("peft-internal-testing/tiny-clip-text-2")
+
+    @property
+    def output_shape(self):
+        return (1, 64, 64, 3)

     def setUp(self):
         super().setUp()

tests/lora/test_lora_layers_sd3.py

Lines changed: 13 additions & 0 deletions
@@ -15,6 +15,8 @@
 import sys
 import unittest

+from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
+
 from diffusers import FlowMatchEulerDiscreteScheduler, SD3Transformer2DModel, StableDiffusion3Pipeline
 from diffusers.utils.testing_utils import is_peft_available, require_peft_backend, require_torch_gpu, torch_device

@@ -32,6 +34,7 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
     pipeline_class = StableDiffusion3Pipeline
     scheduler_cls = FlowMatchEulerDiscreteScheduler()
     scheduler_kwargs = {}
+    uses_flow_matching = True
     transformer_kwargs = {
         "sample_size": 32,
         "patch_size": 1,

@@ -59,6 +62,16 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
         "scaling_factor": 1.5035,
     }
     has_three_text_encoders = True
+    tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+    tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+    tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+    text_encoder = CLIPTextModelWithProjection.from_pretrained("hf-internal-testing/tiny-sd3-text_encoder")
+    text_encoder_2 = CLIPTextModelWithProjection.from_pretrained("hf-internal-testing/tiny-sd3-text_encoder-2")
+    text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+    @property
+    def output_shape(self):
+        return (1, 32, 32, 3)

     @require_torch_gpu
     def test_sd3_lora(self):

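As with the Flux class above, these attributes supply the components that the shared mixin wires into the pipeline; for SD3 that means three tokenizer/text-encoder pairs. A rough sketch of how they map onto StableDiffusion3Pipeline's component slots follows (illustrative only; the transformer, VAE, and scheduler assembly is elided here).

# Illustrative mapping of the tiny checkpoints above onto StableDiffusion3Pipeline's
# three text-encoder slots; this is not code from the commit.
from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel

text_components = {
    "tokenizer": CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip"),
    "text_encoder": CLIPTextModelWithProjection.from_pretrained("hf-internal-testing/tiny-sd3-text_encoder"),
    "tokenizer_2": CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip"),
    "text_encoder_2": CLIPTextModelWithProjection.from_pretrained("hf-internal-testing/tiny-sd3-text_encoder-2"),
    "tokenizer_3": AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5"),
    "text_encoder_3": T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5"),
}
# pipe = StableDiffusion3Pipeline(transformer=..., scheduler=..., vae=..., **text_components)
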
tests/lora/test_lora_layers_sdxl.py

Lines changed: 9 additions & 0 deletions
@@ -22,6 +22,7 @@
 import numpy as np
 import torch
 from packaging import version
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

 from diffusers import (
     ControlNetModel,

@@ -89,6 +90,14 @@ class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
         "latent_channels": 4,
         "sample_size": 128,
     }
+    text_encoder = CLIPTextModel.from_pretrained("peft-internal-testing/tiny-clip-text-2")
+    tokenizer = CLIPTokenizer.from_pretrained("peft-internal-testing/tiny-clip-text-2")
+    text_encoder_2 = CLIPTextModelWithProjection.from_pretrained("peft-internal-testing/tiny-clip-text-2")
+    tokenizer_2 = CLIPTokenizer.from_pretrained("peft-internal-testing/tiny-clip-text-2")
+
+    @property
+    def output_shape(self):
+        return (1, 64, 64, 3)

     def setUp(self):
         super().setUp()
