     StableDiffusionPipeline,
     UNet2DConditionModel,
 )
-from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin, text_encoder_attn_modules
+from diffusers.loaders import (
+    LORA_WEIGHT_NAME,
+    TEXT_ENCODER_NAME,
+    UNET_NAME,
+    LoraLoaderMixin,
+    text_encoder_attn_modules,
+    text_encoder_lora_state_dict,
+)
 from diffusers.models.attention_processor import (
     AttnAddedKVProcessor,
     AttnAddedKVProcessor2_0,
@@ -832,6 +839,7 @@ def main(args):

     # Set correct lora layers
     unet_lora_attn_procs = {}
+    unet_lora_parameters = []
     for name, attn_processor in unet.attn_processors.items():
         cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
         if name.startswith("mid_block"):
@@ -849,18 +857,17 @@ def main(args):
         lora_attn_processor_class = (
             LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
         )
-        unet_lora_attn_procs[name] = lora_attn_processor_class(
-            hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
-        )
+        module = lora_attn_processor_class(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+        unet_lora_attn_procs[name] = module
+        unet_lora_parameters.extend(module.parameters())

     unet.set_attn_processor(unet_lora_attn_procs)
-    unet_lora_layers = AttnProcsLayers(unet.attn_processors)

     # The text encoder comes from 🤗 transformers, so we cannot directly modify it.
     # So, instead, we monkey-patch the forward calls of its attention-blocks.
-    text_encoder_lora_layers = None
     if args.train_text_encoder:
         text_lora_attn_procs = {}
+        text_lora_parameters = []

         for name, module in text_encoder_attn_modules(text_encoder):
             if isinstance(text_encoder, CLIPTextModel):
@@ -872,9 +879,10 @@ def main(args):
             else:
                 raise ValueError(f"{text_encoder.__class__.__name__} does not support LoRA training")

-            text_lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, inner_dim=inner_dim)
+            module = LoRAAttnProcessor(hidden_size=hidden_size, inner_dim=inner_dim)
+            text_lora_attn_procs[name] = module
+            text_lora_parameters.extend(module.parameters())

-        text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
         LoraLoaderMixin._modify_text_encoder(text_lora_attn_procs, text_encoder)

     # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
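
The save hook in the next hunk reads the LoRA weights straight off the models, via `model.attn_processors_state_dict` for the UNet and `text_encoder_lora_state_dict(model)` for the text encoder, instead of keeping separate `AttnProcsLayers` wrappers around. The UNet-side attribute is not defined in this diff; a minimal sketch of an equivalent helper, assuming only the standard `unet.attn_processors` mapping and `named_parameters()` (the helper name below is illustrative, not part of the change), would be:

```python
from typing import Dict

import torch


def attn_processors_state_dict(unet) -> Dict[str, torch.Tensor]:
    # Flatten every attention processor's (LoRA) parameters into one dict,
    # keyed "<processor name>.<parameter name>" so the keys line up with
    # what `unet.attn_processors` / `save_lora_weights` expect downstream.
    flat = {}
    for proc_name, proc in unet.attn_processors.items():
        for param_name, param in proc.named_parameters():
            flat[f"{proc_name}.{param_name}"] = param
    return flat
```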
@@ -884,23 +892,13 @@ def save_model_hook(models, weights, output_dir):
         unet_lora_layers_to_save = None
         text_encoder_lora_layers_to_save = None

-        if args.train_text_encoder:
-            text_encoder_keys = accelerator.unwrap_model(text_encoder_lora_layers).state_dict().keys()
-            unet_keys = accelerator.unwrap_model(unet_lora_layers).state_dict().keys()
-
         for model in models:
-            state_dict = model.state_dict()
-
-            if (
-                text_encoder_lora_layers is not None
-                and text_encoder_keys is not None
-                and state_dict.keys() == text_encoder_keys
-            ):
-                # text encoder
-                text_encoder_lora_layers_to_save = state_dict
-            elif state_dict.keys() == unet_keys:
-                # unet
-                unet_lora_layers_to_save = state_dict
+            if isinstance(model, type(accelerator.unwrap_model(unet))):
+                unet_lora_layers_to_save = model.attn_processors_state_dict
+            elif isinstance(model, type(accelerator.unwrap_model(text_encoder))):
+                text_encoder_lora_layers_to_save = text_encoder_lora_state_dict(model)
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")

             # make sure to pop weight so that corresponding model is not saved again
             weights.pop()
@@ -912,27 +910,23 @@ def save_model_hook(models, weights, output_dir):
         )

     def load_model_hook(models, input_dir):
-        # Note we DON'T pass the unet and text encoder here an purpose
-        # so that the we don't accidentally override the LoRA layers of
-        # unet_lora_layers and text_encoder_lora_layers which are stored in `models`
-        # with new torch.nn.Modules / weights. We simply use the pipeline class as
-        # an easy way to load the lora checkpoints
-        temp_pipeline = DiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path,
-            revision=args.revision,
-            torch_dtype=weight_dtype,
-        )
-        temp_pipeline.load_lora_weights(input_dir)
-
-        # load lora weights into models
-        models[0].load_state_dict(AttnProcsLayers(temp_pipeline.unet.attn_processors).state_dict())
-        if len(models) > 1:
-            models[1].load_state_dict(AttnProcsLayers(temp_pipeline.text_encoder_lora_attn_procs).state_dict())
+        lora_weights = torch.load(os.path.join(input_dir, LORA_WEIGHT_NAME))
+        unet_weights = {}
+        text_encoder_weights = {}
+
+        for k, v in lora_weights.items():
+            model, *k = k.split(".")
+            k = ".".join(k)
+
+            if model == UNET_NAME:
+                unet_weights[k] = v
+            elif model == TEXT_ENCODER_NAME:
+                text_encoder_weights[k] = v
+            else:
+                raise ValueError(f"unknown model name {model}")

-        # delete temporary pipeline and pop models
-        del temp_pipeline
-        for _ in range(len(models)):
-            models.pop()
+        unet.load_state_dict(unet_weights, strict=False)
+        text_encoder.load_state_dict(text_encoder_weights, strict=False)

     accelerator.register_save_state_pre_hook(save_model_hook)
     accelerator.register_load_state_pre_hook(load_model_hook)
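
The key handling in `load_model_hook` above assumes that `save_lora_weights` writes a single flat checkpoint (`LORA_WEIGHT_NAME`, i.e. `pytorch_lora_weights.bin` at the time of this change) whose keys are prefixed with `UNET_NAME` or `TEXT_ENCODER_NAME`. A small, self-contained illustration of the prefix splitting (the key below is made up for the example, not taken from a real checkpoint):

```python
# Made-up key, shaped like the entries save_lora_weights produces.
key = "unet.mid_block.attentions.0.transformer_blocks.0.attn1.processor.to_q_lora.up.weight"

prefix, *rest = key.split(".")  # same idiom as in load_model_hook
sub_key = ".".join(rest)

print(prefix)   # "unet" -> routed into unet_weights
print(sub_key)  # remainder, loaded via unet.load_state_dict(..., strict=False)
```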
@@ -962,9 +956,9 @@ def load_model_hook(models, input_dir):

     # Optimizer creation
     params_to_optimize = (
-        itertools.chain(unet_lora_layers.parameters(), text_encoder_lora_layers.parameters())
+        itertools.chain(unet_lora_parameters, text_lora_parameters)
         if args.train_text_encoder
-        else unet_lora_layers.parameters()
+        else unet_lora_parameters
     )
     optimizer = optimizer_class(
         params_to_optimize,
@@ -1053,12 +1047,12 @@ def compute_text_embeddings(prompt):

     # Prepare everything with our `accelerator`.
     if args.train_text_encoder:
-        unet_lora_layers, text_encoder_lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            unet_lora_layers, text_encoder_lora_layers, optimizer, train_dataloader, lr_scheduler
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
         )
     else:
-        unet_lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-            unet_lora_layers, optimizer, train_dataloader, lr_scheduler
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
         )

     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
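
Because the full `unet` and `text_encoder` (rather than `AttnProcsLayers` wrappers) now go through `accelerator.prepare`, only the freshly added LoRA processors should carry trainable parameters, assuming the base weights were frozen with `requires_grad_(False)` earlier in the script as in the stock example. A quick optional check, purely illustrative:

```python
# Optional sanity check (assumes the frozen-base setup described above).
trainable = [n for n, p in unet.named_parameters() if p.requires_grad]
frozen = [n for n, p in unet.named_parameters() if not p.requires_grad]
print(f"trainable LoRA tensors: {len(trainable)}, frozen base tensors: {len(frozen)}")
```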
@@ -1207,9 +1201,9 @@ def compute_text_embeddings(prompt):
                 accelerator.backward(loss)
                 if accelerator.sync_gradients:
                     params_to_clip = (
-                        itertools.chain(unet_lora_layers.parameters(), text_encoder_lora_layers.parameters())
+                        itertools.chain(unet_lora_parameters, text_lora_parameters)
                         if args.train_text_encoder
-                        else unet_lora_layers.parameters()
+                        else unet_lora_parameters
                     )
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                 optimizer.step()
@@ -1309,12 +1303,14 @@ def compute_text_embeddings(prompt):
     # Save the lora layers
     accelerator.wait_for_everyone()
     if accelerator.is_main_process:
+        unet = accelerator.unwrap_model(unet)
         unet = unet.to(torch.float32)
-        unet_lora_layers = accelerator.unwrap_model(unet_lora_layers)
+        unet_lora_layers = unet.attn_processors_state_dict

-        if text_encoder is not None:
+        if text_encoder is not None and args.train_text_encoder:
+            text_encoder = accelerator.unwrap_model(text_encoder)
             text_encoder = text_encoder.to(torch.float32)
-            text_encoder_lora_layers = accelerator.unwrap_model(text_encoder_lora_layers)
+            text_encoder_lora_layers = text_encoder_lora_state_dict(text_encoder)

         LoraLoaderMixin.save_lora_weights(
             save_directory=args.output_dir,
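
After training, the checkpoint written by `LoraLoaderMixin.save_lora_weights` can be loaded back for inference with `load_lora_weights`; a minimal sketch, where the base model ID, output path, and prompt are placeholders:

```python
import torch
from diffusers import DiffusionPipeline

# Placeholders: use the base model you trained against and your --output_dir.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe.load_lora_weights("path/to/output_dir")
pipe.to("cuda")

image = pipe("a photo of sks dog in a bucket", num_inference_steps=25).images[0]
image.save("lora_sample.png")
```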