Commit d14dff7 (1 parent: 1b1ee17)

[WIP] scheduler scaffolding

imports init scheduler hacking more work fixes and docs more cleaning

7 files changed: +520 -61 lines

scripts/convert_vq_diffusion_to_diffusers.py

Lines changed: 8 additions & 3 deletions
@@ -32,9 +32,8 @@

 import yaml
 from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-from diffusers import VQModel
+from diffusers import VQModel, VQDiffusionPipeline, VQDiffusionScheduler
 from diffusers.models.vq_diffusion_attention import VQDiffusionTransformer
-from diffusers.pipelines import VQDiffusionPipeline
 from transformers import CLIPTextModel, CLIPTokenizer
 from yaml.loader import FullLoader

@@ -846,10 +845,16 @@ def read_config_file(filename):

     # done text encoder

+    scheduler_model = VQDiffusionScheduler()
+
     print(f"saving VQ diffusion model, path: {args.dump_path}")

     pipe = VQDiffusionPipeline(
-        vqvae=vqvae_model, transformer=transformer_model, tokenizer=tokenizer_model, text_encoder=text_encoder_model
+        vqvae=vqvae_model,
+        transformer=transformer_model,
+        tokenizer=tokenizer_model,
+        text_encoder=text_encoder_model,
+        scheduler=scheduler_model,
     )
     pipe.save_pretrained(args.dump_path)

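For orientation, a rough sketch of producing and reloading the converted checkpoint; the dump path is illustrative and the script's other CLI flags are omitted:

    # hypothetical invocation; only --dump_path is taken from the diff above
    # python scripts/convert_vq_diffusion_to_diffusers.py ... --dump_path ./vq-diffusion-dump

    from diffusers import VQDiffusionPipeline

    # save_pretrained() writes each registered module (vqvae, transformer, tokenizer,
    # text_encoder and now scheduler) to its own subfolder, so the whole pipeline can
    # be restored in one call
    pipe = VQDiffusionPipeline.from_pretrained("./vq-diffusion-dump")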

src/diffusers/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -29,14 +29,15 @@
         get_scheduler,
     )
     from .pipeline_utils import DiffusionPipeline
-    from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, PNDMPipeline, ScoreSdeVePipeline
+    from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, PNDMPipeline, ScoreSdeVePipeline, VQDiffusionPipeline
     from .schedulers import (
         DDIMScheduler,
         DDPMScheduler,
         KarrasVeScheduler,
         PNDMScheduler,
         SchedulerMixin,
         ScoreSdeVeScheduler,
+        VQDiffusionScheduler
     )
     from .training_utils import EMAModel
 else:
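With these exports in place, both new classes are importable from the package root (assuming torch is installed, since the imports sit inside the torch-available branch):

    from diffusers import VQDiffusionPipeline, VQDiffusionScheduler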

src/diffusers/models/embeddings.py

Lines changed: 0 additions & 49 deletions
@@ -115,52 +115,3 @@ def forward(self, x):
         x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi
         out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
         return out
-
-
-# TODO(will) - document this. check if throwing errors internally is appropriate
-class DalleMaskImageEmbedding(nn.Module):
-    def __init__(
-        self,
-        num_embed,
-        height,
-        width,
-        embed_dim,
-    ):
-        super().__init__()
-
-        self.height = height
-        self.width = width
-        # TODO(will) add docs on why this is incremented by 1. (Has to do with mask?)
-        self.num_embed = num_embed + 1
-        self.embed_dim = embed_dim
-
-        self.emb = nn.Embedding(self.num_embed, embed_dim)
-        self.height_emb = nn.Embedding(self.height, embed_dim)
-        self.width_emb = nn.Embedding(self.width, embed_dim)
-
-    def forward(self, index):
-        assert index.dim() == 2  # B x L
-        try:
-            index[index < 0] = 0
-            emb = self.emb(index)
-        except:
-            raise RuntimeError(
-                "IndexError: index out of range in self, max index {}, num embed {}".format(
-                    index.max(), self.num_embed
-                )
-            )
-
-        # add col and row embedding
-        if emb.shape[1] > 0:
-            height_emb = self.height_emb(
-                torch.arange(self.height, device=index.device).view(1, self.height)
-            ).unsqueeze(
-                2
-            )  # 1 x H x D -> 1 x H x 1 x D
-            width_emb = self.width_emb(torch.arange(self.width, device=index.device).view(1, self.width)).unsqueeze(
-                1
-            )  # 1 x W x D -> 1 x 1 x W x D
-            pos_emb = (height_emb + width_emb).view(1, self.height * self.width, -1)  # 1 x H x W x D -> 1 x L x D
-            emb = emb + pos_emb[:, : emb.shape[1], :]
-
-        return emb
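The class deleted here reappears in slightly simplified form in vq_diffusion_attention.py below. For reference, a standalone sketch of the row/column position-embedding scheme it implements (dimensions are made up for the example):

    import torch
    from torch import nn

    height, width, embed_dim = 3, 4, 8
    height_emb = nn.Embedding(height, embed_dim)
    width_emb = nn.Embedding(width, embed_dim)

    h = height_emb(torch.arange(height)).unsqueeze(1)  # H x 1 x D
    w = width_emb(torch.arange(width)).unsqueeze(0)    # 1 x W x D
    pos_emb = (h + w).view(height * width, embed_dim)  # broadcasts to H x W x D, flattened to L x D
    print(pos_emb.shape)                               # torch.Size([12, 8])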

src/diffusers/models/vq_diffusion_attention.py

Lines changed: 69 additions & 8 deletions
@@ -2,10 +2,10 @@

 import torch
 from torch import nn
+import torch.nn.functional as F

 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.modeling_utils import ModelMixin
-from diffusers.models.embeddings import DalleMaskImageEmbedding

 from .attention import CrossAttention

@@ -23,20 +23,27 @@ def __init__(
         width: int,
         diffusion_steps: int,
         dropout: float = 0.0,
+        min_logged_value: float = -70.0
     ):
         super().__init__()
+
         self.n_heads = n_heads
         self.d_head = d_head
-        inner_dim = n_heads * d_head
+        self.inner_dim = n_heads * d_head
+        self.min_logged_value = min_logged_value

+        # The input to the `DalleMaskImageEmbedding` layer is the
+        # embedding indices from the quantized codebook with an additional
+        # index for the masked value.
+        num_embed_with_mask = num_embed + 1
         self.latent_image_embedding = DalleMaskImageEmbedding(
-            num_embed=num_embed, embed_dim=inner_dim, height=height, width=width
+            num_embed=num_embed_with_mask, embed_dim=self.inner_dim, height=height, width=width
         )

         self.transformer_blocks = nn.ModuleList(
             [
                 BasicTransformerBlock(
-                    inner_dim,
+                    self.inner_dim,
                     n_heads,
                     d_head,
                     dropout=dropout,
@@ -48,21 +55,75 @@ def __init__(
4855
]
4956
)
5057

51-
self.norm_out = nn.LayerNorm(inner_dim)
52-
self.out = nn.Linear(inner_dim, num_embed)
58+
self.norm_out = nn.LayerNorm(self.inner_dim)
59+
60+
# The output from the transformer is the embedding indices for the
61+
# quantized codebook. It does not include additional index for the
62+
# masked value because the transformer predicts the unnoised image
63+
# which has no masks
64+
self.out = nn.Linear(self.inner_dim, num_embed)
5365

5466
def forward(self, latent_images, cond_emb, t):
67+
bsz = latent_images.shape[0]
68+
5569
embedded_latent_images = self.latent_image_embedding(latent_images)
5670
hidden_states = embedded_latent_images
5771

5872
for block in self.transformer_blocks:
5973
hidden_states = block(hidden_states, cond_emb, t)
6074

6175
logits = self.out(self.norm_out(hidden_states))
62-
out = logits.permute(0, 2, 1)
6376

64-
return out
77+
# equivalent to `torch.zeros((bsz, self.inner_dim, 1)).log().clamp(self.min_logged_value)`
78+
log_zero_vector = torch.full((bsz, self.inner_dim, 1), self.min_logged_value, device=logits.device)
79+
80+
log_x_0 = F.log_softmax(logits.double(), dim=-1).float().clamp(self.min_logged_value)
81+
log_x_0 = torch.cat((log_x_0, log_zero_vector), dim=-1)
82+
83+
# TODO(will) can remove?
84+
log_x_0 = log_x_0.permute(0, 2, 1)
85+
86+
return log_x_0
87+
88+
89+
# TODO(will) - document this
90+
class DalleMaskImageEmbedding(nn.Module):
91+
def __init__(
92+
self,
93+
num_embed,
94+
height,
95+
width,
96+
embed_dim,
97+
):
98+
super().__init__()
99+
100+
self.height = height
101+
self.width = width
102+
self.num_embed = num_embed
103+
self.embed_dim = embed_dim
104+
105+
self.emb = nn.Embedding(self.num_embed, embed_dim)
106+
self.height_emb = nn.Embedding(self.height, embed_dim)
107+
self.width_emb = nn.Embedding(self.width, embed_dim)
108+
109+
def forward(self, index):
110+
emb = self.emb(index)
111+
112+
height_emb = self.height_emb(
113+
torch.arange(self.height, device=index.device).view(1, self.height)
114+
).unsqueeze(
115+
2
116+
) # 1 x H x D -> 1 x H x 1 x D
117+
118+
width_emb = self.width_emb(torch.arange(self.width, device=index.device).view(1, self.width)).unsqueeze(
119+
1
120+
) # 1 x W x D -> 1 x 1 x W x D
121+
122+
pos_emb = (height_emb + width_emb).view(1, self.height * self.width, -1) # 1 x H x W x D -> 1 x L xD
123+
124+
emb = emb + pos_emb[:, : emb.shape[1], :]
65125

126+
return emb
66127

67128
class BasicTransformerBlock(nn.Module):
68129
def __init__(
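The reworked forward now returns per-token log-probabilities over the codebook classes plus one extra class for the masked token, rather than raw logits. A standalone sketch of that construction (shapes are chosen for illustration; the committed code sizes the log-zero column with self.inner_dim):

    import torch
    import torch.nn.functional as F

    bsz, seq_len, num_embed = 2, 4, 8
    min_logged_value = -70.0

    logits = torch.randn(bsz, seq_len, num_embed)             # scores per codebook entry, per token
    log_x_0 = F.log_softmax(logits.double(), dim=-1).float()  # log p(x_0 = k)
    log_x_0 = log_x_0.clamp(min_logged_value)                 # keep values finite, roughly log(0)

    # the predicted unnoised image never contains the mask token,
    # so its class gets a ~log(0) column appended
    log_zero = torch.full((bsz, seq_len, 1), min_logged_value)
    log_x_0 = torch.cat((log_x_0, log_zero), dim=-1)          # (bsz, seq_len, num_embed + 1)
    print(log_x_0.permute(0, 2, 1).shape)                     # (bsz, num_embed + 1, seq_len)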

src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py

Lines changed: 134 additions & 0 deletions
@@ -1,7 +1,32 @@
+from dataclasses import dataclass
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
 from diffusers import VQDiffusionTransformer, VQModel
+from diffusers.schedulers.scheduling_vq_diffusion import VQDiffusionScheduler
 from transformers import CLIPTextModel, CLIPTokenizer

 from ...pipeline_utils import DiffusionPipeline
+from ...utils import BaseOutput, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class VQDiffusionPipelineOutput(BaseOutput):
+    """
+    Args:
+    Output class for VQ Diffusion pipelines.
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]


 # This class is a placeholder and does not have the full VQ-diffusion pipeline built out yet
@@ -14,18 +39,127 @@
 class VQDiffusionPipeline(DiffusionPipeline):
     vqvae: VQModel
     transformer: VQDiffusionTransformer
+    text_encoder: CLIPTextModel
+    tokenizer: CLIPTokenizer
+    scheduler: VQDiffusionScheduler

     def __init__(
         self,
         vqvae: VQModel,
         transformer: VQDiffusionTransformer,
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
+        scheduler: VQDiffusionScheduler,
     ):
         super().__init__()
+
         self.register_modules(
             vqvae=vqvae,
             transformer=transformer,
             text_encoder=text_encoder,
             tokenizer=tokenizer,
+            scheduler=scheduler,
         )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 256,
+        width: int = 256,
+        num_inference_steps: int = 100,
+        num_images_per_prompt: int = 1,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion.
+        # While CLIP does normalize the pooled output of the text transformer when combining
+        # the image and text embeddings, CLIP does not directly normalize the last hidden state.
+        #
+        # CLIP normalizing the pooled output.
+        # https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053
+        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # get the initial random noise unless the user supplied it
+
+        # TODO I believe the latents are the indices of the of the vectors
+
+        # TODO HERE - what's the input shape?
+        latents_shape = TODO  # (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            # all masked?
+            latents = TODO  # torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(self.device)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # predict the un-noised image
+            log_x_start = TODO  # self.transformer(latents, t, encoder_hidden_states=text_embeddings).sample
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = TODO  # self.scheduler.step(x0_pred, t, latents).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        image = self.vqvae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return image
+
+        return VQDiffusionPipelineOutput(images=image)
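Once the remaining TODOs are resolved, the call pattern implied by this signature is the standard diffusers one. A usage sketch (checkpoint path and prompt are illustrative):

    from diffusers import VQDiffusionPipeline

    pipe = VQDiffusionPipeline.from_pretrained("./vq-diffusion-dump")  # hypothetical local checkpoint
    output = pipe("a painting of a corgi", num_inference_steps=100, num_images_per_prompt=2)
    output.images[0].save("corgi.png")  # with the default output_type="pil", images are PIL images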

src/diffusers/schedulers/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@
     from .scheduling_sde_ve import ScoreSdeVeScheduler
     from .scheduling_sde_vp import ScoreSdeVpScheduler
     from .scheduling_utils import SchedulerMixin
+    from .scheduling_vq_diffusion import VQDiffusionScheduler
 else:
     from ..utils.dummy_pt_objects import *  # noqa F403

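The seventh changed file, src/diffusers/schedulers/scheduling_vq_diffusion.py, is not shown in this view. A minimal scaffold consistent with how the pipeline calls it (set_timesteps plus a step method) could look roughly like the following; this is a sketch modeled on other diffusers schedulers, not the committed file:

    import torch

    from diffusers.configuration_utils import ConfigMixin, register_to_config
    from diffusers.schedulers.scheduling_utils import SchedulerMixin


    class VQDiffusionScheduler(SchedulerMixin, ConfigMixin):
        @register_to_config
        def __init__(self, num_train_timesteps: int = 100):
            # the reverse process walks from t = T - 1 down to 0
            self.timesteps = torch.arange(num_train_timesteps - 1, -1, -1)

        def set_timesteps(self, num_inference_steps: int):
            self.num_inference_steps = num_inference_steps
            self.timesteps = torch.arange(num_inference_steps - 1, -1, -1)

        def step(self, model_output, timestep, sample):
            # sampling x_{t-1} from the posterior given the predicted x_0 is the part
            # still to be built out
            raise NotImplementedError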
