huggingface · dg845 · Nov 29, 2023
diff --git a/src/diffusers/pipelines/ddgan/__init__.py b/src/diffusers/pipelines/ddgan/__init__.py
@@ -0,0 +1,22 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    _LazyModule,
+)
+
+
+_import_structure = {"pipeline_ddgan": ["DDGANPipeline"]}
+
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    from .pipeline_ddgan import DDGANPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
diff --git a/src/diffusers/pipelines/ddgan/pipeline_ddgan.py b/src/diffusers/pipelines/ddgan/pipeline_ddgan.py
@@ -0,0 +1,315 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+
+from ...image_processor import VaeImageProcessor
+from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.lora import adjust_lora_scale_text_encoder
+from ...schedulers import KarrasDiffusionSchedulers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    deprecate,
+    logging,
+    replace_example_docstring,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from ...utils.torch_utils import randn_tensor
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# TODO: update
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import DiffusionPipeline
+        >>> import torch
+
+        >>> pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7")
+        >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality.
+        >>> pipe.to(torch_device="cuda", torch_dtype=torch.float32)
+
+        >>> prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"
+
+        >>> # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps.
+        >>> num_inference_steps = 4
+        >>> images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images
+        >>> images[0].save("image.png")
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    **kwargs,
+):
+    """
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used,
+            `timesteps` must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+                Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
+                timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
+                must be `None`.
+
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    """
+    if timesteps is not None:
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
+
+
+class DDGANPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for denoising diffusion GAN (DDGAN) model.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        unet ([`UNet2DModel`]):
+            A `UNet2DModel` to denoise the noisy images.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the noisy images.
+    """
+
+    model_cpu_offload_seq = "unet"
+
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            unet=unet,
+            scheduler=scheduler,
+        )
+
+    # Copied from diffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels, height, width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    # Copied from diffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.postprocess_image
+    def postprocess_image(self, sample: torch.FloatTensor, output_type: str = "pil"):
+        if output_type not in ["pt", "np", "pil"]:
+            raise ValueError(
+                f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
+            )
+
+        # Equivalent to diffusers.VaeImageProcessor.denormalize
+        sample = (sample / 2 + 0.5).clamp(0, 1)
+        if output_type == "pt":
+            return sample
+
+        # Equivalent to diffusers.VaeImageProcessor.pt_to_numpy
+        sample = sample.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "np":
+            return sample
+
+        # Output_type must be 'pil'
+        sample = self.numpy_to_pil(sample)
+        return sample
+
+    # Copied from diffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.prepare_class_labels
+    def prepare_class_labels(self, batch_size, device, class_labels=None):
+        if self.unet.config.num_class_embeds is not None:
+            if isinstance(class_labels, list):
+                class_labels = torch.tensor(class_labels, dtype=torch.int)
+            elif isinstance(class_labels, int):
+                assert batch_size == 1, "Batch size must be 1 if classes is an int"
+                class_labels = torch.tensor([class_labels], dtype=torch.int)
+            elif class_labels is None:
+                # Randomly generate batch_size class labels
+                # TODO: should use generator here? int analogue of randn_tensor is not exposed in ...utils
+                class_labels = torch.randint(0, self.unet.config.num_class_embeds, size=(batch_size,))
+            class_labels = class_labels.to(device)
+        else:
+            class_labels = None
+
+    def check_inputs(self, num_inference_steps, timesteps, latents, batch_size, img_size, callback_steps):
+        if num_inference_steps is None and timesteps is None:
+            raise ValueError("Exactly one of `num_inference_steps` or `timesteps` must be supplied.")
+
+        if num_inference_steps is not None and timesteps is not None:
+            logger.warning(
+                f"Both `num_inference_steps`: {num_inference_steps} and `timesteps`: {timesteps} are supplied;"
+                " `timesteps` will be used over `num_inference_steps`."
+            )
+
+        if latents is not None:
+            expected_shape = (batch_size, 3, img_size, img_size)
+            if latents.shape != expected_shape:
+                raise ValueError(f"The shape of latents is {latents.shape} but is expected to be {expected_shape}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        batch_size: int = 1,
+        class_labels: Optional[Union[torch.Tensor, List[int], int]] = None,
+        num_inference_steps: int = 4,
+        timesteps: List[int] = None,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+    ):
+        r"""
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            class_labels (`torch.Tensor` or `List[int]` or `int`, *optional*):
+                Optional class labels for conditioning class-conditional consistency models. Not used if the model is
+                not class-conditional.
+            num_inference_steps (`int`, *optional*, defaults to 1):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`List[int]`, *optional*):
+                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
+                timesteps are used. Must be in descending order.
+            generator (`torch.Generator`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that calls every `callback_steps` steps during inference. The function is called with the
+                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
+                returned where the first element is a list with the generated images.
+        """
+        # 0. Prepare call parameters
+        img_size = self.unet.config.sample_size
+        device = self._execution_device
+
+        # 1. Check inputs
+        self.check_inputs(num_inference_steps, timesteps, latents, batch_size, img_size, callback_steps)
+
+        # 2. Prepare image latents
+        # Sample image latents x_T ~ N(0, sigma^2 * I)
+        sample = self.prepare_latents(
+            batch_size=batch_size,
+            num_channels=self.unet.config.in_channels,
+            height=img_size,
+            width=img_size,
+            dtype=self.unet.dtype,
+            device=device,
+            generator=generator,
+            latents=latents,
+        )
+
+        # 3. Handle class_labels for class-conditional models
+        class_labels = self.prepare_class_labels(batch_size, device, class_labels=class_labels)
+
+        # 4. Prepare timesteps
+        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+
+        # 5. Denoising loop
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                scaled_sample = self.scheduler.scale_model_input(sample, t)
+                model_output = self.unet(scaled_sample, t, class_labels=class_labels, return_dict=False)[0]
+
+                sample = self.scheduler.step(model_output, t, sample, generator=generator)[0]
+
+                # call the callback, if provided
+                progress_bar.update()
+                if callback is not None and i % callback_steps == 0:
+                    callback(i, t, sample)
+
+        # 6. Post-process image sample
+        image = self.postprocess_image(sample, output_type=output_type)
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)