From 50615d339b4b97136900b711fcae6638a292eeec Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 8 Mar 2023 02:58:20 +0000 Subject: [PATCH 01/45] add image_processor --- src/diffusers/__init__.py | 2 +- src/diffusers/image_processor.py | 160 ++++++++++++++++++ .../pipeline_stable_diffusion_img2img.py | 36 ++-- tests/test_image_processor.py | 63 +++++++ 4 files changed, 234 insertions(+), 27 deletions(-) create mode 100644 src/diffusers/image_processor.py create mode 100644 tests/test_image_processor.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f480b4100907..bde1b37c858f 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -16,7 +16,7 @@ is_unidecode_available, logging, ) - +from .image_processor import VaeImageProcessor try: if not is_onnx_available(): diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py new file mode 100644 index 000000000000..3887f1175737 --- /dev/null +++ b/src/diffusers/image_processor.py @@ -0,0 +1,160 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union, Optional + +import PIL +from PIL import Image +import torch +import numpy as np + +from .utils import PIL_INTERPOLATION, CONFIG_NAME +from .configuration_utils import ConfigMixin, register_to_config + +class VaeImageProcessor(ConfigMixin): + """ + Image Processor for VAE + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. + `do_resize` in the `preprocess` method. + vae_scale_factor (`int`, *optional*, defaults to `8`): + scale factor in VAE, if do_resize is True, the image will be automatically resized to multipls of vae_scale_factor + resample (`str`, *optional*, defaults to `lanczos`): + Resampling filter to use if resizing the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image to [-1,1] + """ + + config_name = CONFIG_NAME + + @register_to_config + def __init__( + self, + do_resize: bool = True, + vae_scale_factor: int = 8, + resample: str = "lanczos", + do_normalize: bool = True, + ): + super().__init__() + + @staticmethod + def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
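        # (note: a 3D array of shape (height, width, channels) is promoted above to a
        # batch of one, so the uint8 conversion below handles single images and
        # batches uniformly)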
+ images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + @staticmethod + def numpy_to_pt(images): + """ + Convert a numpy image to a pytorch tensor + """ + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + @staticmethod + def pt_to_numpy(images): + """ + Convert a numpy image to a pytorch tensor + """ + images = images.cpu().numpy().transpose(0, 2, 3, 1) + return images + + @staticmethod + def normalize(images): + """ + Normalize an image array to [-1,1] + """ + return 2.0 * images - 1.0 + + def resize(self, images: PIL.Image.Image) -> PIL.Image.Image: + """ + Resize an PIL image. Both height and width will be resized to integer multiple of vae_scale_factor + """ + w, h = images.size + w, h = map(lambda x: x - x % self.vae_scale_factor, (w, h)) # resize to integer multiple of vae_scale_factor + images = images.resize((w, h), resample=PIL_INTERPOLATION[self.resample]) + return images + + def encode( + self, + image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + ) -> torch.Tensor: + + """ + Preprocess the image input, accpet formats in PIL images, numpy arrays or pytorch tensors" + """ + # convert PIL or list of PIL into numpy + if isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + if self.do_resize: + image = [self.resize(i) for i in image] + image = [np.array(i).astype(np.float32) / 255.0 for i in image] + + if isinstance(image, np.ndarray): + image = self.numpy_to_pt(image) + elif isinstance(image[0], np.ndarray): + image = self.numpy_to_pt(np.stack(image, axis=0)) + elif not isinstance(image, torch.Tensor) and isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + + # expected range [0,1], normalize to [-1,1] + if image.min() < 0: + warnings.warn( + "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " + f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", + FutureWarning, + ) + self.do_normalize = False + + if self.do_normalize: + image = self.normalize(image) + + return image + + def decode( + self, + image, + output_type: str ='pil', + ): + + if output_type == 'pt': + return image + + image = self.pt_to_numpy(image) + + if output_type == 'np': + return image + elif output_type == 'pil': + return self.numpy_to_pil(image) + else: + raise ValueError(f"Unsupported output_type {output_type}.") + + \ No newline at end of file diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 172ab15a757e..526478cb46ad 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -36,6 +36,7 @@ from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker +from ...image_processor import VaeImageProcessor logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -69,27 +70,6 @@ """ -def preprocess(image): - if isinstance(image, torch.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = torch.from_numpy(image) - elif isinstance(image[0], torch.Tensor): - image = torch.cat(image, dim=0) - return image - - class StableDiffusionImg2ImgPipeline(DiffusionPipeline): r""" Pipeline for text-guided image to image generation using Stable Diffusion. @@ -195,8 +175,7 @@ def __init__( deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) - + unet._internal_dict = FrozenDict(new_config) self.register_modules( vae=vae, text_encoder=text_encoder, @@ -207,7 +186,12 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) + + vae_feature_extractor = VaeImageProcessor( + vae_scale_factor =self.vae_scale_factor) + self.register_to_config( + requires_safety_checker=requires_safety_checker, + vae_feature_extractor = vae_feature_extractor) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): @@ -674,7 +658,7 @@ def __call__( ) # 4. Preprocess image - image = preprocess(image) + image = self.vae_feature_extractor.encode(image) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -713,7 +697,7 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - + # 9. Post-processing image = self.decode_latents(latents) diff --git a/tests/test_image_processor.py b/tests/test_image_processor.py new file mode 100644 index 000000000000..b3377deb893e --- /dev/null +++ b/tests/test_image_processor.py @@ -0,0 +1,63 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
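
# A minimal sketch of the round trip these tests exercise, using the encode/decode
# API defined in image_processor.py above (do_resize and do_normalize are disabled
# so the round trip should be exact; the sample shape here is an arbitrary choice):

import numpy as np
from diffusers import VaeImageProcessor

processor = VaeImageProcessor(do_resize=False, do_normalize=False)
sample = np.random.rand(1, 64, 64, 3).astype(np.float32)  # NHWC floats in [0, 1]
tensor = processor.encode(sample)                          # -> torch.Tensor, NCHW layout
restored = processor.decode(tensor, output_type="np")      # -> back to NHWC numpy
assert np.abs(sample - restored).max() < 1e-6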
+ +import unittest +import torch +import numpy as np + +from diffusers import VaeImageProcessor + +class ImageProcessorTest(unittest.TestCase): + + @property + def dummy_sample(self): + batch_size = 4 + num_channels = 3 + height = 8 + width = 8 + + sample = torch.rand((batch_size, num_channels, height, width)) + + return sample + + def test_encode_input_pt(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) + + input_pt = self.dummy_sample + out_pt = image_processor.decode( + image_processor.encode(input_pt), + output_type='pt') + assert np.abs(input_pt.cpu().numpy() - out_pt.cpu().numpy()).max() < 1e-6 + + def test_encode_input_np(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) + + input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) + out_np = image_processor.decode( + image_processor.encode(input_np), + output_type='np') + assert np.abs(input_np - out_np).max() < 1e-6 + + def test_encode_input_pil(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) + + input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) + input_pil = image_processor.numpy_to_pil(input_np) + + out_pil = image_processor.decode( + image_processor.encode(input_pil), + output_type='pil') + for i, o in zip(input_pil, out_pil): + assert np.abs(np.array(i) - np.array(o)).max() == 0 \ No newline at end of file From d82730d1281c38bf74d6ff76355b5684089df565 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 9 Mar 2023 06:43:48 -1000 Subject: [PATCH 02/45] Apply suggestions from code review Co-authored-by: Patrick von Platen --- src/diffusers/image_processor.py | 21 ++++++++++++------- .../pipeline_stable_diffusion_img2img.py | 2 +- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 3887f1175737..2b5561e43558 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -50,7 +50,7 @@ def __init__( vae_scale_factor: int = 8, resample: str = "lanczos", do_normalize: bool = True, - ): + ): super().__init__() @staticmethod @@ -110,31 +110,38 @@ def encode( Preprocess the image input, accpet formats in PIL images, numpy arrays or pytorch tensors" """ # convert PIL or list of PIL into numpy - if isinstance(image, PIL.Image.Image): + supported_formats = [PIL.Image.Image, np.ndarray, torch.Tensor] + if isinstance(image, supported_formats): + image = [image] + elif isinstance(image, list) and all(isinstance(i, supported_formats) for i in image): + image = image + else: + raise ValueError("Raise nice error messages here that incorrect format is used.") image = [image] if isinstance(image[0], PIL.Image.Image): if self.do_resize: image = [self.resize(i) for i in image] image = [np.array(i).astype(np.float32) / 255.0 for i in image] + elif self.do_resize: + # Currently we only support resizing for PIL so in case np or torch is used AND resizing is activating (which it is by default) then let's do the following: - 1. 
Check if the image sizes are not a multiple of `self.vae_scale_factor` => If it's not the case we throw a nice error - if isinstance(image, np.ndarray): - image = self.numpy_to_pt(image) - elif isinstance(image[0], np.ndarray): + if isinstance(image[0], np.ndarray): image = self.numpy_to_pt(np.stack(image, axis=0)) elif not isinstance(image, torch.Tensor) and isinstance(image[0], torch.Tensor): image = torch.cat(image, dim=0) # expected range [0,1], normalize to [-1,1] + do_normalize = self.do_normalize if image.min() < 0: warnings.warn( "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", FutureWarning, ) - self.do_normalize = False + do_normalize = False - if self.do_normalize: + if do_normalize: image = self.normalize(image) return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 526478cb46ad..b1754987adc1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -191,7 +191,7 @@ def __init__( vae_scale_factor =self.vae_scale_factor) self.register_to_config( requires_safety_checker=requires_safety_checker, - vae_feature_extractor = vae_feature_extractor) + ) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): From d0d1437e4cca9f4254dfb1cee22a79c3f992a348 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Mar 2023 21:46:42 +0000 Subject: [PATCH 03/45] add more tests --- src/diffusers/image_processor.py | 20 +++++++---- tests/test_image_processor.py | 57 ++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 2b5561e43558..ec60b4a1030d 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -74,6 +74,10 @@ def numpy_to_pt(images): """ Convert a numpy image to a pytorch tensor """ + if images.ndim ==3: + images = images[...,None] + elif images.ndim==5: + images = images.squeeze(0) images = torch.from_numpy(images.transpose(0, 3, 1, 2)) return images @@ -109,28 +113,30 @@ def encode( """ Preprocess the image input, accpet formats in PIL images, numpy arrays or pytorch tensors" """ - # convert PIL or list of PIL into numpy - supported_formats = [PIL.Image.Image, np.ndarray, torch.Tensor] + supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) if isinstance(image, supported_formats): image = [image] elif isinstance(image, list) and all(isinstance(i, supported_formats) for i in image): image = image else: - raise ValueError("Raise nice error messages here that incorrect format is used.") - image = [image] - + raise ValueError(f"incorrect image format is used - currently we only support PIL image, numpy array or pytorch tensor") + if isinstance(image[0], PIL.Image.Image): if self.do_resize: image = [self.resize(i) for i in image] image = [np.array(i).astype(np.float32) / 255.0 for i in image] - elif self.do_resize: - # Currently we only support resizing for PIL so in case np or torch is used AND resizing is activating (which it is by default) then let's do the following: - 
1. Check if the image sizes are not a multiple of `self.vae_scale_factor` => If it's not the case we throw a nice error if isinstance(image[0], np.ndarray): image = self.numpy_to_pt(np.stack(image, axis=0)) elif not isinstance(image, torch.Tensor) and isinstance(image[0], torch.Tensor): image = torch.cat(image, dim=0) + if image.ndim==5: + image = image.squeeze(0) + _, _, height, width = image.shape + if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): + raise ValueError(f"the height and width of image have to be divisible by {self.vae_scale_factor} but are {height} and {width}.") + # expected range [0,1], normalize to [-1,1] do_normalize = self.do_normalize if image.min() < 0: diff --git a/tests/test_image_processor.py b/tests/test_image_processor.py index b3377deb893e..d3e683988d9c 100644 --- a/tests/test_image_processor.py +++ b/tests/test_image_processor.py @@ -17,13 +17,16 @@ import torch import numpy as np +import PIL + from diffusers import VaeImageProcessor + class ImageProcessorTest(unittest.TestCase): @property def dummy_sample(self): - batch_size = 4 + batch_size = 1 num_channels = 3 height = 8 width = 8 @@ -31,24 +34,41 @@ def dummy_sample(self): sample = torch.rand((batch_size, num_channels, height, width)) return sample - + + def to_np(self, image): + if isinstance(image[0], PIL.Image.Image): + return np.stack([np.array(i) for i in image],axis=0) + elif isinstance(image, torch.Tensor): + return image.cpu().numpy().transpose(0, 2, 3, 1) + return image + def test_encode_input_pt(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - + input_pt = self.dummy_sample - out_pt = image_processor.decode( - image_processor.encode(input_pt), - output_type='pt') - assert np.abs(input_pt.cpu().numpy() - out_pt.cpu().numpy()).max() < 1e-6 + input_np = self.to_np(input_pt) + + for output_type in ['pt','np','pil']: + out = image_processor.decode( + image_processor.encode(input_pt), + output_type=output_type, + ) + out_np = self.to_np(out) + in_np = (input_np * 255).round() if output_type == 'pil' else input_np + assert np.abs(in_np - out_np).max() < 1e-6, f"decoded output does not match input for output_type {output_type}" def test_encode_input_np(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) - out_np = image_processor.decode( - image_processor.encode(input_np), - output_type='np') - assert np.abs(input_np - out_np).max() < 1e-6 + + for output_type in ['pt','np','pil']: + out = image_processor.decode( + image_processor.encode(input_np), + output_type=output_type) + + out_np = self.to_np(out) + in_np = (input_np * 255).round() if output_type == 'pil' else input_np + assert np.abs(in_np - out_np).max() < 1e-6, f"decoded output does not match input for output_type {output_type}" def test_encode_input_pil(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) @@ -56,8 +76,11 @@ def test_encode_input_pil(self): input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) input_pil = image_processor.numpy_to_pil(input_np) - out_pil = image_processor.decode( - image_processor.encode(input_pil), - output_type='pil') - for i, o in zip(input_pil, out_pil): - assert np.abs(np.array(i) - np.array(o)).max() == 0 \ No newline at end of file + for output_type in ['pt','np','pil']: + out = image_processor.decode( + image_processor.encode(input_pil), + output_type=output_type) + for i, o in zip(input_pil, 
out): + in_np = np.array(i) + out_np = self.to_np(out) if output_type == 'pil' else (self.to_np(out) * 255).round() + assert np.abs(in_np - out_np).max() < 1e-6, f"decoded output does not match input for output_type {output_type}" \ No newline at end of file From da62e8d3a3a1c85ae891fc314640508bd9858c3d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Mar 2023 21:48:38 +0000 Subject: [PATCH 04/45] make style --- src/diffusers/__init__.py | 3 +- src/diffusers/image_processor.py | 65 ++++++++++--------- .../pipeline_stable_diffusion_img2img.py | 13 ++-- tests/test_image_processor.py | 51 ++++++++------- 4 files changed, 66 insertions(+), 66 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index bde1b37c858f..cebe6d9bcfaf 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,6 +1,7 @@ __version__ = "0.15.0.dev0" from .configuration_utils import ConfigMixin +from .image_processor import VaeImageProcessor from .utils import ( OptionalDependencyNotAvailable, is_flax_available, @@ -16,7 +17,7 @@ is_unidecode_available, logging, ) -from .image_processor import VaeImageProcessor + try: if not is_onnx_available(): diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index ec60b4a1030d..c1f877ff694f 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union, Optional +from typing import Union +import numpy as np import PIL -from PIL import Image import torch -import numpy as np +from PIL import Image -from .utils import PIL_INTERPOLATION, CONFIG_NAME from .configuration_utils import ConfigMixin, register_to_config +from .utils import CONFIG_NAME, PIL_INTERPOLATION + class VaeImageProcessor(ConfigMixin): """ @@ -31,7 +32,7 @@ class VaeImageProcessor(ConfigMixin): Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `size`. + Whether to resize the image's (height, width) dimensions to the specified `size`. `do_resize` in the `preprocess` method. vae_scale_factor (`int`, *optional*, defaults to `8`): scale factor in VAE, if do_resize is True, the image will be automatically resized to multipls of vae_scale_factor @@ -40,7 +41,7 @@ class VaeImageProcessor(ConfigMixin): do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image to [-1,1] """ - + config_name = CONFIG_NAME @register_to_config @@ -68,15 +69,15 @@ def numpy_to_pil(images): pil_images = [Image.fromarray(image) for image in images] return pil_images - + @staticmethod def numpy_to_pt(images): """ Convert a numpy image to a pytorch tensor """ - if images.ndim ==3: - images = images[...,None] - elif images.ndim==5: + if images.ndim == 3: + images = images[..., None] + elif images.ndim == 5: images = images.squeeze(0) images = torch.from_numpy(images.transpose(0, 3, 1, 2)) return images @@ -95,7 +96,7 @@ def normalize(images): Normalize an image array to [-1,1] """ return 2.0 * images - 1.0 - + def resize(self, images: PIL.Image.Image) -> PIL.Image.Image: """ Resize an PIL image. 
Both height and width will be resized to integer multiple of vae_scale_factor @@ -106,21 +107,22 @@ def resize(self, images: PIL.Image.Image) -> PIL.Image.Image: return images def encode( - self, + self, image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], ) -> torch.Tensor: - """ Preprocess the image input, accpet formats in PIL images, numpy arrays or pytorch tensors" """ - supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) + supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) if isinstance(image, supported_formats): image = [image] elif isinstance(image, list) and all(isinstance(i, supported_formats) for i in image): image = image else: - raise ValueError(f"incorrect image format is used - currently we only support PIL image, numpy array or pytorch tensor") - + raise ValueError( + "incorrect image format is used - currently we only support PIL image, numpy array or pytorch tensor" + ) + if isinstance(image[0], PIL.Image.Image): if self.do_resize: image = [self.resize(i) for i in image] @@ -130,44 +132,43 @@ def encode( image = self.numpy_to_pt(np.stack(image, axis=0)) elif not isinstance(image, torch.Tensor) and isinstance(image[0], torch.Tensor): image = torch.cat(image, dim=0) - - if image.ndim==5: + + if image.ndim == 5: image = image.squeeze(0) _, _, height, width = image.shape if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): - raise ValueError(f"the height and width of image have to be divisible by {self.vae_scale_factor} but are {height} and {width}.") + raise ValueError( + f"the height and width of image have to be divisible by {self.vae_scale_factor} but are {height} and {width}." + ) # expected range [0,1], normalize to [-1,1] do_normalize = self.do_normalize - if image.min() < 0: + if image.min() < 0: warnings.warn( "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " f"when passing as pytorch tensor or numpy Array. 
You passed `image` with value range [{image.min()},{image.max()}]", FutureWarning, - ) + ) do_normalize = False - + if do_normalize: image = self.normalize(image) return image def decode( - self, + self, image, - output_type: str ='pil', - ): - - if output_type == 'pt': + output_type: str = "pil", + ): + if output_type == "pt": return image - + image = self.pt_to_numpy(image) - if output_type == 'np': + if output_type == "np": return image - elif output_type == 'pil': + elif output_type == "pil": return self.numpy_to_pil(image) else: raise ValueError(f"Unsupported output_type {output_type}.") - - \ No newline at end of file diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index b1754987adc1..34eeae444359 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -15,17 +15,16 @@ import inspect from typing import Callable, List, Optional, Union -import numpy as np import PIL import torch from packaging import version from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, @@ -36,7 +35,6 @@ from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker -from ...image_processor import VaeImageProcessor logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -175,7 +173,7 @@ def __init__( deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 - unet._internal_dict = FrozenDict(new_config) + unet._internal_dict = FrozenDict(new_config) self.register_modules( vae=vae, text_encoder=text_encoder, @@ -187,11 +185,10 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - vae_feature_extractor = VaeImageProcessor( - vae_scale_factor =self.vae_scale_factor) + VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config( requires_safety_checker=requires_safety_checker, - ) + ) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload def enable_sequential_cpu_offload(self, gpu_id=0): @@ -697,7 +694,7 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - + # 9. Post-processing image = self.decode_latents(latents) diff --git a/tests/test_image_processor.py b/tests/test_image_processor.py index d3e683988d9c..6734611e98dd 100644 --- a/tests/test_image_processor.py +++ b/tests/test_image_processor.py @@ -14,16 +14,15 @@ # limitations under the License. 
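
# For reference, a sketch of the array layout convention the processor helpers
# convert between, mirroring numpy_to_pt and pt_to_numpy above (shapes arbitrary):

import numpy as np
import torch

nhwc = np.random.rand(2, 8, 8, 3).astype(np.float32)   # numpy: (batch, height, width, channels)
nchw = torch.from_numpy(nhwc.transpose(0, 3, 1, 2))    # torch: (batch, channels, height, width)
back = nchw.cpu().permute(0, 2, 3, 1).float().numpy()  # the inverse permutation
assert np.abs(nhwc - back).max() == 0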
import unittest -import torch -import numpy as np +import numpy as np import PIL +import torch from diffusers import VaeImageProcessor class ImageProcessorTest(unittest.TestCase): - @property def dummy_sample(self): batch_size = 1 @@ -37,50 +36,52 @@ def dummy_sample(self): def to_np(self, image): if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image],axis=0) + return np.stack([np.array(i) for i in image], axis=0) elif isinstance(image, torch.Tensor): return image.cpu().numpy().transpose(0, 2, 3, 1) return image def test_encode_input_pt(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - + input_pt = self.dummy_sample input_np = self.to_np(input_pt) - - for output_type in ['pt','np','pil']: + + for output_type in ["pt", "np", "pil"]: out = image_processor.decode( image_processor.encode(input_pt), output_type=output_type, - ) + ) out_np = self.to_np(out) - in_np = (input_np * 255).round() if output_type == 'pil' else input_np - assert np.abs(in_np - out_np).max() < 1e-6, f"decoded output does not match input for output_type {output_type}" + in_np = (input_np * 255).round() if output_type == "pil" else input_np + assert ( + np.abs(in_np - out_np).max() < 1e-6 + ), f"decoded output does not match input for output_type {output_type}" def test_encode_input_np(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) - - for output_type in ['pt','np','pil']: - out = image_processor.decode( - image_processor.encode(input_np), - output_type=output_type) - + + for output_type in ["pt", "np", "pil"]: + out = image_processor.decode(image_processor.encode(input_np), output_type=output_type) + out_np = self.to_np(out) - in_np = (input_np * 255).round() if output_type == 'pil' else input_np - assert np.abs(in_np - out_np).max() < 1e-6, f"decoded output does not match input for output_type {output_type}" + in_np = (input_np * 255).round() if output_type == "pil" else input_np + assert ( + np.abs(in_np - out_np).max() < 1e-6 + ), f"decoded output does not match input for output_type {output_type}" def test_encode_input_pil(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) input_pil = image_processor.numpy_to_pil(input_np) - - for output_type in ['pt','np','pil']: - out = image_processor.decode( - image_processor.encode(input_pil), - output_type=output_type) + + for output_type in ["pt", "np", "pil"]: + out = image_processor.decode(image_processor.encode(input_pil), output_type=output_type) for i, o in zip(input_pil, out): in_np = np.array(i) - out_np = self.to_np(out) if output_type == 'pil' else (self.to_np(out) * 255).round() - assert np.abs(in_np - out_np).max() < 1e-6, f"decoded output does not match input for output_type {output_type}" \ No newline at end of file + out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round() + assert ( + np.abs(in_np - out_np).max() < 1e-6 + ), f"decoded output does not match input for output_type {output_type}" From 98146d08b3537fca2c59335fdf211fa13274883a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Mar 2023 21:53:30 +0000 Subject: [PATCH 05/45] fix --- src/diffusers/image_processor.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index c1f877ff694f..ac10fb269fba 100644 --- 
a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings from typing import Union import numpy as np @@ -27,15 +28,13 @@ class VaeImageProcessor(ConfigMixin): """ Image Processor for VAE - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `size`. - `do_resize` in the `preprocess` method. + Whether to resize the image's (height, width) dimensions to the specified `size`. `do_resize` in the + `preprocess` method. vae_scale_factor (`int`, *optional*, defaults to `8`): - scale factor in VAE, if do_resize is True, the image will be automatically resized to multipls of vae_scale_factor + scale factor in VAE, if do_resize is True, the image will be automatically resized to multipls of + vae_scale_factor resample (`str`, *optional*, defaults to `lanczos`): Resampling filter to use if resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): @@ -73,7 +72,7 @@ def numpy_to_pil(images): @staticmethod def numpy_to_pt(images): """ - Convert a numpy image to a pytorch tensor + Convert a numpy image to a pytorch tensor """ if images.ndim == 3: images = images[..., None] @@ -85,7 +84,7 @@ def numpy_to_pt(images): @staticmethod def pt_to_numpy(images): """ - Convert a numpy image to a pytorch tensor + Convert a numpy image to a pytorch tensor """ images = images.cpu().numpy().transpose(0, 2, 3, 1) return images From d223e8e7762f36f9faef83e51d6fb57aae67112a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Mar 2023 22:52:35 +0000 Subject: [PATCH 06/45] update img2mg --- src/diffusers/__init__.py | 2 +- .../pipeline_stable_diffusion_img2img.py | 49 +++++++++++++------ .../test_stable_diffusion_img2img.py | 2 +- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index cebe6d9bcfaf..c239b8da73cd 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,7 +1,6 @@ __version__ = "0.15.0.dev0" from .configuration_utils import ConfigMixin -from .image_processor import VaeImageProcessor from .utils import ( OptionalDependencyNotAvailable, is_flax_available, @@ -91,6 +90,7 @@ VQDiffusionScheduler, ) from .training_utils import EMAModel + from .image_processor import VaeImageProcessor try: if not (is_torch_available() and is_scipy_available()): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 34eeae444359..5be078ac16dc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -185,7 +185,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.vae_feature_extractor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config( requires_safety_checker=requires_safety_checker, ) @@ -403,10 +403,9 @@ def _encode_prompt( return prompt_embeds - # Copied from 
diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + safety_checker_input = self.feature_extractor(self.vae_feature_extractor.numpy_to_pil(image), return_tensors="pt").to(device) image, has_nsfw_concept = self.safety_checker( images=image, clip_input=safety_checker_input.pixel_values.to(dtype) ) @@ -420,7 +419,7 @@ def decode_latents(self, latents): image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() + #image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -694,16 +693,38 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - - # 9. Post-processing - image = self.decode_latents(latents) - - # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + + if output_type is None: + output_type = 'np' + + if output_type == "latent": + image = latents + has_nsfw_concept = None + elif output_type == "pt": + # 8. Post-processing + image = self.decode_latents(latents) + has_nsfw_concept = None + elif output_type == "np": + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image = self.vae_feature_extractor.decode(image, output_type='np') + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + elif output_type == 'pil': + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image = self.vae_feature_extractor.decode(image, output_type='np') + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. 
Convert to PIL + image = self.vae_feature_extractor.numpy_to_pil(image) + + else: + raise ValueError(f"Unsupported output_type {output_type} ") # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 77dfa9be1d1e..a8cd62a1c198 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -106,7 +106,7 @@ def get_dummy_inputs(self, device, seed=0): "generator": generator, "num_inference_steps": 2, "guidance_scale": 6.0, - "output_type": "numpy", + "output_type": "np", } return inputs From 5eb759213c273ac2e0db6c11559b41629e1a38c5 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Mar 2023 22:53:07 +0000 Subject: [PATCH 07/45] style --- src/diffusers/__init__.py | 2 +- .../pipeline_stable_diffusion_img2img.py | 24 ++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index c239b8da73cd..4315669cc459 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -32,6 +32,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_pt_objects import * # noqa F403 else: + from .image_processor import VaeImageProcessor from .models import ( AutoencoderKL, ControlNetModel, @@ -90,7 +91,6 @@ VQDiffusionScheduler, ) from .training_utils import EMAModel - from .image_processor import VaeImageProcessor try: if not (is_torch_available() and is_scipy_available()): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 5be078ac16dc..5df32865e8c3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -405,7 +405,9 @@ def _encode_prompt( def run_safety_checker(self, image, device, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.vae_feature_extractor.numpy_to_pil(image), return_tensors="pt").to(device) + safety_checker_input = self.feature_extractor( + self.vae_feature_extractor.numpy_to_pil(image), return_tensors="pt" + ).to(device) image, has_nsfw_concept = self.safety_checker( images=image, clip_input=safety_checker_input.pixel_values.to(dtype) ) @@ -419,7 +421,7 @@ def decode_latents(self, latents): image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - #image = image.cpu().permute(0, 2, 3, 1).float().numpy() + # image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs @@ -693,10 +695,10 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - - if output_type is None: - output_type = 'np' - + + if output_type is None: + output_type = "np" + if output_type == "latent": image = latents has_nsfw_concept = None @@ -709,20 +711,20 @@ def __call__( image = self.decode_latents(latents) # 9. 
Run safety checker - image = self.vae_feature_extractor.decode(image, output_type='np') + image = self.vae_feature_extractor.decode(image, output_type="np") image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - elif output_type == 'pil': + + elif output_type == "pil": # 8. Post-processing image = self.decode_latents(latents) # 9. Run safety checker - image = self.vae_feature_extractor.decode(image, output_type='np') + image = self.vae_feature_extractor.decode(image, output_type="np") image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) # 10. Convert to PIL image = self.vae_feature_extractor.numpy_to_pil(image) - + else: raise ValueError(f"Unsupported output_type {output_type} ") From af21a0d8fcb7726a498515d2f0a596e849480f04 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Thu, 9 Mar 2023 23:09:59 +0000 Subject: [PATCH 08/45] fix --- src/diffusers/image_processor.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index ac10fb269fba..f97f77860b4d 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -86,7 +86,7 @@ def pt_to_numpy(images): """ Convert a numpy image to a pytorch tensor """ - images = images.cpu().numpy().transpose(0, 2, 3, 1) + images = images.cpu().permute(0, 2, 3, 1).float().numpy() return images @staticmethod diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 5df32865e8c3..60a10cb37277 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -696,16 +696,15 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - if output_type is None: - output_type = "np" - if output_type == "latent": image = latents has_nsfw_concept = None + elif output_type == "pt": # 8. Post-processing image = self.decode_latents(latents) has_nsfw_concept = None + elif output_type == "np": # 8. 
Post-processing image = self.decode_latents(latents) From 803c93e2140c738f6315a447d2d40b105936b280 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sun, 12 Mar 2023 23:59:05 +0000 Subject: [PATCH 09/45] apply feedbacks --- src/diffusers/image_processor.py | 18 ++--- .../pipeline_stable_diffusion_img2img.py | 53 +++++-------- .../test_stable_diffusion_img2img.py | 63 ++++++++++++--- tests/test_image_processor.py | 79 +++++++++++++++++-- 4 files changed, 153 insertions(+), 60 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index f97f77860b4d..d8edc8223cc8 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -76,8 +76,7 @@ def numpy_to_pt(images): """ if images.ndim == 3: images = images[..., None] - elif images.ndim == 5: - images = images.squeeze(0) + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) return images @@ -105,7 +104,7 @@ def resize(self, images: PIL.Image.Image) -> PIL.Image.Image: images = images.resize((w, h), resample=PIL_INTERPOLATION[self.resample]) return images - def encode( + def preprocess( self, image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], ) -> torch.Tensor: @@ -128,12 +127,11 @@ def encode( image = [np.array(i).astype(np.float32) / 255.0 for i in image] if isinstance(image[0], np.ndarray): - image = self.numpy_to_pt(np.stack(image, axis=0)) - elif not isinstance(image, torch.Tensor) and isinstance(image[0], torch.Tensor): - image = torch.cat(image, dim=0) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = self.numpy_to_pt(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - if image.ndim == 5: - image = image.squeeze(0) _, _, height, width = image.shape if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): raise ValueError( @@ -155,12 +153,12 @@ def encode( return image - def decode( + def postprocess( self, image, output_type: str = "pil", ): - if output_type == "pt": + if isinstance(image, torch.Tensor) and output_type == "pt": return image image = self.pt_to_numpy(image) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 60a10cb37277..444354417168 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -185,7 +185,7 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.vae_feature_extractor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.register_to_config( requires_safety_checker=requires_safety_checker, ) @@ -404,15 +404,14 @@ def _encode_prompt( return prompt_embeds def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.vae_feature_extractor.numpy_to_pil(image), return_tensors="pt" + feature_extractor_input = self.image_processor.postprocess(image, output_type='pil') + safety_checker_input = self.feature_extractor( + feature_extractor_input, + return_tensors="pt" ).to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + image, 
has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) ) - else: - has_nsfw_concept = None return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -656,7 +655,7 @@ def __call__( ) # 4. Preprocess image - image = self.vae_feature_extractor.encode(image) + image = self.image_processor.preprocess(image) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -695,37 +694,27 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) + + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False ) + output_type = "np" if output_type == "latent": image = latents has_nsfw_concept = None - elif output_type == "pt": - # 8. Post-processing - image = self.decode_latents(latents) - has_nsfw_concept = None - - elif output_type == "np": - # 8. Post-processing - image = self.decode_latents(latents) + image = self.decode_latents(latents) - # 9. Run safety checker - image = self.vae_feature_extractor.decode(image, output_type="np") - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - elif output_type == "pil": - # 8. Post-processing - image = self.decode_latents(latents) - - # 9. Run safety checker - image = self.vae_feature_extractor.decode(image, output_type="np") + if self.safety_checker is not None: image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) - - # 10. 
Convert to PIL - image = self.vae_feature_extractor.numpy_to_pil(image) - else: - raise ValueError(f"Unsupported output_type {output_type} ") + has_nsfw_concept = False + + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index a8cd62a1c198..5e91766e23a5 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -29,6 +29,7 @@ PNDMScheduler, StableDiffusionImg2ImgPipeline, UNet2DConditionModel, + VaeImageProcessor, ) from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps @@ -94,19 +95,33 @@ def get_dummy_components(self): } return components - def get_dummy_inputs(self, device, seed=0): + def get_dummy_inputs(self, device, seed=0, input_image_type='pt', output_type='np'): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: generator = torch.Generator(device=device).manual_seed(seed) + + if input_image_type == 'pt': + input_image = image + elif input_image_type == 'np': + input_image = image.cpu().numpy().transpose(0, 2, 3, 1) + elif input_image_type == 'pil': + input_image = image.cpu().numpy().transpose(0, 2, 3, 1) + input_image = VaeImageProcessor.numpy_to_pil(input_image) + else: + raise ValueError(f"unsupported input_image_type {input_image_type}.") + + if output_type not in ['pt', 'np', 'pil']: + raise ValueError(f"unsupported output_type {output_type}") + inputs = { "prompt": "A painting of a squirrel eating a burger", - "image": image, + "image": input_image, "generator": generator, "num_inference_steps": 2, "guidance_scale": 6.0, - "output_type": "np", + "output_type": output_type, } return inputs @@ -122,7 +137,7 @@ def test_stable_diffusion_img2img_default_case(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218]) + expected_slice = np.array([0.46275955, 0.3977616, 0.42548066, 0.5823421, 0.50115615, 0.43968713, 0.41080174, 0.47410887, 0.42165133]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -140,8 +155,7 @@ def test_stable_diffusion_img2img_negative_prompt(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365]) - + expected_slice = np.array([0.4104152, 0.38498846, 0.41070235, 0.52090424, 0.47205922, 0.42849067, 0.41589636, 0.46834698, 0.4408132]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 def test_stable_diffusion_img2img_multiple_init_images(self): @@ -158,7 +172,7 @@ def test_stable_diffusion_img2img_multiple_init_images(self): image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689]) + expected_slice = np.array([0.46686086, 0.44393504, 0.5084481, 0.67471784, 0.55514234, 0.5346449, 0.63654804, 0.51626045, 0.46245307]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -177,7 
+191,7 @@ def test_stable_diffusion_img2img_k_lms(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203]) + expected_slice = np.array([0.4390018, 0.49910325, 0.43994197, 0.6633433, 0.56556225, 0.44274506, 0.58594346, 0.60113865, 0.52007025]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -197,7 +211,36 @@ def test_save_load_optional_components(self): def test_attention_slicing_forward_pass(self): return super().test_attention_slicing_forward_pass() + @skip_mps + def test_pt_np_pil_outputs_equivalent(self): + device = 'cpu' + components = self.get_dummy_components() + sd_pipe = StableDiffusionImg2ImgPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + output_pt = sd_pipe(**self.get_dummy_inputs(device, output_type='pt'))[0] + output_np = sd_pipe(**self.get_dummy_inputs(device, output_type='np'))[0] + output_pil = sd_pipe(**self.get_dummy_inputs(device, output_type='pil'))[0] + assert np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4 + assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4 + + @skip_mps + def test_image_types_consistent(self): + device = 'cpu' + components = self.get_dummy_components() + sd_pipe = StableDiffusionImg2ImgPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + output_pt = sd_pipe(**self.get_dummy_inputs(device, input_image_type='pt'))[0] + output_np = sd_pipe(**self.get_dummy_inputs(device, input_image_type='np'))[0] + output_pil = sd_pipe(**self.get_dummy_inputs(device, input_image_type='pil'))[0] + + assert np.abs(output_pt - output_np).max() <= 1e-4 + assert np.abs(output_pil - output_np).max() <= 1e-2 + @slow @require_torch_gpu class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase): @@ -219,7 +262,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 "num_inference_steps": 3, "strength": 0.75, "guidance_scale": 7.5, - "output_type": "numpy", + "output_type": "np", } return inputs @@ -426,7 +469,7 @@ def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0 "num_inference_steps": 50, "strength": 0.75, "guidance_scale": 7.5, - "output_type": "numpy", + "output_type": "np", } return inputs diff --git a/tests/test_image_processor.py b/tests/test_image_processor.py index 6734611e98dd..87ef11aa611e 100644 --- a/tests/test_image_processor.py +++ b/tests/test_image_processor.py @@ -40,16 +40,16 @@ def to_np(self, image): elif isinstance(image, torch.Tensor): return image.cpu().numpy().transpose(0, 2, 3, 1) return image - - def test_encode_input_pt(self): + + def test_vae_image_processor_pt(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) input_pt = self.dummy_sample input_np = self.to_np(input_pt) for output_type in ["pt", "np", "pil"]: - out = image_processor.decode( - image_processor.encode(input_pt), + out = image_processor.postprocess( + image_processor.preprocess(input_pt), output_type=output_type, ) out_np = self.to_np(out) @@ -58,12 +58,12 @@ def test_encode_input_pt(self): np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" - def test_encode_input_np(self): + def test_vae_image_processor_np(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) input_np = 
self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) for output_type in ["pt", "np", "pil"]: - out = image_processor.decode(image_processor.encode(input_np), output_type=output_type) + out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) out_np = self.to_np(out) in_np = (input_np * 255).round() if output_type == "pil" else input_np @@ -71,17 +71,80 @@ def test_encode_input_np(self): np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" - def test_encode_input_pil(self): + def test_vae_image_processor_pil(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) input_pil = image_processor.numpy_to_pil(input_np) for output_type in ["pt", "np", "pil"]: - out = image_processor.decode(image_processor.encode(input_pil), output_type=output_type) + out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) for i, o in zip(input_pil, out): in_np = np.array(i) out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round() assert ( np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" + + def test_preprocess_input_3d(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) + + input_pt_4d = self.dummy_sample + input_pt_3d = input_pt_4d.squeeze(0) + + out_pt_4d = image_processor.postprocess( + image_processor.preprocess(input_pt_4d), + output_type="np", + ) + out_pt_3d = image_processor.postprocess( + image_processor.preprocess(input_pt_3d), + output_type="np", + ) + + input_np_4d = self.to_np(self.dummy_sample) + input_np_3d = input_np_4d.squeeze(0) + + out_np_4d = image_processor.postprocess( + image_processor.preprocess(input_np_4d), + output_type="np", + ) + out_np_3d = image_processor.postprocess( + image_processor.preprocess(input_np_3d), + output_type="np", + ) + + assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6 + assert np.abs(out_np_4d - out_np_3d).max() < 1e-6 + + def test_preprocess_input_list(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) + + input_pt_4d = self.dummy_sample + input_pt_list = list(input_pt_4d) + + out_pt_4d = image_processor.postprocess( + image_processor.preprocess(input_pt_4d), + output_type="np", + ) + + out_pt_list = image_processor.postprocess( + image_processor.preprocess(input_pt_list), + output_type="np", + ) + + input_np_4d = self.to_np(self.dummy_sample) + input_np_list = list(input_np_4d) + + out_np_4d = image_processor.postprocess( + image_processor.preprocess(input_pt_4d), + output_type="np", + ) + + out_np_list = image_processor.postprocess( + image_processor.preprocess(input_pt_list), + output_type="np", + ) + + assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6 + assert np.abs(out_np_4d - out_np_list).max() < 1e-6 + From 5c6de08f0e147a9a8499892e1ddd9f3300b85878 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 13 Mar 2023 00:04:34 +0000 Subject: [PATCH 10/45] fix style --- .../pipeline_stable_diffusion_img2img.py | 15 ++--- .../test_stable_diffusion_img2img.py | 53 ++++++++++------- tests/test_image_processor.py | 59 +++++++++---------- 3 files changed, 66 insertions(+), 61 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 444354417168..25e580bcd096 100644 --- 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -404,14 +404,11 @@ def _encode_prompt( return prompt_embeds def run_safety_checker(self, image, device, dtype): - feature_extractor_input = self.image_processor.postprocess(image, output_type='pil') - safety_checker_input = self.feature_extractor( - feature_extractor_input, - return_tensors="pt" - ).to(device) + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) image, has_nsfw_concept = self.safety_checker( images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) + ) return image, has_nsfw_concept # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -694,13 +691,13 @@ def __call__( progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) - + if output_type not in ["latent", "pt", "np", "pil"]: deprecation_message = ( f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " "`pil`, `np`, `pt`, `latent`" ) - deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) output_type = "np" if output_type == "latent": @@ -713,7 +710,7 @@ def __call__( image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) else: has_nsfw_concept = False - + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 5e91766e23a5..504fc416e244 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -95,24 +95,24 @@ def get_dummy_components(self): } return components - def get_dummy_inputs(self, device, seed=0, input_image_type='pt', output_type='np'): + def get_dummy_inputs(self, device, seed=0, input_image_type="pt", output_type="np"): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: generator = torch.Generator(device=device).manual_seed(seed) - - if input_image_type == 'pt': + + if input_image_type == "pt": input_image = image - elif input_image_type == 'np': + elif input_image_type == "np": input_image = image.cpu().numpy().transpose(0, 2, 3, 1) - elif input_image_type == 'pil': + elif input_image_type == "pil": input_image = image.cpu().numpy().transpose(0, 2, 3, 1) input_image = VaeImageProcessor.numpy_to_pil(input_image) else: raise ValueError(f"unsupported input_image_type {input_image_type}.") - - if output_type not in ['pt', 'np', 'pil']: + + if output_type not in ["pt", "np", "pil"]: raise ValueError(f"unsupported output_type {output_type}") inputs = { @@ -137,7 +137,9 @@ def test_stable_diffusion_img2img_default_case(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.46275955, 0.3977616, 0.42548066, 0.5823421, 0.50115615, 0.43968713, 0.41080174, 0.47410887, 0.42165133]) + expected_slice = np.array( + [0.46275955, 0.3977616, 0.42548066, 0.5823421, 
0.50115615, 0.43968713, 0.41080174, 0.47410887, 0.42165133] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -155,7 +157,9 @@ def test_stable_diffusion_img2img_negative_prompt(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4104152, 0.38498846, 0.41070235, 0.52090424, 0.47205922, 0.42849067, 0.41589636, 0.46834698, 0.4408132]) + expected_slice = np.array( + [0.4104152, 0.38498846, 0.41070235, 0.52090424, 0.47205922, 0.42849067, 0.41589636, 0.46834698, 0.4408132] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 def test_stable_diffusion_img2img_multiple_init_images(self): @@ -172,7 +176,9 @@ def test_stable_diffusion_img2img_multiple_init_images(self): image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([0.46686086, 0.44393504, 0.5084481, 0.67471784, 0.55514234, 0.5346449, 0.63654804, 0.51626045, 0.46245307]) + expected_slice = np.array( + [0.46686086, 0.44393504, 0.5084481, 0.67471784, 0.55514234, 0.5346449, 0.63654804, 0.51626045, 0.46245307] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -191,7 +197,9 @@ def test_stable_diffusion_img2img_k_lms(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4390018, 0.49910325, 0.43994197, 0.6633433, 0.56556225, 0.44274506, 0.58594346, 0.60113865, 0.52007025]) + expected_slice = np.array( + [0.4390018, 0.49910325, 0.43994197, 0.6633433, 0.56556225, 0.44274506, 0.58594346, 0.60113865, 0.52007025] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -213,34 +221,35 @@ def test_attention_slicing_forward_pass(self): @skip_mps def test_pt_np_pil_outputs_equivalent(self): - device = 'cpu' + device = "cpu" components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) - - output_pt = sd_pipe(**self.get_dummy_inputs(device, output_type='pt'))[0] - output_np = sd_pipe(**self.get_dummy_inputs(device, output_type='np'))[0] - output_pil = sd_pipe(**self.get_dummy_inputs(device, output_type='pil'))[0] + + output_pt = sd_pipe(**self.get_dummy_inputs(device, output_type="pt"))[0] + output_np = sd_pipe(**self.get_dummy_inputs(device, output_type="np"))[0] + output_pil = sd_pipe(**self.get_dummy_inputs(device, output_type="pil"))[0] assert np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4 assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4 @skip_mps def test_image_types_consistent(self): - device = 'cpu' + device = "cpu" components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) - - output_pt = sd_pipe(**self.get_dummy_inputs(device, input_image_type='pt'))[0] - output_np = sd_pipe(**self.get_dummy_inputs(device, input_image_type='np'))[0] - output_pil = sd_pipe(**self.get_dummy_inputs(device, input_image_type='pil'))[0] + + output_pt = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pt"))[0] + output_np = sd_pipe(**self.get_dummy_inputs(device, input_image_type="np"))[0] + output_pil = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pil"))[0] assert np.abs(output_pt - output_np).max() <= 1e-4 assert np.abs(output_pil - output_np).max() <= 1e-2 - + + @slow @require_torch_gpu class 
StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase): diff --git a/tests/test_image_processor.py b/tests/test_image_processor.py index 87ef11aa611e..ce2483590668 100644 --- a/tests/test_image_processor.py +++ b/tests/test_image_processor.py @@ -40,7 +40,7 @@ def to_np(self, image): elif isinstance(image, torch.Tensor): return image.cpu().numpy().transpose(0, 2, 3, 1) return image - + def test_vae_image_processor_pt(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) @@ -93,58 +93,57 @@ def test_preprocess_input_3d(self): input_pt_3d = input_pt_4d.squeeze(0) out_pt_4d = image_processor.postprocess( - image_processor.preprocess(input_pt_4d), - output_type="np", - ) + image_processor.preprocess(input_pt_4d), + output_type="np", + ) out_pt_3d = image_processor.postprocess( - image_processor.preprocess(input_pt_3d), - output_type="np", - ) + image_processor.preprocess(input_pt_3d), + output_type="np", + ) input_np_4d = self.to_np(self.dummy_sample) input_np_3d = input_np_4d.squeeze(0) out_np_4d = image_processor.postprocess( - image_processor.preprocess(input_np_4d), - output_type="np", - ) + image_processor.preprocess(input_np_4d), + output_type="np", + ) out_np_3d = image_processor.postprocess( - image_processor.preprocess(input_np_3d), - output_type="np", - ) - + image_processor.preprocess(input_np_3d), + output_type="np", + ) + assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6 assert np.abs(out_np_4d - out_np_3d).max() < 1e-6 def test_preprocess_input_list(self): image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - + input_pt_4d = self.dummy_sample input_pt_list = list(input_pt_4d) out_pt_4d = image_processor.postprocess( - image_processor.preprocess(input_pt_4d), - output_type="np", - ) + image_processor.preprocess(input_pt_4d), + output_type="np", + ) out_pt_list = image_processor.postprocess( - image_processor.preprocess(input_pt_list), - output_type="np", - ) - + image_processor.preprocess(input_pt_list), + output_type="np", + ) + input_np_4d = self.to_np(self.dummy_sample) input_np_list = list(input_np_4d) out_np_4d = image_processor.postprocess( - image_processor.preprocess(input_np_4d), - output_type="np", - ) + image_processor.preprocess(input_np_4d), + output_type="np", + ) out_np_list = image_processor.postprocess( - image_processor.preprocess(input_np_list), - output_type="np", - ) + image_processor.preprocess(input_np_list), + output_type="np", + ) assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6 assert np.abs(out_np_4d - out_np_list).max() < 1e-6 - From e07a9bea6805f4ca9c844d1bb3be1cf4396d398e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 13 Mar 2023 00:16:35 +0000 Subject: [PATCH 11/45] remove fixed copies on img2img preprocess --- .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 1 - .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py | 1 - .../stable_diffusion/pipeline_stable_diffusion_depth2img.py | 1 - .../stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py | 1 - 4 files changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 1e7872e3b081..d6e448443d60 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -64,7 +64,6 @@ """ -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
def preprocess(image): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index e977071b9c6c..d54d0b7b8466 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -35,7 +35,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 6c02e06a6523..39df07e4e527 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -32,7 +32,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index b5a352c785ee..8d34466fc6d1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -169,7 +169,6 @@ class Pix2PixInversionPipelineOutput(BaseOutput): """ -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image From cd2721fddb43ef09ceedadc7da079a5761a012f3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 13 Mar 2023 00:21:16 +0000 Subject: [PATCH 12/45] fix --- src/diffusers/pipelines/repaint/pipeline_repaint.py | 1 - .../stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py | 1 - .../pipeline_stable_diffusion_instruct_pix2pix.py | 1 - 3 files changed, 3 deletions(-) diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py index fabcd2610f43..4f449fddcca3 100644 --- a/src/diffusers/pipelines/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py @@ -28,7 +28,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def _preprocess_image(image: Union[List, PIL.Image.Image, torch.Tensor]): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 9123e5f3296d..32ba3fa901e8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -31,7 +31,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64 def preprocess(image): if isinstance(image, 
torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 953df11aa4f7..35238ea6e57b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -38,7 +38,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image From 2c702f10af64573a63578842748e72d92792cf51 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:34:27 -1000 Subject: [PATCH 13/45] Update src/diffusers/image_processor.py Co-authored-by: Pedro Cuenca --- src/diffusers/image_processor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index d8edc8223cc8..975f150727ef 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -30,8 +30,7 @@ class VaeImageProcessor(ConfigMixin): Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `size`. `do_resize` in the - `preprocess` method. + Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. vae_scale_factor (`int`, *optional*, defaults to `8`): scale factor in VAE, if do_resize is True, the image will be automatically resized to multipls of vae_scale_factor From dc508d6ddd249ab06827985731809b8c21888962 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:34:54 -1000 Subject: [PATCH 14/45] Update src/diffusers/image_processor.py Co-authored-by: Pedro Cuenca --- src/diffusers/image_processor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 975f150727ef..0ee19294efc9 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -32,8 +32,7 @@ class VaeImageProcessor(ConfigMixin): do_resize (`bool`, *optional*, defaults to `True`): Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. vae_scale_factor (`int`, *optional*, defaults to `8`): - scale factor in VAE, if do_resize is True, the image will be automatically resized to multipls of - vae_scale_factor + VAE scale factor. If `do_resize` is True, the image will be automatically resized to multiples of this factor. resample (`str`, *optional*, defaults to `lanczos`): Resampling filter to use if resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): From 3475dec3d6b75225809f526a19b645e7b30c1b7f Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:35:08 -1000 Subject: [PATCH 15/45] Update src/diffusers/image_processor.py Co-authored-by: Pedro Cuenca --- src/diffusers/image_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 0ee19294efc9..379503e5eaf1 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -34,7 +34,7 @@ class VaeImageProcessor(ConfigMixin): vae_scale_factor (`int`, *optional*, defaults to `8`): VAE scale factor. 
If `do_resize` is True, the image will be automatically resized to multiples of this factor. resample (`str`, *optional*, defaults to `lanczos`): - Resampling filter to use if resizing the image. + Resampling filter to use when resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image to [-1,1] """ From e3a0b133e599b781d9367fbb8499771995321c5a Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:35:22 -1000 Subject: [PATCH 16/45] Update src/diffusers/image_processor.py Co-authored-by: Pedro Cuenca --- src/diffusers/image_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 379503e5eaf1..da9c5e8a32d0 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -95,7 +95,7 @@ def normalize(images): def resize(self, images: PIL.Image.Image) -> PIL.Image.Image: """ - Resize an PIL image. Both height and width will be resized to integer multiple of vae_scale_factor + Resize a PIL image. Both height and width will be downscaled to the next integer multiple of `vae_scale_factor` """ w, h = images.size w, h = map(lambda x: x - x % self.vae_scale_factor, (w, h)) # resize to integer multiple of vae_scale_factor From 63b2418777480d6faec2847b1110d3070511d2ce Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:35:35 -1000 Subject: [PATCH 17/45] Update src/diffusers/image_processor.py Co-authored-by: Patrick von Platen --- src/diffusers/image_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index da9c5e8a32d0..3d3a589251b2 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -113,7 +113,6 @@ def preprocess( if isinstance(image, supported_formats): image = [image] elif isinstance(image, list) and all(isinstance(i, supported_formats) for i in image): - image = image else: raise ValueError( "incorrect image format is used - currently we only support PIL image, numpy array or pytorch tensor" From 2847d4b031467ea81ee1e5af5c401dfafdb912cd Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:38:09 -1000 Subject: [PATCH 18/45] Update src/diffusers/image_processor.py Co-authored-by: Patrick von Platen --- src/diffusers/image_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 3d3a589251b2..7b968e96a3ae 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -112,7 +112,6 @@ def preprocess( supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) if isinstance(image, supported_formats): image = [image] - elif isinstance(image, list) and all(isinstance(i, supported_formats) for i in image): else: raise ValueError( "incorrect image format is used - currently we only support PIL image, numpy array or pytorch tensor" From f6e5af05a410f0a2436efb2ac841cb49a670279c Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:38:19 -1000 Subject: [PATCH 19/45] Update src/diffusers/image_processor.py Co-authored-by: Patrick von Platen --- src/diffusers/image_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 7b968e96a3ae..398ab6856c93 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -112,7 +112,7 @@ def preprocess( supported_formats = 
(PIL.Image.Image, np.ndarray, torch.Tensor) if isinstance(image, supported_formats): image = [image] - else: + elif not (isinstance(image, list) and all(isinstance(i, supported_formats)): raise ValueError( "incorrect image format is used - currently we only support PIL image, numpy array or pytorch tensor" ) From 771f6c0ae94292ac7a5ea5d16b93386183575efd Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:38:32 -1000 Subject: [PATCH 20/45] Update src/diffusers/image_processor.py Co-authored-by: Patrick von Platen --- src/diffusers/image_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 398ab6856c93..b9ad5ff30d8f 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -114,7 +114,7 @@ def preprocess( image = [image] elif not (isinstance(image, list) and all(isinstance(i, supported_formats)): raise ValueError( - "incorrect image format is used - currently we only support PIL image, numpy array or pytorch tensor" + f"Input is in incorrect format: {[type(i) for i in image)}. Currently, we only support {', '.join(supported_formats)}" ) if isinstance(image[0], PIL.Image.Image): From 26e95145bb01a57ce58301af8b66f043ba57661f Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 13 Mar 2023 14:39:10 -1000 Subject: [PATCH 21/45] Update src/diffusers/image_processor.py Co-authored-by: Patrick von Platen --- src/diffusers/image_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index b9ad5ff30d8f..f796ac6cc276 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -129,6 +129,7 @@ def preprocess( image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) _, _, height, width = image.shape + if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): raise ValueError( f"the height and width of image have to be divisible by {self.vae_scale_factor} but are {height} and {width}." From 7c9b9f74d769154eef42de436ae12af35c424650 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 00:52:01 +0000 Subject: [PATCH 22/45] fix typos --- src/diffusers/image_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index f796ac6cc276..1d20c45bd18e 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -112,9 +112,9 @@ def preprocess( supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) if isinstance(image, supported_formats): image = [image] - elif not (isinstance(image, list) and all(isinstance(i, supported_formats)): + elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): raise ValueError( - f"Input is in incorrect format: {[type(i) for i in image)}. Currently, we only support {', '.join(supported_formats)}" + f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support {', '.join(str(f) for f in supported_formats)}" ) if isinstance(image[0], PIL.Image.Image): From f009e9781fd048819cd645724301020b4d53483e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 00:54:16 +0000 Subject: [PATCH 23/45] add back preprocess function --- .../pipeline_stable_diffusion_img2img.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 25e580bcd096..691b946dd3c2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -68,6 +68,27 @@ """ +def preprocess(image): + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + class StableDiffusionImg2ImgPipeline(DiffusionPipeline): r""" Pipeline for text-guided image to image generation using Stable Diffusion. From e2f7cf4243b57449e768d4b692a50248d4a88dc7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 00:55:09 +0000 Subject: [PATCH 24/45] Revert "remove fixed copies on img2img preprocess" This reverts commit e07a9bea6805f4ca9c844d1bb3be1cf4396d398e.
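Restoring the markers alongside the re-added module-level preprocess helpers keeps the duplicates under the repo's copy mechanism: the `# Copied from` comment is what lets `make fix-copies` regenerate each duplicate from the img2img original when they drift apart. An illustrative sketch of the convention (not code introduced by this revert):

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
    def preprocess(image):
        ...  # body is kept identical to the img2img source by the copy checker

Dropping the marker, as the reverted commit did, silently opts a copy out of that synchronization, which is why the comments come back here.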
--- .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 1 + .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_depth2img.py | 1 + .../stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py | 1 + 4 files changed, 4 insertions(+) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index d6e448443d60..1e7872e3b081 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -64,6 +64,7 @@ """ +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index d54d0b7b8466..e977071b9c6c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -35,6 +35,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 39df07e4e527..6c02e06a6523 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -32,6 +32,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 8d34466fc6d1..b5a352c785ee 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -169,6 +169,7 @@ class Pix2PixInversionPipelineOutput(BaseOutput): """ +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image From c1569be82a303330a81bf6c4e68811036891c963 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 00:57:17 +0000 Subject: [PATCH 25/45] Revert "fix" This reverts commit cd2721fddb43ef09ceedadc7da079a5761a012f3. 
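This also restores the ONNX pipeline's marker with its `with 8->64` suffix, which asks the copy checker to apply that textual substitution when regenerating the function. A sketch of the practical effect on the helper's PIL branch (only the modulus differs from the img2img source):

    # img2img original: round (w, h) down to multiples of 8
    w, h = map(lambda x: x - x % 8, (w, h))
    # ONNX copy after the 8->64 substitution: multiples of 64
    w, h = map(lambda x: x - x % 64, (w, h))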
--- src/diffusers/pipelines/repaint/pipeline_repaint.py | 1 + .../stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py | 1 + .../pipeline_stable_diffusion_instruct_pix2pix.py | 1 + 3 files changed, 3 insertions(+) diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py index 4f449fddcca3..fabcd2610f43 100644 --- a/src/diffusers/pipelines/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py @@ -28,6 +28,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def _preprocess_image(image: Union[List, PIL.Image.Image, torch.Tensor]): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 32ba3fa901e8..9123e5f3296d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -31,6 +31,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64 def preprocess(image): if isinstance(image, torch.Tensor): return image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 35238ea6e57b..953df11aa4f7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -38,6 +38,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): if isinstance(image, torch.Tensor): return image From 9cf2c0bc29ce3cea1f87b7fa8db4747d8296f3c5 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 01:14:29 +0000 Subject: [PATCH 26/45] revert change in expected slice --- .../test_stable_diffusion_img2img.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 504fc416e244..77c943d4d30f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -137,9 +137,7 @@ def test_stable_diffusion_img2img_default_case(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.46275955, 0.3977616, 0.42548066, 0.5823421, 0.50115615, 0.43968713, 0.41080174, 0.47410887, 0.42165133] - ) + expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -157,9 +155,8 @@ def test_stable_diffusion_img2img_negative_prompt(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.4104152, 0.38498846, 0.41070235, 0.52090424, 0.47205922, 0.42849067, 0.41589636, 0.46834698, 0.4408132] - ) + expected_slice = np.array([0.4065, 
0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 def test_stable_diffusion_img2img_multiple_init_images(self): @@ -176,9 +173,7 @@ def test_stable_diffusion_img2img_multiple_init_images(self): image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array( - [0.46686086, 0.44393504, 0.5084481, 0.67471784, 0.55514234, 0.5346449, 0.63654804, 0.51626045, 0.46245307] - ) + expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -197,9 +192,7 @@ def test_stable_diffusion_img2img_k_lms(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [0.4390018, 0.49910325, 0.43994197, 0.6633433, 0.56556225, 0.44274506, 0.58594346, 0.60113865, 0.52007025] - ) + expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 From 1fe112cba66085945e3dacb891f630e8beee68c3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 01:38:59 +0000 Subject: [PATCH 27/45] fix img2img tests --- .../stable_diffusion/test_stable_diffusion_img2img.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 77c943d4d30f..5ab341b8a892 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -129,6 +129,7 @@ def test_stable_diffusion_img2img_default_case(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) + sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -145,6 +146,7 @@ def test_stable_diffusion_img2img_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) + sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -163,6 +165,7 @@ def test_stable_diffusion_img2img_multiple_init_images(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) + sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) @@ -184,6 +187,7 @@ def test_stable_diffusion_img2img_k_lms(self): beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" ) sd_pipe = StableDiffusionImg2ImgPipeline(**components) + sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) From 2f4cadec808635568f99b6dd412e4ed222099ec3 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 01:42:34 +0000 Subject: [PATCH 28/45] make style --- 
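Note for reviewers: besides the reflow, this pulls in the `numpy` and `PIL_INTERPOLATION` imports that the re-added `preprocess` helper relies on. For reference, the round-trip the image-processor tests exercise looks roughly like the sketch below; `sample` is a hypothetical stand-in for any NCHW batch with values in [0, 1], not a name taken from this patch.

    import torch
    from diffusers import VaeImageProcessor

    processor = VaeImageProcessor(do_resize=False, do_normalize=False)
    sample = torch.rand(1, 3, 32, 32)  # hypothetical [0, 1] batch
    out = processor.postprocess(processor.preprocess(sample), output_type="np")
    assert out.shape == (1, 32, 32, 3)  # NHWC numpy array, values unchanged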
src/diffusers/image_processor.py | 2 +- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 1d20c45bd18e..52f84a98bbd8 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -129,7 +129,7 @@ def preprocess( image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) _, _, height, width = image.shape - + if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): raise ValueError( f"the height and width of image have to be divisible by {self.vae_scale_factor} but are {height} and {width}." diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 691b946dd3c2..5f9771242452 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -15,6 +15,7 @@ import inspect from typing import Callable, List, Optional, Union +import numpy as np import PIL import torch from packaging import version @@ -25,6 +26,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( + PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, @@ -88,7 +90,7 @@ def preprocess(image): image = torch.cat(image, dim=0) return image - + class StableDiffusionImg2ImgPipeline(DiffusionPipeline): r""" Pipeline for text-guided image to image generation using Stable Diffusion. From 90e0539f6ce2952f4349a4ceabfcd6f6cfff889d Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 01:45:37 +0000 Subject: [PATCH 29/45] remove #fixed copies on img2img init method --- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 5f9771242452..63584c3fbb8d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -120,7 +120,6 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline): """ _optional_components = ["safety_checker", "feature_extractor"] - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ def __init__( self, vae: AutoencoderKL, From 983f4e924818babaaa098a4f824b985f1443ee36 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 01:48:39 +0000 Subject: [PATCH 30/45] remove #copy on img2img decode_latents --- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 63584c3fbb8d..684843517af6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -433,7 +433,6 @@ def run_safety_checker(self, image, device, dtype): ) return image, has_nsfw_concept - # Copied from
diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample From 8ab5015a85616d5c7b938311ece214a16e42fbb7 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 03:20:49 +0000 Subject: [PATCH 31/45] update alt_img2img --- .../pipeline_alt_diffusion_img2img.py | 47 ++++++++++++------- src/diffusers/utils/dummy_pt_objects.py | 15 ++++++ .../test_alt_diffusion_img2img.py | 4 +- 3 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 1e7872e3b081..c9d64b934ab0 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -24,6 +24,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor, replace_example_docstring @@ -192,7 +193,6 @@ def __init__( new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) - self.register_modules( vae=vae, text_encoder=text_encoder, @@ -203,7 +203,11 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.register_to_config(requires_safety_checker=requires_safety_checker) + + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config( + requires_safety_checker=requires_safety_checker, + ) def enable_sequential_cpu_offload(self, gpu_id=0): r""" @@ -415,13 +419,11 @@ def _encode_prompt( return prompt_embeds def run_safety_checker(self, image, device, dtype): - if self.safety_checker is not None: - safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) - image, has_nsfw_concept = self.safety_checker( - images=image, clip_input=safety_checker_input.pixel_values.to(dtype) - ) - else: - has_nsfw_concept = None + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) return image, has_nsfw_concept def decode_latents(self, latents): @@ -429,7 +431,7 @@ def decode_latents(self, latents): image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - image = image.cpu().permute(0, 2, 3, 1).float().numpy() + # image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def prepare_extra_step_kwargs(self, generator, eta): @@ -663,7 +665,7 @@ def __call__( ) # 4. Preprocess image - image = preprocess(image) + image = self.image_processor.preprocess(image) # 5. 
set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -703,15 +705,26 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) - # 9. Post-processing + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + image = latents + has_nsfw_concept = None + image = self.decode_latents(latents) - # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + if self.safety_checker is not None: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + has_nsfw_concept = False - # 11. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index c731a1f1ddf3..1fcfb91d72e0 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -2,6 +2,21 @@ from ..utils import DummyObject, requires_backends +class VaeImageProcessor(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class AutoencoderKL(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index d2745115af1c..9663f509dbc5 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -21,7 +21,7 @@ import torch from transformers import XLMRobertaTokenizer -from diffusers import AltDiffusionImg2ImgPipeline, AutoencoderKL, PNDMScheduler, UNet2DConditionModel +from diffusers import AltDiffusionImg2ImgPipeline, AutoencoderKL, PNDMScheduler, UNet2DConditionModel, VaeImageProcessor from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( RobertaSeriesConfig, RobertaSeriesModelWithTransformation, @@ -128,6 +128,7 @@ def test_stable_diffusion_img2img_default_case(self): safety_checker=None, feature_extractor=self.dummy_extractor, ) + alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False) alt_pipe = alt_pipe.to(device) alt_pipe.set_progress_bar_config(disable=None) @@ -191,6 +192,7 @@ def test_stable_diffusion_img2img_fp16(self): safety_checker=None, feature_extractor=self.dummy_extractor, ) + alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False) alt_pipe = alt_pipe.to(torch_device) alt_pipe.set_progress_bar_config(disable=None) From 4cc2d0e3ab576b7b4c006c4d2ccb690e982118ce Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 03:23:16 +0000 Subject: [PATCH 32/45] style --- .../pipelines/altdiffusion/test_alt_diffusion_img2img.py | 8 +++++++- 1 file changed, 7 
insertions(+), 1 deletion(-) diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index 9663f509dbc5..10a3ef777b0e 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -21,7 +21,13 @@ import torch from transformers import XLMRobertaTokenizer -from diffusers import AltDiffusionImg2ImgPipeline, AutoencoderKL, PNDMScheduler, UNet2DConditionModel, VaeImageProcessor +from diffusers import ( + AltDiffusionImg2ImgPipeline, + AutoencoderKL, + PNDMScheduler, + UNet2DConditionModel, + VaeImageProcessor, +) from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( RobertaSeriesConfig, RobertaSeriesModelWithTransformation, From d919e695f4b56dea6e50016019f7eac5f4de736e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 04:14:22 +0000 Subject: [PATCH 33/45] deprecate preprocess --- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 684843517af6..f8a55d026248 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -71,6 +71,11 @@ def preprocess(image): + warnings.warn( + "The function preprocess is deprecated and will be removed. Please" + " use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): From daa3d32da4edf6fb30c91f25de2230656ede6ee2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 04:23:07 +0000 Subject: [PATCH 34/45] style + copy --- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 6 ++++++ src/diffusers/pipelines/repaint/pipeline_repaint.py | 6 ++++++ .../stable_diffusion/pipeline_cycle_diffusion.py | 6 ++++++ .../pipeline_onnx_stable_diffusion_img2img.py | 5 +++++ .../pipeline_stable_diffusion_depth2img.py | 6 ++++++ .../pipeline_stable_diffusion_img2img.py | 9 +++++---- .../pipeline_stable_diffusion_instruct_pix2pix.py | 6 ++++++ .../pipeline_stable_diffusion_pix2pix_zero.py | 6 ++++++ 8 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index c9d64b934ab0..656223c04616 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -67,6 +68,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The function preprocess is deprecated and will be removed. 
Please" + " use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py index fabcd2610f43..5f4a65f43f17 100644 --- a/src/diffusers/pipelines/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py @@ -13,6 +13,7 @@ # limitations under the License. +import warnings from typing import List, Optional, Tuple, Union import numpy as np @@ -30,6 +31,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def _preprocess_image(image: Union[List, PIL.Image.Image, torch.Tensor]): + warnings.warn( + "The function preprocess is deprecated and will be removed. Please" + " use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index e977071b9c6c..43751de89d98 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -37,6 +38,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The function preprocess is deprecated and will be removed. Please" + " use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 9123e5f3296d..3391e61e33d8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -33,6 +34,10 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64 def preprocess(image): + warnings.warn( + "The function preprocess is deprecated and will be removed. 
Please use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 6c02e06a6523..309109a42779 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -14,6 +14,7 @@ import contextlib import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -34,6 +35,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The function preprocess is deprecated and will be removed. Please" + " use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index f8a55d026248..3cf71f6551a0 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -72,10 +73,10 @@ def preprocess(image): warnings.warn( - "The function preprocess is deprecated and will be removed. Please" - " use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) + "The function preprocess is deprecated and will be removed. Please" + " use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 953df11aa4f7..2c5c07f19e5b 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -40,6 +41,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The function preprocess is deprecated and will be removed. Please" + " use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index b5a352c785ee..5494cd267478 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect +import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Union @@ -171,6 +172,11 @@ class Pix2PixInversionPipelineOutput(BaseOutput): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The function preprocess is deprecated and will be removed. Please" + " use VAEImageProcessor.preprocess instead.", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): From cd83878a7d02ac6799adc88520281d13b878744e Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 04:57:50 +0000 Subject: [PATCH 35/45] style again --- src/diffusers/image_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 52f84a98bbd8..7df632722e41 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -32,7 +32,8 @@ class VaeImageProcessor(ConfigMixin): do_resize (`bool`, *optional*, defaults to `True`): Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. vae_scale_factor (`int`, *optional*, defaults to `8`): - VAE scale factor. If `do_resize` is True, the image will be automatically resized to multiples of this factor. + VAE scale factor. If `do_resize` is True, the image will be automatically resized to multiples of this + factor. resample (`str`, *optional*, defaults to `lanczos`): Resampling filter to use when resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): From 3dbb862627ded0b50f18f1b01a9c9902841d4dc2 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 19:39:38 +0000 Subject: [PATCH 36/45] update error message for using resize with torch tensor or numpy array --- src/diffusers/image_processor.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 7df632722e41..bb9f53f8ad4c 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -122,19 +122,26 @@ def preprocess( if self.do_resize: image = [self.resize(i) for i in image] image = [np.array(i).astype(np.float32) / 255.0 for i in image] + image = np.stack(image, axis=0) # to np + image = self.numpy_to_pt(image) # to pt - if isinstance(image[0], np.ndarray): + elif isinstance(image[0], np.ndarray): image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = self.numpy_to_pt(image) + _, _, height, width = image.shape + if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): + raise ValueError( + f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.vae_scale_factor}" + f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" + ) + elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - - _, _, height, width = image.shape - - if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): - raise ValueError( - f"the height and width of image have to be divisible by {self.vae_scale_factor} but are {height} and {width}." 
- ) + if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): + raise ValueError( + f"Currently we only support resizing for PIL images - please resize your numpy array to be divisible by {self.vae_scale_factor}; " + f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" + ) # expected range [0,1], normalize to [-1,1] do_normalize = self.do_normalize From ef8582ffcdff7a4a009d8fe8770e7d0587d4df2b Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 19:45:10 +0000 Subject: [PATCH 37/45] fix --- src/diffusers/image_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index bb9f53f8ad4c..e64be787d386 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -137,6 +137,7 @@ def preprocess( elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + _, _, height, width = image.shape if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): raise ValueError( f"Currently we only support resizing for PIL images - please resize your numpy array to be divisible by {self.vae_scale_factor}; " f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) From 0cec7375854c1ef7ead1b1fbf63972ea6c15f6ee Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 19:49:30 +0000 Subject: [PATCH 38/45] remove deprecation warning for preprocess function + fix copies --- src/diffusers/image_processor.py | 6 +++--- .../alt_diffusion/pipeline_alt_diffusion_img2img.py | 6 ------ src/diffusers/pipelines/repaint/pipeline_repaint.py | 6 ------ .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py | 6 ------ .../pipeline_onnx_stable_diffusion_img2img.py | 5 ----- .../stable_diffusion/pipeline_stable_diffusion_depth2img.py | 6 ------ .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 6 ------ .../pipeline_stable_diffusion_instruct_pix2pix.py | 6 ------ .../pipeline_stable_diffusion_pix2pix_zero.py | 6 ------ 9 files changed, 3 insertions(+), 50 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index e64be787d386..74299cf2628d 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -133,16 +133,16 @@ def preprocess( raise ValueError( f"Currently we only support resizing for PIL images - please resize your numpy array to be divisible by {self.vae_scale_factor}; " f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" - ) + ) elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) _, _, height, width = image.shape if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): raise ValueError( - f"Currently we only support resizing for PIL images - please resize your numpy array to be divisible by {self.vae_scale_factor}; " + f"Currently we only support resizing for PIL images - please resize your numpy array to be divisible by {self.vae_scale_factor}; " f"currently the sizes are {height} and {width}. 
You can also pass a PIL image instead to use resize option in VAEImageProcessor" - ) + ) # expected range [0,1], normalize to [-1,1] do_normalize = self.do_normalize diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 656223c04616..c9d64b934ab0 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -68,11 +67,6 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): - warnings.warn( - "The function preprocess is deprecated and will be removed. Please" - " use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py index 5f4a65f43f17..fabcd2610f43 100644 --- a/src/diffusers/pipelines/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py @@ -13,7 +13,6 @@ # limitations under the License. -import warnings from typing import List, Optional, Tuple, Union import numpy as np @@ -31,11 +30,6 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def _preprocess_image(image: Union[List, PIL.Image.Image, torch.Tensor]): - warnings.warn( - "The function preprocess is deprecated and will be removed. Please" - " use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 43751de89d98..e977071b9c6c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -38,11 +37,6 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): - warnings.warn( - "The function preprocess is deprecated and will be removed. Please" - " use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 3391e61e33d8..9123e5f3296d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -13,7 +13,6 @@ # limitations under the License. 
import inspect -import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -34,10 +33,6 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64 def preprocess(image): - warnings.warn( - "The function preprocess is deprecated and will be removed. Please use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index e270f1fe4ceb..9087064ae0b8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -14,7 +14,6 @@ import contextlib import inspect -import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -35,11 +34,6 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): - warnings.warn( - "The function preprocess is deprecated and will be removed. Please" - " use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 3cf71f6551a0..684843517af6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -72,11 +71,6 @@ def preprocess(image): - warnings.warn( - "The function preprocess is deprecated and will be removed. Please" - " use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 2c5c07f19e5b..953df11aa4f7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -41,11 +40,6 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): - warnings.warn( - "The function preprocess is deprecated and will be removed. 
Please" - " use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 5494cd267478..b5a352c785ee 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -13,7 +13,6 @@ # limitations under the License. import inspect -import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Union @@ -172,11 +171,6 @@ class Pix2PixInversionPipelineOutput(BaseOutput): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): - warnings.warn( - "The function preprocess is deprecated and will be removed. Please" - " use VAEImageProcessor.preprocess instead.", - FutureWarning, - ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): From be5fcdc28729e522ce066a442556f2bfdf4d91be Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 19:55:42 +0000 Subject: [PATCH 39/45] remove comment --- .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 2 -- .../stable_diffusion/pipeline_stable_diffusion_img2img.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index c9d64b934ab0..05138c86f246 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -430,8 +430,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - # image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image def prepare_extra_step_kwargs(self, generator, eta): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 684843517af6..8b3a7944def1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -437,8 +437,6 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1) - # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - # image = image.cpu().permute(0, 2, 3, 1).float().numpy() return image # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs From f3a2676b53a5c21491b0b146408515b39b471cb7 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 14 Mar 2023 10:17:06 -1000 Subject: [PATCH 40/45] Apply suggestions from code review Co-authored-by: Pedro Cuenca --- src/diffusers/image_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 
74299cf2628d..0fbabe18eccf 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -108,7 +108,7 @@ def preprocess( image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], ) -> torch.Tensor: """ - Preprocess the image input, accpet formats in PIL images, numpy arrays or pytorch tensors" + Preprocess the image input, accepted formats are PIL images, numpy arrays or pytorch tensors """ supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) if isinstance(image, supported_formats): From 419cabba213a7312ac235e7f8eb0da66c19076aa Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Tue, 14 Mar 2023 20:20:22 +0000 Subject: [PATCH 41/45] update error message --- src/diffusers/image_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 0fbabe18eccf..de6543800b2d 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -140,7 +140,7 @@ def preprocess( _, _, height, width = image.shape if self.do_resize and (height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0): raise ValueError( - f"Currently we only support resizing for PIL images - please resize your numpy array to be divisible by {self.vae_scale_factor}; " + f"Currently we only support resizing for PIL images - please resize your pytorch tensor to be divisible by {self.vae_scale_factor}; " f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) From c844d2cfb26da3fcb3163aaba4975e2e3160cf88 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 15 Mar 2023 17:36:29 +0100 Subject: [PATCH 42/45] Update src/diffusers/__init__.py --- src/diffusers/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 4315669cc459..f480b4100907 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -32,7 +32,6 @@ except OptionalDependencyNotAvailable: from .utils.dummy_pt_objects import * # noqa F403 else: - from .image_processor import VaeImageProcessor from .models import ( AutoencoderKL, ControlNetModel, From bf513f117dc78b8654e98bb2124e001b6e94e135 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 15 Mar 2023 17:38:55 +0100 Subject: [PATCH 43/45] Apply suggestions from code review --- tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py | 2 +- .../pipelines/stable_diffusion/test_stable_diffusion_img2img.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index 10a3ef777b0e..939632943405 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -26,8 +26,8 @@ AutoencoderKL, PNDMScheduler, UNet2DConditionModel, - VaeImageProcessor, ) +from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( RobertaSeriesConfig, RobertaSeriesModelWithTransformation, ) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 5ab341b8a892..e27f83fc04fe 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -29,8 +29,8 @@ PNDMScheduler, 
StableDiffusionImg2ImgPipeline, UNet2DConditionModel, - VaeImageProcessor, ) +from diffusers.image_processor import VaeImageProcessor from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import require_torch_gpu, skip_mps From 3054135ec505fc4315f06c7547cf3acd91e73ee4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 15 Mar 2023 16:53:18 +0000 Subject: [PATCH 44/45] fix import --- tests/test_image_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_image_processor.py b/tests/test_image_processor.py index ce2483590668..4f0e2c5aecfd 100644 --- a/tests/test_image_processor.py +++ b/tests/test_image_processor.py @@ -19,7 +19,7 @@ import PIL import torch -from diffusers import VaeImageProcessor +from diffusers.image_processor import VaeImageProcessor class ImageProcessorTest(unittest.TestCase): From 89921a93ee8480995c88a6de86d7e53a1fa983b8 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Wed, 15 Mar 2023 16:53:48 +0000 Subject: [PATCH 45/45] fix copies --- src/diffusers/utils/dummy_pt_objects.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 1fcfb91d72e0..c731a1f1ddf3 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -2,21 +2,6 @@ from ..utils import DummyObject, requires_backends -class VaeImageProcessor(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - class AutoencoderKL(metaclass=DummyObject): _backends = ["torch"]
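
A quick sanity-check of the behavior this series converges on, as a minimal usage sketch. It assumes the PATCH 36-44 state of `VaeImageProcessor` (defaults `do_resize=True`, `vae_scale_factor=8`, `do_normalize=True`, import path `diffusers.image_processor` per PATCH 42-44); the merged API may differ:

import numpy as np
import torch
from PIL import Image

from diffusers.image_processor import VaeImageProcessor

# Defaults per the class docstring: do_resize=True, vae_scale_factor=8, do_normalize=True.
processor = VaeImageProcessor()

# PIL input: height and width are rounded down to multiples of vae_scale_factor,
# pixel values are scaled to [0, 1], then normalized to [-1, 1].
pil_image = Image.fromarray(np.zeros((37, 53, 3), dtype=np.uint8))
tensor = processor.preprocess(pil_image)
print(tensor.shape)  # torch.Size([1, 3, 32, 48]): 37 -> 32, 53 -> 48

# numpy and torch inputs are never resized; off-multiple sizes raise the
# ValueError introduced in PATCH 36 and reworded in PATCH 41.
try:
    processor.preprocess(torch.rand(1, 3, 37, 53))
except ValueError as err:
    print(err)

Restricting resize to PIL inputs is deliberate: arrays and tensors are assumed to be pipeline-ready, and silently interpolating them could mask caller bugs, hence the explicit error.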