Make img2img strength 1 behave the same as txt2img (#2895)

JPPhoto · web-flow · commit 2db180d90979 · 2023-03-08T22:50:16.000+01:00
* Fix img2img and inpainting code so a strength of 1 behaves the same as txt2img.

* Make generated images identical to their txt2img counterparts when strength is 1.
diff --git a/invokeai/backend/generator/base.py b/invokeai/backend/generator/base.py
@@ -99,6 +99,7 @@ def generate(
             h_symmetry_time_pct=h_symmetry_time_pct,
             v_symmetry_time_pct=v_symmetry_time_pct,
             attention_maps_callback=attention_maps_callback,
+            seed=seed,
             **kwargs,
         )
         results = []
@@ -289,9 +290,7 @@ def generate_initial_noise(self, seed, width, height):
             if self.variation_amount > 0:
                 random.seed()  # reset RNG to an actually random state, so we can get a random seed for variations
                 seed = random.randrange(0, np.iinfo(np.uint32).max)
-            return (seed, initial_noise)
-        else:
-            return (seed, None)
+        return (seed, initial_noise)
 
     # returns a tensor filled with random numbers from a normal distribution
     def get_noise(self, width, height):
diff --git a/invokeai/backend/generator/img2img.py b/invokeai/backend/generator/img2img.py
@@ -1,8 +1,10 @@
 """
 invokeai.backend.generator.img2img descends from .generator
 """
+from typing import Optional
 
 import torch
+from accelerate.utils import set_seed
 from diffusers import logging
 
 from ..stable_diffusion import (
@@ -35,6 +37,7 @@ def get_make_image(
         h_symmetry_time_pct=None,
         v_symmetry_time_pct=None,
         attention_maps_callback=None,
+        seed=None,
         **kwargs,
     ):
         """
@@ -65,6 +68,7 @@ def make_image(x_T):
             # FIXME: use x_T for initial seeded noise
             # We're not at the moment because the pipeline automatically resizes init_image if
             # necessary, which the x_T input might not match.
+            # In the meantime, pass the seed down to the pipeline so the RNG is re-seeded right before the noise is drawn; the same seed then at least reproduces the same result.
             logging.set_verbosity_error()  # quench safety check warnings
             pipeline_output = pipeline.img2img_from_embeddings(
                 init_image,
@@ -73,6 +77,7 @@ def make_image(x_T):
                 conditioning_data,
                 noise_func=self.get_noise_like,
                 callback=step_callback,
+                seed=seed
             )
             if (
                 pipeline_output.attention_map_saver is not None
@@ -83,7 +88,9 @@ def make_image(x_T):
 
         return make_image
 
-    def get_noise_like(self, like: torch.Tensor):
+    def get_noise_like(self, like: torch.Tensor, seed: Optional[int]):
+        if seed is not None:
+            set_seed(seed)
         device = like.device
         if device.type == "mps":
             x = torch.randn_like(like, device="cpu").to(device)
diff --git a/invokeai/backend/generator/inpaint.py b/invokeai/backend/generator/inpaint.py
@@ -223,6 +223,7 @@ def get_make_image(
         inpaint_height=None,
         inpaint_fill: tuple(int) = (0x7F, 0x7F, 0x7F, 0xFF),
         attention_maps_callback=None,
+        seed=None,
         **kwargs,
     ):
         """
@@ -319,6 +320,7 @@ def make_image(x_T):
                 conditioning_data=conditioning_data,
                 noise_func=self.get_noise_like,
                 callback=step_callback,
+                seed=seed
             )
 
             if (
diff --git a/invokeai/backend/stable_diffusion/diffusers_pipeline.py b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
@@ -690,6 +690,7 @@ def img2img_from_embeddings(
         callback: Callable[[PipelineIntermediateState], None] = None,
         run_id=None,
         noise_func=None,
+        seed=None,
     ) -> InvokeAIStableDiffusionPipelineOutput:
         if isinstance(init_image, PIL.Image.Image):
             init_image = image_resized_to_grid_as_tensor(init_image.convert("RGB"))
@@ -703,7 +704,7 @@ def img2img_from_embeddings(
             device=self._model_group.device_for(self.unet),
             dtype=self.unet.dtype,
         )
-        noise = noise_func(initial_latents)
+        noise = noise_func(initial_latents, seed)
 
         return self.img2img_from_latents_and_embeddings(
             initial_latents,
@@ -731,9 +732,11 @@ def img2img_from_latents_and_embeddings(
             device=self._model_group.device_for(self.unet),
         )
         result_latents, result_attention_maps = self.latents_from_embeddings(
-            initial_latents,
-            num_inference_steps,
-            conditioning_data,
+            latents=initial_latents if strength < 1.0 else torch.zeros_like(
+                initial_latents, device=initial_latents.device, dtype=initial_latents.dtype
+            ),
+            num_inference_steps=num_inference_steps,
+            conditioning_data=conditioning_data,
             timesteps=timesteps,
             noise=noise,
             run_id=run_id,
@@ -779,6 +782,7 @@ def inpaint_from_embeddings(
         callback: Callable[[PipelineIntermediateState], None] = None,
         run_id=None,
         noise_func=None,
+        seed=None,
     ) -> InvokeAIStableDiffusionPipelineOutput:
         device = self._model_group.device_for(self.unet)
         latents_dtype = self.unet.dtype
@@ -802,7 +806,7 @@ def inpaint_from_embeddings(
         init_image_latents = self.non_noised_latents_from_image(
             init_image, device=device, dtype=latents_dtype
         )
-        noise = noise_func(init_image_latents)
+        noise = noise_func(init_image_latents, seed)
 
         if mask.dim() == 3:
             mask = mask.unsqueeze(0)
@@ -831,9 +835,11 @@ def inpaint_from_embeddings(
 
         try:
             result_latents, result_attention_maps = self.latents_from_embeddings(
-                init_image_latents,
-                num_inference_steps,
-                conditioning_data,
+                latents=init_image_latents if strength < 1.0 else torch.zeros_like(
+                    init_image_latents, device=init_image_latents.device, dtype=init_image_latents.dtype
+                ),
+                num_inference_steps=num_inference_steps,
+                conditioning_data=conditioning_data,
                 noise=noise,
                 timesteps=timesteps,
                 additional_guidance=guidance,