diffusers#7426 fix stable diffusion xl inference on MPS when dtypes shift unexpectedly due to pytorch bugs

bghira · bghira · commit ec679860d535 · 2024-03-23T15:26:33.000-06:00
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -349,7 +349,7 @@ def encode_prompt(
             [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
         )
 
-        if prompt_embeds is None:
+        if prompt_embeds is None: 
             prompt_2 = prompt_2 or prompt
             prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
 
@@ -1193,7 +1193,11 @@ def __call__(
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
+                old_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != old_dtype:
+                    # some platforms (e.g. Apple MPS) misbehave due to a PyTorch bug; this is a workaround
+                    latents = latents.to(old_dtype)
 
                 if callback_on_step_end is not None:
                     callback_kwargs = {}
@@ -1228,6 +1232,9 @@ def __call__(
             if needs_upcasting:
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif torch.backends.mps.is_available() and latents.dtype != self.vae.dtype:
+                # some platforms (e.g. Apple MPS) misbehave due to a PyTorch bug; this is a workaround
+                self.vae = self.vae.to(latents.dtype)
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -1370,7 +1370,11 @@ def denoising_value_valid(dnv):
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
+                old_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != old_dtype:
+                    # some platforms (e.g. Apple MPS) misbehave due to a PyTorch bug; this is a workaround
+                    latents = latents.to(old_dtype)
 
                 if callback_on_step_end is not None:
                     callback_kwargs = {}
@@ -1405,6 +1409,9 @@ def denoising_value_valid(dnv):
             if needs_upcasting:
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif torch.backends.mps.is_available() and latents.dtype != self.vae.dtype:
+                # some platforms (e.g. Apple MPS) misbehave due to a PyTorch bug; this is a workaround
+                self.vae = self.vae.to(latents.dtype)
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
@@ -1720,7 +1720,11 @@ def denoising_value_valid(dnv):
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
+                old_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != old_dtype:
+                    # some platforms (e.g. Apple MPS) misbehave due to a PyTorch bug; this is a workaround
+                    latents = latents.to(old_dtype)
 
                 if num_channels_unet == 4:
                     init_latents_proper = image_latents
@@ -1772,6 +1776,9 @@ def denoising_value_valid(dnv):
             if needs_upcasting:
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif torch.backends.mps.is_available() and latents.dtype != self.vae.dtype:
+                # some platforms (e.g. Apple MPS) misbehave due to a PyTorch bug; this is a workaround
+                self.vae = self.vae.to(latents.dtype)
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
@@ -918,7 +918,11 @@ def __call__(
                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
+                old_dtype = latents.dtype
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if latents.dtype != old_dtype:
+                    # some platforms (e.g. Apple MPS) misbehave due to a PyTorch bug; this is a workaround
+                    latents = latents.to(old_dtype)
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -937,6 +941,9 @@ def __call__(
             if needs_upcasting:
                 self.upcast_vae()
                 latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
+            elif torch.backends.mps.is_available() and latents.dtype != self.vae.dtype:
+                # some platforms (e.g. Apple MPS) misbehave due to a PyTorch bug; this is a workaround
+                self.vae = self.vae.to(latents.dtype)
 
             # unscale/denormalize the latents
             # denormalize with the mean and std if available and not None