
Commit 5a47af1

pcuenca and anton-l authored
mps changes for PyTorch 1.13 (huggingface#926)
* Docs: refer to pre-RC version of PyTorch 1.13.0.
* Remove temporary workaround for unavailable op.
* Update comment to make it less ambiguous.
* Remove use of contiguous in mps. It appears to no longer be necessary.
* Special case: use einsum for much better performance in mps
* Update mps docs.
* Minor doc update.
* Accept suggestion

Co-authored-by: Anton Lozhkov <[email protected]>
1 parent 6ad4e1e commit 5a47af1

3 files changed: 26 additions (+), 12 deletions (−)

models/attention.py

Lines changed: 25 additions & 7 deletions

@@ -207,7 +207,6 @@ def _set_attention_slice(self, slice_size):
         self.attn2._slice_size = slice_size
 
     def forward(self, hidden_states, context=None):
-        hidden_states = hidden_states.contiguous() if hidden_states.device.type == "mps" else hidden_states
         hidden_states = self.attn1(self.norm1(hidden_states)) + hidden_states
         hidden_states = self.attn2(self.norm2(hidden_states), context=context) + hidden_states
         hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
@@ -288,10 +287,19 @@ def forward(self, hidden_states, context=None, mask=None):
 
     def _attention(self, query, key, value):
         # TODO: use baddbmm for better performance
-        attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
+        if query.device.type == "mps":
+            # Better performance on mps (~20-25%)
+            attention_scores = torch.einsum("b i d, b j d -> b i j", query, key) * self.scale
+        else:
+            attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
         attention_probs = attention_scores.softmax(dim=-1)
         # compute attention output
-        hidden_states = torch.matmul(attention_probs, value)
+
+        if query.device.type == "mps":
+            hidden_states = torch.einsum("b i j, b j d -> b i d", attention_probs, value)
+        else:
+            hidden_states = torch.matmul(attention_probs, value)
+
         # reshape hidden_states
         hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
         return hidden_states
@@ -305,11 +313,21 @@ def _sliced_attention(self, query, key, value, sequence_length, dim):
         for i in range(hidden_states.shape[0] // slice_size):
             start_idx = i * slice_size
             end_idx = (i + 1) * slice_size
-            attn_slice = (
-                torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
-            )  # TODO: use baddbmm for better performance
+            if query.device.type == "mps":
+                # Better performance on mps (~20-25%)
+                attn_slice = (
+                    torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx])
+                    * self.scale
+                )
+            else:
+                attn_slice = (
+                    torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
+                )  # TODO: use baddbmm for better performance
             attn_slice = attn_slice.softmax(dim=-1)
-            attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
+            if query.device.type == "mps":
+                attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])
+            else:
+                attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
 
             hidden_states[start_idx:end_idx] = attn_slice
 
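The einsum calls introduced above are drop-in equivalents of the matmul expressions they replace; the mps branch exists only because, per the inline comments, the einsum form benchmarks roughly 20-25% faster on that backend. Below is a minimal standalone sketch, not part of the commit, that checks the equivalence with purely illustrative tensor shapes:

import torch

# (batch * heads, seq_len, head_dim); shapes chosen only for illustration
q = torch.randn(2, 4, 8)
k = torch.randn(2, 6, 8)
v = torch.randn(2, 6, 8)
scale = 8 ** -0.5

# Attention scores: einsum over the shared head dimension vs. matmul with a transpose
scores_einsum = torch.einsum("b i d, b j d -> b i j", q, k) * scale
scores_matmul = torch.matmul(q, k.transpose(-1, -2)) * scale
assert torch.allclose(scores_einsum, scores_matmul, atol=1e-5)

probs = scores_einsum.softmax(dim=-1)

# Weighted sum over values: the same equivalence in the other direction
out_einsum = torch.einsum("b i j, b j d -> b i d", probs, v)
out_matmul = torch.matmul(probs, v)
assert torch.allclose(out_einsum, out_matmul, atol=1e-5)
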
models/resnet.py

Lines changed: 0 additions & 4 deletions

@@ -492,10 +492,6 @@ def upfirdn2d_native(tensor, kernel, up=1, down=1, pad=(0, 0)):
     kernel_h, kernel_w = kernel.shape
 
     out = tensor.view(-1, in_h, 1, in_w, 1, minor)
-
-    # Temporary workaround for mps specific issue: https://github.com/pytorch/pytorch/issues/84535
-    if tensor.device.type == "mps":
-        out = out.to("cpu")
     out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
     out = out.view(-1, in_h * up_y, in_w * up_x, minor)

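The deleted lines had routed the 6-D intermediate through the CPU before padding because the op was unavailable on mps (pytorch/pytorch#84535); the commit assumes PyTorch 1.13 runs it natively. A minimal standalone sketch of that pad pattern, not part of the commit, with purely illustrative shapes (up_x = up_y = 2, minor = 3):

import torch
import torch.nn.functional as F

if torch.backends.mps.is_available():
    # 6-D view analogous to out = tensor.view(-1, in_h, 1, in_w, 1, minor)
    out = torch.randn(1, 8, 1, 8, 1, 3, device="mps")
    # Pad the two singleton upsampling axes by up_x - 1 = up_y - 1 = 1
    out = F.pad(out, [0, 0, 0, 1, 0, 0, 0, 1])
    print(out.shape)  # expected: torch.Size([1, 8, 2, 8, 2, 3])
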
pipelines/stable_diffusion/pipeline_stable_diffusion.py

Lines changed: 1 addition & 1 deletion

@@ -287,7 +287,7 @@ def __call__(
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
-                # randn does not exist on mps
+                # randn does not work reproducibly on mps
                 latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
                     self.device
                 )

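The reworded comment reflects that torch.randn is available on mps but seeded sampling there is not reproducible, so the pipeline keeps drawing the initial latents on the CPU with the user's generator and then moves them to the device. A minimal standalone sketch of that pattern, not part of the commit; the shape, dtype, and seed are illustrative:

import torch

latents_shape = (1, 4, 64, 64)  # illustrative latent shape
generator = torch.Generator(device="cpu").manual_seed(0)

# Sample on the CPU with a seeded generator, then move the result to mps
latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=torch.float32)
if torch.backends.mps.is_available():
    latents = latents.to("mps")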