Commit 8bee6d6

internal developer aostrowski-hbn authored and committed
Return unsafe_view instead of view from matmul when folding occurs
When tensor folding occurs during a matmul operation, the returned tensor is a view. This can cause issues when matmul is used inside a custom function and such a view is then returned as output: it cannot be modified in place, which causes errors. This is especially problematic when an in-place allreduce is performed after such a function. The issue is resolved by returning unsafe_view from matmul instead. This aligns the matmul decomposition with the eager implementation so that a non-view tensor is returned.

Pull request opened to pytorch: pytorch#134568
Change-Id: I77484ff6f22d3e290352348b1acbffa267eb063b
1 parent d9ba83d commit 8bee6d6
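For context, here is a minimal sketch (not part of the commit) of the eager behaviour the commit message refers to. It uses the folded 3D @ 2D case; the expectation that the result is a non-view tensor follows from the description above and should be treated as an assumption rather than a guarantee across versions.

import torch

# 3D @ 2D triggers the tensor-folding path inside matmul.
inp1 = torch.randn(4, 1, 2, requires_grad=True)
inp2 = torch.randn(2, 4)

out = torch.matmul(inp1, inp2)
# In eager mode the folded result is materialized as a non-view tensor,
# so in-place updates on the output are allowed.
print(out._is_view())  # expected: False
out.add_(1.0)          # no view/in-place autograd error here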

File tree

2 files changed: +61 -2 lines changed
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import torch

from torch.testing._internal.common_utils import TestCase, run_tests

class TestCustomFunction(TestCase):
    def test_autograd_function_with_matmul_folding_at_output(self):
        """
        When tensor folding occurs during a matmul operation, the returned tensor is a view.
        This can cause issues when matmul is used inside a custom function
        and such a view is then returned as output: it cannot be modified in place,
        which causes errors.
        This is especially problematic when an in-place allreduce is performed
        after such a function. This test recreates that behaviour.
        The issue is resolved when unsafe_view is returned from matmul instead.
        """

        class CustomFunction(torch.autograd.Function):

            @staticmethod
            def forward(ctx, inp1, inp2) -> torch.Tensor:
                ctx.save_for_backward(inp2)
                ctx.output_shape = inp1.size()
                return torch.matmul(inp1, inp2)

            @staticmethod
            def backward(ctx, grad_output) -> tuple[torch.Tensor, None]:
                output_shape = ctx.output_shape
                inp2, = ctx.saved_tensors
                return torch.mm(grad_output.squeeze(), inp2.t()).view(output_shape), None

        def outer_function(inp1, inp2) -> torch.Tensor:
            res = CustomFunction.apply(inp1, inp2)
            res.add_(1.0)
            return res.sum()

        def usual_function(inp1, inp2) -> torch.Tensor:
            res = torch.matmul(inp1, inp2)
            res.add_(1.0)
            return res.sum()

        inp1_custom = torch.randn(4, 1, 2, requires_grad=True)
        inp1_usual = inp1_custom.detach().clone().requires_grad_(True)

        inp2 = torch.randn(2, 4)
        c_custom_func = torch.compile(outer_function)
        c_usual_func = torch.compile(usual_function)

        result_custom = c_custom_func(inp1_custom, inp2)
        result_custom.backward()
        result_usual = c_usual_func(inp1_usual, inp2)
        result_usual.backward()

        self.assertTrue(torch.allclose(inp1_custom.grad, inp1_usual.grad))


if __name__ == "__main__":
    run_tests()

torch/_decomp/decompositions.py

Lines changed: 2 additions & 2 deletions
@@ -4361,10 +4361,10 @@ def matmul(tensor1, tensor2, *, is_out=False):
        if t2_is_matrix:
            # This copies if we perform a 2D @ 3D and the first tensor requires_grad
            # See should_fold native/LinearAlgebra.cpp for why.
-           output = t1_folded.mm(t2).view(output_shape)
+           output = torch.ops.aten._unsafe_view(t1_folded.mm(t2), output_shape)
            return output.mT.contiguous() if transpose else output
        else:
-           return t1_folded.mv(t2).view(output_shape)
+           return torch.ops.aten._unsafe_view(t1_folded.mv(t2), output_shape)

    elif dim_tensor1 >= 1 and dim_tensor2 >= 1:
        # We are multiplying b1 x n x m1 by x2 x m2 x p (where b1 can be a list);
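For reference, a minimal sketch (not part of the commit) of what the two-line change above does, assuming torch.ops.aten._unsafe_view keeps its current semantics: the result has the requested shape and shares storage with its input, but is not recorded as an autograd view, so a later in-place op on a custom Function's output no longer trips the view restriction.

import torch

t1_folded = torch.randn(4, 2)   # stands in for the folded lhs in the decomposition
t2 = torch.randn(2, 4)
output_shape = (4, 1, 4)

viewed = t1_folded.mm(t2).view(output_shape)                          # old behaviour
unsafe = torch.ops.aten._unsafe_view(t1_folded.mm(t2), output_shape)  # new behaviour

print(viewed._is_view())  # True: tracked as a view of the mm result
print(unsafe._is_view())  # expected False: same data and shape, not tracked as a view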
