
Commit e4b98b1

ysiraichi authored and pytorchmergebot committed
Add meta functions for lerp, addcmul, and addcdiv. (pytorch#136909)

This PR adds new meta functions for `lerp`, `addcmul`, and `addcdiv` (including their respective in-place versions). These functions previously had only refs implementations, which was the root cause of significant overhead ([issue][1]) when running the `AdamW` optimizer step on the PyTorch/XLA backend. Using the meta functions resulted in the following improvements:

- `lerp` calls: 1,550ms to 140ms (10x)
- `addcdiv` calls: 640ms to 350ms (1.8x)
- `addcmul` calls: 620ms to 300ms (2.05x)

[1]: https://github.com/pytorch/xla/issues/7923

Pull Request resolved: pytorch#136909
Approved by: https://github.com/jansel
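For context, a meta function computes only the output's metadata (shape, dtype, device) and never touches data, which is what lets a tracing backend like PyTorch/XLA skip the more expensive refs decomposition. A minimal sketch of what the new `lerp` registration enables, using only the public API (the shapes and dtypes here are illustrative):

```python
import torch

# Meta tensors carry shape/dtype/device metadata but allocate no storage.
start = torch.empty(4, 1, device="meta", dtype=torch.float32)
end = torch.empty(1, 8, device="meta", dtype=torch.float32)

# With a meta kernel registered for aten.lerp, a single call infers the
# broadcast output (4, 8) without computing or allocating anything.
out = torch.lerp(start, end, 0.5)
print(out.shape, out.device)  # torch.Size([4, 8]) meta
```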
1 parent a1f1f58 · commit e4b98b1

File tree: 1 file changed (+71, −0 lines)


torch/_meta_registrations.py

Lines changed: 71 additions & 0 deletions
@@ -2,6 +2,7 @@
 # mypy: allow-untyped-defs
 import math
 from enum import Enum
+from functools import wraps
 from typing import List, Optional, Sequence, Tuple, Union
 
 import torch
@@ -6451,6 +6452,76 @@ def _f(x, y):
 _create_binary_float_meta_func(aten.special_legendre_polynomial_p)
 
 
+def _register_inplace_meta(fn):
+    @wraps(fn)
+    def _fn(self, *args, **kwargs):
+        out = fn(self, *args, **kwargs)
+        check_inplace_broadcast(self.shape, out.shape)
+        return self
+
+    inplace_name = f"{fn.__name__}_"
+    _fn.__name__ = inplace_name
+    _fn = register_meta(getattr(aten, inplace_name))(_fn)  # type: ignore[assignment]
+
+    return _fn
+
+
+@register_meta(aten.lerp)
+@out_wrapper()
+def lerp(start, end, weight):
+    torch._check(
+        start.dtype == end.dtype,
+        lambda: f"expected dtype {start.dtype} for `end`, but got dtype {end.dtype}",
+    )
+    args = [start, end]
+    if isinstance(weight, TensorLike):
+        torch._check(
+            start.dtype == weight.dtype,
+            lambda: f"expected dtype {start.dtype} for `weight`, but got dtype {weight.dtype}",
+        )
+        args.append(weight)
+    return elementwise_meta(
+        *args, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+    )
+
+
+@register_meta(aten.addcmul)
+@out_wrapper()
+def addcmul(input, tensor1, tensor2, *, value=1):
+    return elementwise_meta(
+        input, tensor1, tensor2, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+    )
+
+
+@register_meta(aten.addcdiv)
+@out_wrapper()
+def addcdiv(input, tensor1, tensor2, *, value=1):
+    torch._check(
+        not (
+            utils.is_integer_dtype(tensor1.dtype)
+            and utils.is_integer_dtype(tensor2.dtype)
+        ),
+        lambda: (
+            "Integer division with addcdiv is no longer supported, and in a future ",
+            "release addcdiv will perform a true division of tensor1 and tensor2. ",
+            "The historic addcdiv behavior can be implemented as ",
+            "(input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) ",
+            "for integer inputs and as ",
+            "(input + value * tensor1 / tensor2) for float inputs. ",
+            "The future addcdiv behavior is just the latter implementation: ",
+            "(input + value * tensor1 / tensor2), for all dtypes.",
+        ),
+    )
+    return elementwise_meta(
+        input, tensor1, tensor2, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+    )
+
+
+lerp_ = _register_inplace_meta(aten.lerp)
+addcmul_ = _register_inplace_meta(aten.addcmul)
+addcdiv_ = _register_inplace_meta(aten.addcdiv)
+
+
 # We must also trigger meta registrations from PrimTorch ref
 # decompositions
 import torch._refs
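One way to sanity-check these registrations from the public API is to run the ops on meta tensors. A hedged sketch (the shapes and dtypes are illustrative; the error text is the one produced by the `torch._check` above):

```python
import torch

inp = torch.empty(2, 3, device="meta")
t1 = torch.empty(2, 3, device="meta")
t2 = torch.empty(3, device="meta")

# addcmul/addcdiv on meta tensors are metadata-only: both broadcast to (2, 3).
assert torch.addcmul(inp, t1, t2, value=2.0).shape == (2, 3)
assert torch.addcdiv(inp, t1, t2).shape == (2, 3)

# addcdiv rejects integer-by-integer division, per the torch._check above.
ints = torch.empty(2, 3, device="meta", dtype=torch.int64)
try:
    torch.addcdiv(inp, ints, ints)
except RuntimeError:
    pass  # "Integer division with addcdiv is no longer supported..."

# The in-place variants return `self` after check_inplace_broadcast has
# verified that the broadcast result still fits `self`'s shape.
assert inp.lerp_(t1, 0.5) is inp
```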
