
Commit b4c0fd7

Add vectorization in elementwise_util (not working yet)

This works with op_mul, which is vectorization-friendly, but it does not work when rolled out to pattern.h, because those ops will not work with Vectorized yet. See the TODO in elementwise_util.h.

ghstack-source-id: 66f9d9d
ghstack-comment-id: 2738665976
Pull Request resolved: #9432
1 parent fa7fb38 commit b4c0fd7

25 files changed, +371 -41 lines
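The commit message's distinction is easiest to see side by side: a generic lambda built only from operators that at::vec::Vectorized also defines compiles for both scalar and vector arguments, while one that calls a scalar-only function does not. A minimal sketch (not code from this commit), assuming ATen headers are available:

```cpp
// Minimal sketch: op_mul's generic lambda works for both a plain float
// and at::vec::Vectorized<float>, because operator* is defined for
// both types. A lambda that calls a scalar-only function such as
// std::atan2 fails to instantiate for Vectorized, so it cannot take
// the vectorized fast path.
#include <ATen/cpu/vec/vec.h>

int main() {
  auto mul_fn = [](const auto a, const auto b) { return a * b; };

  float scalar_result = mul_fn(2.0f, 3.0f); // scalar instantiation

  using Vec = at::vec::Vectorized<float>;
  Vec vec_result = mul_fn(Vec(2.0f), Vec(3.0f)); // vector instantiation
  (void)scalar_result;
  (void)vec_result;
}
```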

.lintrunner.toml

Lines changed: 2 additions & 0 deletions
@@ -272,6 +272,8 @@ exclude_patterns = [
     'exir/verification/bindings.cpp',
     'extension/**',
     # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
+    'kernels/portable/cpu/util/elementwise_util.h',
+    'kernels/portable/cpu/util/math_util.h',
     'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',

kernels/portable/cpu/op_add.cpp

Lines changed: 8 additions & 4 deletions
@@ -102,14 +102,18 @@ Tensor& add_scalar_out(
   static constexpr const char op_name[] = "add.Scalar_out";

   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
+    CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
+    auto val_alpha_times_b = val_alpha * val_b;
     utils::apply_unitensor_elementwise_fn<
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [b, alpha](const auto val_a) {
-          CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
-          CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-          return val_a + val_alpha * val_b;
+        [val_alpha_times_b](const auto val_a) {
+          // Cast here supports vectorization; either it does nothing
+          // or it casts from CTYPE_COMPUTE to
+          // Vectorized<CTYPE_COMPUTE>.
+          return val_a + decltype(val_a)(val_alpha_times_b);
         },
         ctx,
         a,
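The decltype cast above is the heart of the pattern: decltype(val_a) names whichever type the lambda was instantiated with. A standalone sketch of the trick, with a hypothetical helper name:

```cpp
// Standalone sketch of the decltype cast used in add.Scalar above.
// When T is float, decltype(val_a)(scalar) is a no-op conversion; when
// T is at::vec::Vectorized<float>, it broadcasts the scalar into every
// lane so operator+ sees matching operand types. add_scalar_sketch is
// a hypothetical helper, not part of the commit.
#include <ATen/cpu/vec/vec.h>

template <typename T>
T add_scalar_sketch(T val_a, float scalar) {
  return val_a + decltype(val_a)(scalar);
}

int main() {
  using Vec = at::vec::Vectorized<float>;
  float s = add_scalar_sketch(1.0f, 2.0f); // cast is a no-op
  Vec v = add_scalar_sketch(Vec(1.0f), 2.0f); // cast broadcasts 2.0f
  (void)s;
  (void)v;
}
```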

kernels/portable/cpu/op_atan2.cpp

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ Tensor& atan2_out(
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
       [](const auto val_a, const auto val_b) {
-        return std::atan2(val_a, val_b);
+        return executorch::math::atan2(val_a, val_b);
       },
       ctx,
       a,
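This commit swaps std::atan2 for executorch::math::atan2 from vectorized_math.h, whose contents are not shown in this diff. A plausible shape for such a wrapper, offered purely as a hypothetical illustration, is an overload set that forwards scalars to std:: and Vectorized values to the at::vec member function:

```cpp
// Hypothetical sketch of a dual scalar/vector math wrapper; the real
// executorch::math implementation lives in vectorized_math.h and may
// differ.
#include <cmath>
#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>
#endif

namespace sketch_math { // hypothetical namespace, not the real one

template <typename T>
T atan2(T a, T b) {
  return std::atan2(a, b); // scalar path
}

#ifdef ET_USE_PYTORCH_HEADERS
template <typename T>
at::vec::Vectorized<T> atan2(
    at::vec::Vectorized<T> a, at::vec::Vectorized<T> b) {
  return a.atan2(b); // lane-wise path
}
#endif

} // namespace sketch_math
```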

kernels/portable/cpu/op_clamp.cpp

Lines changed: 2 additions & 3 deletions
@@ -138,9 +138,8 @@ Tensor& clamp_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) {
-          // TODO: rewrite this to be vectorization-capable.
-          CTYPE_COMPUTE val_out = val_in;
+        [has_min, min_opt, has_max, max_opt](const auto val_in) {
+          auto val_out = val_in;
           if (has_min) {
             val_out = utils::max_override(
                 val_out, utils::scalar_to<CTYPE_COMPUTE>(min_opt.value()));
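clamp's lambda stays generic, so utils::max_override and utils::min_override must themselves accept Vectorized arguments; that is presumably why math_util.h joins the lint exclusion list above for its gated ATen include. A hypothetical sketch of what such an overload could look like:

```cpp
// Hypothetical Vectorized overload for a max_override-style helper;
// the real one is in math_util.h (gated on ET_USE_PYTORCH_HEADERS)
// and may differ.
#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>

template <typename T>
at::vec::Vectorized<T> max_override_sketch(
    at::vec::Vectorized<T> a, at::vec::Vectorized<T> b) {
  return at::vec::maximum(a, b); // lane-wise max
}
#endif // ET_USE_PYTORCH_HEADERS
```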

kernels/portable/cpu/op_elu.cpp

Lines changed: 1 addition & 2 deletions
@@ -48,8 +48,7 @@ Tensor& elu_out(
         CTYPE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [negcoef, math_scale, math_input_scale](const auto x) {
-          // TODO: rewrite this to be vectorization-capable.
+        [negcoef, math_scale, math_input_scale](const CTYPE x) {
           return MathT(x) <= MathT(0)
               ? std::expm1(MathT(x) * math_input_scale) * negcoef
               : MathT(x) * math_scale;

kernels/portable/cpu/op_fmod.cpp

Lines changed: 3 additions & 5 deletions
@@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out(
       utils::SupportedTensorDtypes::REALHBF16>(
       [&div_by_zero_error](
           const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
+        // TODO: rewrite this to be vectorization-capable?
         CTYPE_COMPUTE value = 0;
         if (is_integral_type<CTYPE_COMPUTE, /*includeBool=*/true>::value) {
           if (val_b == 0) {
@@ -138,10 +138,8 @@ Tensor& fmod_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        [val_b](const CTYPE_COMPUTE val_a) {
-          // TODO: rewrite this to be vectorization-capable.
-          CTYPE_COMPUTE value = std::fmod(val_a, val_b);
-          return value;
+        [val_b](const auto val_a) {
+          return executorch::math::fmod(val_a, (decltype(val_a))val_b);
         },
         ctx,
         a,

kernels/portable/cpu/op_maximum.cpp

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ Tensor& maximum_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+        [](const auto val_a, const auto val_b) {
           return utils::max_override(val_a, val_b);
         },
         ctx,

kernels/portable/cpu/op_minimum.cpp

Lines changed: 1 addition & 2 deletions
@@ -49,8 +49,7 @@ Tensor& minimum_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-          // TODO: rewrite this to be vectorization-capable.
+        [](const auto val_a, const auto val_b) {
           return utils::min_override(val_a, val_b);
         },
         ctx,

kernels/portable/cpu/op_mul.cpp

Lines changed: 1 addition & 3 deletions
@@ -72,9 +72,7 @@ Tensor& mul_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-          return val_a * val_b;
-        },
+        [](const auto val_a, const auto val_b) { return val_a * val_b; },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/op_native_dropout.cpp

Lines changed: 6 additions & 4 deletions
@@ -57,8 +57,11 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
   }
   ET_SWITCH_FLOATHBF16_TYPES(
       input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() {
-        utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-            [](const auto val, const auto mask_val) {
+        utils::apply_bitensor_elementwise_fn<
+            CTYPE_COMPUTE,
+            op_name,
+            utils::SupportedTensorDtypes::SAME_AS_COMMON>(
+            [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) {
               if (!mask_val) {
                 return static_cast<decltype(val)>(0);
               }
@@ -70,8 +73,7 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
             mask,
             // TODO: should really be just BOOL
             utils::SupportedTensorDtypes::BOOL_OR_BYTE,
-            out,
-            utils::SupportedTensorDtypes::SAME_AS_COMMON);
+            out);
       });
   } else if (input.numel() > 0) {
     std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes());

kernels/portable/cpu/op_pow.cpp

Lines changed: 16 additions & 7 deletions
@@ -57,9 +57,8 @@ Tensor& pow_Tensor_Tensor_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
-        return std::pow(val_a, val_b);
+      [](const auto val_a, const auto val_b) {
+        return executorch::math::pow(val_a, val_b);
       },
       ctx,
       a,
@@ -111,8 +110,13 @@ Tensor& pow_Tensor_Scalar_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      // TODO: rewrite this to be vectorization-capable.
-      [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); },
+      // Casting val_b here supports vectorization; it does
+      // nothing if we are not vectorizing (casts to
+      // CTYPE_COMPUTE) and casts to a vectorized type
+      // otherwise.
+      [val_b](const auto val_a) {
+        return executorch::math::pow(val_a, decltype(val_a)(val_b));
+      },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,
@@ -161,8 +165,13 @@ Tensor& pow_Scalar_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      // TODO: rewrite this to be vectorization-capable.
-      [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); },
+      // Casting val_a here supports vectorization; it does
+      // nothing if we are not vectorizing (casts to
+      // CTYPE_COMPUTE) and casts to a vectorized type
+      // otherwise.
+      [val_a](const auto val_b) {
+        return executorch::math::pow(decltype(val_b)(val_a), val_b);
+      },
       ctx,
       b,
       utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/op_sigmoid.cpp

Lines changed: 3 additions & 4 deletions
@@ -49,10 +49,9 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
-      [](const auto val_in) -> CTYPE_COMPUTE {
-        // TODO: rewrite this to be vectorization-capable
-        CTYPE_COMPUTE out_val = static_cast<CTYPE_COMPUTE>(1.0) /
-            (static_cast<CTYPE_COMPUTE>(1.0) + exp(-val_in));
+      [](const auto val_in) {
+        const auto one = static_cast<decltype(val_in)>(1.0);
+        auto out_val = one / (one + executorch::math::exp(-val_in));
         return out_val;
       },
       ctx,

kernels/portable/cpu/op_where.cpp

Lines changed: 3 additions & 3 deletions
@@ -47,9 +47,9 @@ Tensor& where_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-      [](const auto val_a, const auto val_b, const auto val_c) {
-        return val_c ? val_a : val_b;
-      },
+      [](const CTYPE_COMPUTE val_a,
+         const CTYPE_COMPUTE val_b,
+         const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/util/elementwise_util.h

Lines changed: 138 additions & 1 deletion
@@ -12,9 +12,14 @@
 #include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/vectorized_math.h> // Make vectorization support easy for clients.
 #include <executorch/runtime/kernel/kernel_runtime_context.h>
 #include <executorch/runtime/kernel/thread_parallel_interface.h>

+#ifdef ET_USE_PYTORCH_HEADERS
+#include <ATen/cpu/vec/vec.h>
+#endif // ET_USE_PYTORCH_HEADERS
+
 #include <array>
 #include <utility>

@@ -51,6 +56,38 @@ inline int64_t scalar_to<int64_t>(const Scalar& s) {
 }

 namespace internal {
+template <typename Ignore, typename T>
+using ignore_first_yield_second = T;
+
+#ifdef ET_USE_PYTORCH_HEADERS
+// Can I call a function of type Op with sizeof...(Args) arguments of type
+// at::vec::Vectorized<CTYPE_COMPUTE>?
+//
+// See [NOTE: Generic lambdas] below for requirements on Op.
+template <typename CTYPE_COMPUTE, typename Op, typename... Args>
+constexpr bool can_use_vectorized() {
+  using Vec = at::vec::Vectorized<CTYPE_COMPUTE>;
+  // NOTE: if we start building optimized kernels on platforms that
+  // ATen Vectorized doesn't support well, we will want to add a way
+  // to check that Vectorized actually does something on our target
+  // platform. For now, I see no concrete need for that.
+  if constexpr (std::is_invocable_v<
+                    Op,
+                    ignore_first_yield_second<Args, Vec>...>) {
+    // For bool, we will get a false positive if we rely on only the
+    // is_invocable_v check above because at::vec::Vectorized is
+    // implicitly convertible to a pointer, which makes it implicitly
+    // convertible to bool (which was 15 minutes of fun to debug). Also
+    // just seems like good hygiene to make sure we get the Vectorized
+    // we're expecting.
+    return std::is_same_v<
+        std::invoke_result_t<Op, ignore_first_yield_second<Args, Vec>...>,
+        Vec>;
+  }
+  return false;
+}
+#endif // ET_USE_PYTORCH_HEADERS
+
 template <
     typename CTYPE_COMPUTE,
     typename CTYPE_OUT,
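To make the two branches of this check concrete, here is a sketch of how can_use_vectorized classifies two lambdas, assuming ET_USE_PYTORCH_HEADERS is defined and the definitions in the hunk above are in scope; the placeholder int arguments stand in for the real tensor-pair Args, which only set the arity:

```cpp
// Sketch only: assumes can_use_vectorized (defined above) is in scope.
auto generic_mul = [](const auto a, const auto b) { return a * b; };
auto scalar_only = [](const float a, const float b) { return a * b; };

// operator* on Vectorized<float> returns Vectorized<float>: accepted.
static_assert(can_use_vectorized<float, decltype(generic_mul), int, int>());
// Not invocable with Vectorized<float> arguments: rejected, so the
// kernel falls back to the scalar path.
static_assert(!can_use_vectorized<float, decltype(scalar_only), int, int>());
```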
@@ -61,8 +98,90 @@ inline void dtype_specialized_elementwise_fn_impl(
     KernelRuntimeContext& ctx,
     const Tensor& out,
     Args... inputs) {
+  static_assert(
+      (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
+       ...));
   constexpr auto kNumInputs = sizeof...(inputs);
-  ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMPUTE)) && ...));
+  // All inputs must be of type CTYPE_COMPUTE.
+  ET_DCHECK(
+      ((inputs.first->scalar_type() ==
+        CppTypeToScalarType<CTYPE_COMPUTE>::value) &&
+       ...));
+
+#ifdef ET_USE_PYTORCH_HEADERS
+  if constexpr (can_use_vectorized<CTYPE_COMPUTE, Op, Args...>()) {
+    const bool any_is_broadcasted =
+        !(torch::executor::internal::sizes_match_ignoring_leading_1s(
+              inputs.first->sizes(), out.sizes()) &&
+          ...);
+    if (!any_is_broadcasted) {
+      using Vec = at::vec::Vectorized<CTYPE_COMPUTE>;
+      ::executorch::extension::parallel_for(
+          0,
+          out.numel(),
+          ::executorch::extension::internal::GRAIN_SIZE,
+          [&](const auto begin, const auto end) {
+            std::array<const CTYPE_COMPUTE*, kNumInputs> inputs_data_ptrs = {
+                inputs.first->template const_data_ptr<CTYPE_COMPUTE>()...};
+
+            CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
+
+            const auto vectorized_begin =
+                begin + (Vec::size() - begin % Vec::size()) % Vec::size();
+            const auto vectorized_end = end - (end % Vec::size());
+            // Scalar prologue.
+            for (const auto idx : c10::irange(begin, vectorized_begin)) {
+              // In debug mode, always use Vectorized so that even
+              // small-sized tests will test whether using Vectorized broke our
+              // lambda.
+#ifndef NDEBUG
+              std::array<Vec, kNumInputs> loaded_inputs;
+#else // NDEBUG
+              std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
+#endif // NDEBUG
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+              }
+#ifndef NDEBUG
+              std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1);
+#else // NDEBUG
+              data_out[idx] = std::apply(compute_fun, loaded_inputs);
+#endif // NDEBUG
+            }
+
+            // Main vectorized loop.
+            for (auto idx = vectorized_begin; idx < vectorized_end;
+                 idx += Vec::size()) {
+              std::array<Vec, kNumInputs> loaded_vec_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_vec_inputs[input_idx] =
+                    Vec::loadu(&inputs_data_ptrs[input_idx][idx]);
+              }
+              auto result_vec = std::apply(compute_fun, loaded_vec_inputs);
+              result_vec.store(&data_out[idx]);
+            }
+
+            // Scalar epilogue.
+            for (const auto idx : c10::irange(vectorized_end, end)) {
+#ifndef NDEBUG
+              std::array<Vec, kNumInputs> loaded_inputs;
+#else // NDEBUG
+              std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
+#endif // NDEBUG
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+              }
+#ifndef NDEBUG
+              std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1);
+#else // NDEBUG
+              data_out[idx] = std::apply(compute_fun, loaded_inputs);
+#endif // NDEBUG
+            }
+          });
+      return;
+    }
+  }
+#endif // ET_USE_PYTORCH_HEADERS

   ::executorch::extension::parallel_for(
       0,
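The prologue/epilogue bounds in the hunk above are plain index arithmetic; a worked example, assuming Vec::size() == 8 and a parallel_for chunk of [3, 30):

```cpp
// Worked example of the loop-bound arithmetic in the hunk above,
// assuming Vec::size() == 8 and a parallel_for chunk [begin, end) = [3, 30).
#include <cstdio>

int main() {
  const int kVecSize = 8; // stand-in for Vec::size()
  const int begin = 3;
  const int end = 30;
  // First multiple of kVecSize at or after begin: 3 + (8 - 3 % 8) % 8 = 8.
  const int vectorized_begin =
      begin + (kVecSize - begin % kVecSize) % kVecSize;
  // Last multiple of kVecSize at or before end: 30 - 30 % 8 = 24.
  const int vectorized_end = end - (end % kVecSize);
  // Scalar prologue covers [3, 8), the vector loop covers [8, 24) in
  // steps of 8, and the scalar epilogue covers [24, 30).
  std::printf("%d %d\n", vectorized_begin, vectorized_end); // 8 24
}
```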
@@ -240,6 +359,19 @@ inline void apply_unitensor_elementwise_fn(
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }

+/**
+ * Useful for unary elementwise operators. For each element of the
+ * input, call Op and write to the corresponding element of the
+ * output. Tensor broadcasting is applied wherever it is required.
+ *
+ * [NOTE: Generic lambdas]: If Op is a *generic* lambda (i.e., one with `auto`
+ * parameters; normal lambdas are fine), it must fulfill one of the
+ * following conditions. Either:
+ * 1) It must in fact compile when passed at::vec::Vectorized<CTYPE_COMPUTE>, or
+ * 2) It must be actively SFINAE-friendly, as per the C++17 examples in
+ * https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable
+ * .
+ */
 template <
     typename CTYPE_COMPUTE,
     const char* op_name,
@@ -281,6 +413,8 @@ inline void apply_bitensor_elementwise_fn(
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
+ * See [NOTE: Generic lambdas] if you want to pass a generic lambda for
+ * compute_fun.
  */
 template <
     typename CTYPE_COMPUTE,
@@ -347,6 +481,9 @@ inline void apply_tritensor_elementwise_fn(
  *
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMPUTE, op_name>.
+ *
+ * See [NOTE: Generic lambdas] if you want to pass a generic lambda for
+ * compute_fun.
  */
 template <
     typename CTYPE_COMPUTE,
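For option (2) in [NOTE: Generic lambdas], a generic lambda can be made SFINAE-friendly with a trailing return type, so the is_invocable_v probe in can_use_vectorized reports false instead of hard-erroring when the body would not compile for Vectorized. A sketch:

```cpp
// Sketch of a SFINAE-friendly generic lambda (option 2 of
// [NOTE: Generic lambdas]). The trailing return type makes the
// std::atan2 call part of the signature, so substituting
// at::vec::Vectorized<float> fails softly during overload resolution
// instead of triggering a hard error when the body is instantiated.
#include <cmath>
#include <type_traits>
#include <ATen/cpu/vec/vec.h>

auto sfinae_friendly_atan2 = [](const auto a, const auto b)
    -> decltype(std::atan2(a, b)) { return std::atan2(a, b); };

using Vec = at::vec::Vectorized<float>;
static_assert(
    std::is_invocable_v<decltype(sfinae_friendly_atan2), float, float>);
static_assert(
    !std::is_invocable_v<decltype(sfinae_friendly_atan2), Vec, Vec>);
```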
