From 61a2f321090084c8c0bae36f4ff839633d934fca Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 28 May 2025 15:09:31 -0700
Subject: [PATCH 1/2] split out some commits

[ghstack-poisoned]
---
 .lintrunner.toml                              |   2 +
 kernels/portable/cpu/util/targets.bzl         |  10 ++
 kernels/portable/cpu/util/test/CMakeLists.txt |  16 +-
 kernels/portable/cpu/util/test/targets.bzl    |  11 ++
 .../cpu/util/test/vectorized_math_test.cpp    |  95 +++++++++++
 kernels/portable/cpu/util/vectorized_math.h   | 148 ++++++++++++++++++
 .../core/portable_type/c10/c10/targets.bzl    |   6 +-
 test/utils/OSSTestConfig.json                 |  12 --
 8 files changed, 277 insertions(+), 23 deletions(-)
 create mode 100644 kernels/portable/cpu/util/test/vectorized_math_test.cpp
 create mode 100644 kernels/portable/cpu/util/vectorized_math.h

diff --git a/.lintrunner.toml b/.lintrunner.toml
index 5e7b4ff0951..4a7f8515791 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -271,6 +271,8 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
+    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
+    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index 560e0472881..0e1e1f2e3a9 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -307,6 +307,16 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_library(
+        name = "vectorized_math",
+        exported_headers = ["vectorized_math.h"],
+        visibility = ["//executorch/..."],
+        exported_deps = [
+            "//executorch/runtime/core/portable_type:portable_type",
+            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+    )
+
     # Utility functions that can be used by operators that perform reduction
     for aten_mode in get_aten_mode_options():
         suffix = "_aten" if aten_mode else ""
diff --git a/kernels/portable/cpu/util/test/CMakeLists.txt b/kernels/portable/cpu/util/test/CMakeLists.txt
index d95b3a81b5c..41bfea54020 100644
--- a/kernels/portable/cpu/util/test/CMakeLists.txt
+++ b/kernels/portable/cpu/util/test/CMakeLists.txt
@@ -4,26 +4,22 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# @generated by test/utils/generate_gtest_cmakelists.py
-#
-# This file should be formatted with
-# ~~~
-# cmake-format -i CMakeLists.txt
-# ~~~
-# It should also be cmake-lint clean.
-#
-
 cmake_minimum_required(VERSION 3.19)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
 
 include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
 set(_test_srcs broadcast_indexes_range_test.cpp broadcast_test.cpp
-               reduce_test.cpp
+               reduce_test.cpp vectorized_math_test.cpp
 )
 
 et_cxx_test(
   kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS
   portable_kernels portable_ops_lib
 )
+
+find_package_torch_headers()
+target_include_directories(kernels_portable_cpu_util_test PRIVATE ${TORCH_INCLUDE_DIRS})
+target_compile_definitions(kernels_portable_cpu_util_test PRIVATE ET_USE_PYTORCH_HEADERS)
diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl
index 178eb25a79b..4b167c6e946 100644
--- a/kernels/portable/cpu/util/test/targets.bzl
+++ b/kernels/portable/cpu/util/test/targets.bzl
@@ -32,3 +32,14 @@ def define_common_targets():
             "//executorch/kernels/portable/cpu/util:reduce_util",
         ],
     )
+
+    # this test requires ET_USE_PYTORCH_HEADERS, which doesn't work in OSS Buck.
+    if not runtime.is_oss:
+        runtime.cxx_test(
+            name = "vectorized_math_test",
+            srcs = ["vectorized_math_test.cpp"],
+            deps = [
+                "//executorch/kernels/portable/cpu/util:vectorized_math",
+                "//executorch/runtime/core/portable_type/c10/c10:c10",
+            ],
+        )
diff --git a/kernels/portable/cpu/util/test/vectorized_math_test.cpp b/kernels/portable/cpu/util/test/vectorized_math_test.cpp
new file mode 100644
index 00000000000..95ce327c53c
--- /dev/null
+++ b/kernels/portable/cpu/util/test/vectorized_math_test.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/vectorized_math.h>
+
+#include <c10/util/irange.h>
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+
+#ifndef ET_USE_PYTORCH_HEADERS
+#error "This test requires ET_USE_PYTORCH_HEADERS!"
+#endif // ET_USE_PYTORCH_HEADERS
+
+TEST(VectorizedMathTest, BasicUnary) {
+  using FloatVec = at::vec::Vectorized<float>;
+  // The loop below expects lane i of arange(0, 1) to hold the value i.
+  __at_align__ float actual[FloatVec::size()] = {0};
+  executorch::math::exp(FloatVec::arange(0, 1)).store(actual);
+  for (const auto lane : c10::irange(FloatVec::size())) {
+    EXPECT_FLOAT_EQ(actual[lane], std::exp(lane));
+  }
+}
+
+namespace {
+template <typename T>
+void test_unary_t_to_float() {
+  __at_align__ float result_floats[at::vec::Vectorized<T>::size()] = {0};
+  const auto x_vec = at::vec::Vectorized<T>::arange(0, 1);
+  const auto result_vec = executorch::math::exp(x_vec);
+  static_assert(decltype(result_vec)::size() >= at::vec::Vectorized<T>::size());
+  result_vec.store(result_floats, at::vec::Vectorized<T>::size());
+  for (const auto ii : c10::irange(at::vec::Vectorized<T>::size())) {
+    EXPECT_FLOAT_EQ(result_floats[ii], std::exp(ii)) << ii; // ULP-tolerant
+  }
+}
+
+} // namespace
+
+TEST(VectorizedMathTest, UnaryInt16ToFloat) {
+  test_unary_t_to_float<std::uint16_t>();
+}
+
+TEST(VectorizedMathTest, UnaryInt32ToFloat) {
+  test_unary_t_to_float<std::uint32_t>();
+}
+
+TEST(VectorizedMathTest, UnaryInt64ToFloat) {
+  test_unary_t_to_float<std::uint64_t>();
+}
+
+TEST(VectorizedMathTest, BasicBinary) {
+  __at_align__ float result_floats[at::vec::Vectorized<float>::size()] = {0};
+  const auto x_vec = at::vec::Vectorized<float>::arange(0, 1);
+  const auto y_vec = at::vec::Vectorized<float>(2);
+  const auto result_vec = executorch::math::pow(x_vec, y_vec);
+  result_vec.store(result_floats);
+  for (const auto ii : c10::irange(at::vec::Vectorized<float>::size())) {
+    EXPECT_FLOAT_EQ(result_floats[ii], std::pow(ii, 2)); // not std::powf
+  }
+}
+
+namespace {
+template <typename T>
+void test_binary_t_to_float() {
+  __at_align__ float result_floats[at::vec::Vectorized<T>::size()] = {0};
+  const auto x_vec = at::vec::Vectorized<T>::arange(0, 1);
+  const auto y_vec = at::vec::Vectorized<T>(2);
+  const auto result_vec = executorch::math::pow(x_vec, y_vec);
+  static_assert(decltype(result_vec)::size() >= at::vec::Vectorized<T>::size());
+  result_vec.store(result_floats, at::vec::Vectorized<T>::size());
+  for (const auto ii : c10::irange(at::vec::Vectorized<T>::size())) {
+    EXPECT_FLOAT_EQ(result_floats[ii], std::pow(ii, 2)) << ii; // ULP-tolerant
+  }
+}
+
+TEST(VectorizedMathTest, BinaryInt16ToFloat) {
+  test_binary_t_to_float<std::int16_t>();
+}
+
+TEST(VectorizedMathTest, BinaryInt32ToFloat) {
+  test_binary_t_to_float<std::int32_t>();
+}
+
+TEST(VectorizedMathTest, BinaryInt64ToFloat) {
+  test_binary_t_to_float<std::uint64_t>();
+}
+
+} // namespace
diff --git a/kernels/portable/cpu/util/vectorized_math.h b/kernels/portable/cpu/util/vectorized_math.h
new file mode 100644
index 00000000000..9e706ace56d
--- /dev/null
+++ b/kernels/portable/cpu/util/vectorized_math.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
+
+#ifdef ET_USE_PYTORCH_HEADERS
+#include <ATen/cpu/vec/vec.h>
+#endif // ET_USE_PYTORCH_HEADERS
+
+#include <iostream>
+#include <type_traits>
+
+#ifdef ET_USE_PYTORCH_HEADERS
+namespace executorch {
+inline namespace math {
+namespace internal {
+template <typename T>
+auto convert_to_vectorized_n_of_float(at::vec::Vectorized<T> vec) {
+  static constexpr auto kInputLanes = at::vec::Vectorized<T>::size();
+  static constexpr auto kFloatLanes = at::vec::Vectorized<float>::size();
+  static constexpr auto kNumFloatVecs =
+      kInputLanes >= kFloatLanes ? kInputLanes / kFloatLanes : 1;
+  static_assert(kNumFloatVecs >= 1); // the result is never an empty pack
+  return at::vec::convert<float, kNumFloatVecs, T, 1, /*keep=*/true>(
+      at::vec::VectorizedN<T, 1>(vec));
+}
+} // namespace internal
+} // namespace math
+} // namespace executorch
+#endif // ET_USE_PYTORCH_HEADERS
+
+#define _ET_INTERNAL_STD_MATH_FUNC(name) \
+  namespace executorch {                 \
+  inline namespace math {                \
+  using std::name;                       \
+  }                                      \
+  } // namespace executorch
+
+#ifdef ET_USE_PYTORCH_HEADERS
+/**
+ * Internal-usage macro for making a vectorized variant of a unary
+ * function available in the executorch::math namespace.
+ */
+#define ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(func_name)                \
+  namespace executorch {                                                  \
+  inline namespace math {                                                 \
+  template <typename T>                                                   \
+  auto func_name(at::vec::Vectorized<T> vec) {                            \
+    if constexpr (!::executorch::runtime::is_floating_point<T>::value) {  \
+      return internal::convert_to_vectorized_n_of_float(vec).func_name(); \
+    } else {                                                              \
+      return vec.func_name();                                             \
+    }                                                                     \
+  }                                                                       \
+  }                                                                       \
+  }
+
+#define ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(func_name)                  \
+  namespace executorch {                                                     \
+  inline namespace math {                                                    \
+  template <typename T>                                                      \
+  auto func_name(at::vec::Vectorized<T> vec0, at::vec::Vectorized<T> vec1) { \
+    if constexpr (!::executorch::runtime::is_floating_point<T>::value) {     \
+      const auto vec_float0 =                                                \
+          internal::convert_to_vectorized_n_of_float(vec0);                  \
+      const auto vec_float1 =                                                \
+          internal::convert_to_vectorized_n_of_float(vec1);                  \
+      return vec_float0.func_name(vec_float1);                               \
+    } else {                                                                 \
+      return vec0.func_name(vec1);                                           \
+    }                                                                        \
+  }                                                                          \
+  }                                                                          \
+  }
+
+/**
+ * Internal-usage macro for making a C++ standard library
+ * floating-point function and a vectorized variant of it available in
+ * the executorch::math namespace. Should be used with functions where the
+ * corresponding operator is a "float op" in TensorIterator parlance
+ * (i.e., uses something like build_borrowing_binary_float_op()),
+ * because it converts non-floating-point arguments to floating point.
+ */
+#define ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(func_name) \
+  _ET_INTERNAL_STD_MATH_FUNC(func_name)                        \
+  ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(func_name)
+
+#define ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(func_name) \
+  _ET_INTERNAL_STD_MATH_FUNC(func_name)                         \
+  ET_INTERNAL_VECTORIZED_FLOAT_BINARY_FUNC(func_name)
+
+#else // ET_USE_PYTORCH_HEADERS
+#define ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(name) \
+  _ET_INTERNAL_STD_MATH_FUNC(name)
+#define ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(name) \
+  _ET_INTERNAL_STD_MATH_FUNC(name)
+#endif // ET_USE_PYTORCH_HEADERS
+
+// To simplify client code, we provide coverage for a bunch of float ops (the
+// same ones listed in ATen vml.h) here.
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(abs)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(acos)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(asin)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(atan)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(ceil)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cos)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(cosh)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(erf)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(erfc)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(exp)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(expm1)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(floor)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log10)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log1p)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(log2)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sin)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sinh)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(sqrt)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(round)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(tan)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(tanh)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(trunc)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_UNARY_FUNC(lgamma)
+
+#ifdef ET_USE_PYTORCH_HEADERS
+ET_INTERNAL_VECTORIZED_FLOAT_UNARY_FUNC(rsqrt) // unary; there is no std::rsqrt
+#endif // ET_USE_PYTORCH_HEADERS
+
+namespace executorch {
+inline namespace math {
+template <typename T, std::enable_if_t<std::is_floating_point_v<T>, int> = 0>
+T rsqrt(T x) { // Scalar overload: 1 / sqrt(x), floating-point T only.
+  return T(1) / std::sqrt(x);
+}
+} // namespace math
+} // namespace executorch
+
+ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(atan2)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(fmod)
+ET_INTERNAL_VECTORIZED_STD_FLOAT_BINARY_FUNC(pow)
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index 4088110246d..70883ef6faf 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -53,7 +53,11 @@ def define_common_targets():
     runtime.cxx_library(
         name = "aten_headers_for_executorch",
         srcs = [],
-        visibility = ["//executorch/kernels/optimized/...", "@EXECUTORCH_CLIENTS"],
+        visibility = [
+            "//executorch/kernels/optimized/...",
+            "//executorch/kernels/portable/cpu/util/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
         exported_deps = select({
             "DEFAULT": [],
             "ovr_config//cpu:arm64": [
diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json
index 2cfc4b8a995..182d0bfd58a 100644
--- a/test/utils/OSSTestConfig.json
+++ b/test/utils/OSSTestConfig.json
@@ -68,18 +68,6 @@
             "extension_threadpool"
         ]
     },
-    {
-        "directory": "kernels/portable/cpu/util/test",
-        "sources": [
-            "broadcast_indexes_range_test.cpp",
-            "broadcast_test.cpp",
-            "reduce_test.cpp"
-        ],
-        "additional_libs": [
-            "portable_kernels",
-            "portable_ops_lib"
-        ]
-    },
     {
         "directory": "runtime/core/portable_type/test",
         "sources": [

From 78e1abbe536d0661378de40809e07465acaed562 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 28 May 2025 15:48:01 -0700
Subject: [PATCH 2/2] fix visibility

[ghstack-poisoned]
---
 runtime/core/portable_type/targets.bzl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl
index 41bc6050524..5b6e67fa213 100644
--- a/runtime/core/portable_type/targets.bzl
+++ b/runtime/core/portable_type/targets.bzl
@@ -26,6 +26,7 @@ def define_common_targets():
         visibility = [
             "//executorch/backends/...",
             "//executorch/extension/fb/dynamic_shim/...",
+            "//executorch/kernels/portable/cpu/...",
             "//executorch/runtime/core/exec_aten/...",
             "//executorch/runtime/core/portable_type/test/...",
         ],