From 2d493a81fd0a4714e11fbde9a76b42509a1b7fc0 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Fri, 14 Mar 2025 08:44:18 -0700
Subject: [PATCH 1/2] Update

[ghstack-poisoned]
---
 kernels/portable/cpu/op_argmin.cpp            | 14 ++++++++++++--
 kernels/portable/cpu/util/reduce_util.h       | 19 -------------------
 kernels/portable/cpu/util/targets.bzl         |  3 ---
 .../kernels/portable/op_registration_util.bzl |  1 +
 4 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp
index b0816596e4e..87e90de4c04 100644
--- a/kernels/portable/cpu/op_argmin.cpp
+++ b/kernels/portable/cpu/op_argmin.cpp
@@ -12,6 +12,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 namespace torch {
@@ -47,8 +48,17 @@ Tensor& argmin_out(
   ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] {
     long* out_data = out.mutable_data_ptr<long>();
 
-    const bool success = parallel_for_each_reduce_over_dim_output_index(
-        in, dim, out, [&](const auto begin, const auto end) {
+    // REVIEW: this is the parallelization strategy ATen uses
+    // specifically when the reduction is along the last dimension and
+    // that dimension is contiguous. Is there any particular reason we
+    // shouldn't just always use this strategy since we aren't
+    // otherwise capable of parallelizing reductions?
+    const int64_t reduction_size = get_reduced_dim_product(in, dim);
+    const auto grain_size = std::max(
+        static_cast<int64_t>(1),
+        executorch::extension::internal::GRAIN_SIZE / reduction_size);
+    const bool success = executorch::extension::parallel_for(
+        0, out.numel(), grain_size, [&](const auto begin, const auto end) {
           for (const auto out_ix : c10::irange(begin, end)) {
             std::tuple<CTYPE, long> acc = reduce_over_dim(
                 [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h
index 6e109299920..2160d9810ae 100644
--- a/kernels/portable/cpu/util/reduce_util.h
+++ b/kernels/portable/cpu/util/reduce_util.h
@@ -10,7 +10,6 @@
 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -812,23 +811,5 @@ bool check_prod_out_args(
 
 #endif
 
-/**
- * parallel_for wrapper for reductions that call reduce_over_dim or
- * map_reduce_over_dim for each output element. Automatically
- * calculates appropriate grain size.
- */
-template <typename Func>
-[[nodiscard]] bool parallel_for_each_reduce_over_dim_output_index(
-    const Tensor& in,
-    optional<int64_t> dim,
-    const Tensor& out,
-    const Func& func) {
-  const int64_t reduction_size = get_reduced_dim_product(in, dim);
-  const auto grain_size = std::max(
-      static_cast<int64_t>(1),
-      executorch::extension::internal::GRAIN_SIZE / reduction_size);
-  return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
-}
-
 } // namespace executor
 } // namespace torch
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index 95fd1734d8e..db8202a920a 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -314,9 +314,6 @@ def define_common_targets():
             "//executorch/runtime/kernel:kernel_includes{}".format(suffix),
             "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix),
         ],
-        exported_deps = [
-            "//executorch/runtime/kernel:thread_parallel_interface",
-        ],
         exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [],
         visibility = [
             "//executorch/extension/llm/custom_ops/...",
diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
index b56413b92f4..dd48da64c30 100644
--- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -284,6 +284,7 @@ ATEN_OPS = (
         name = "op_argmin",
         deps = [
             "//executorch/kernels/portable/cpu/util:reduce_util",
+            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
     ),
     op_target(

From 8da75be7840296056c5136113085c561da149445 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Fri, 14 Mar 2025 08:44:23 -0700
Subject: [PATCH 2/2] Update

[ghstack-poisoned]
---
 kernels/portable/cpu/op_argmin.cpp            | 14 ++------------
 kernels/portable/cpu/util/reduce_util.h       | 19 +++++++++++++++++++
 kernels/portable/cpu/util/targets.bzl         |  3 +++
 .../kernels/portable/op_registration_util.bzl |  1 -
 4 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp
index 87e90de4c04..b0816596e4e 100644
--- a/kernels/portable/cpu/op_argmin.cpp
+++ b/kernels/portable/cpu/op_argmin.cpp
@@ -12,7 +12,6 @@
 
 #include 
 #include 
-#include 
 #include 
 
 namespace torch {
@@ -48,17 +47,8 @@ Tensor& argmin_out(
   ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] {
     long* out_data = out.mutable_data_ptr<long>();
 
-    // REVIEW: this is the parallelization strategy ATen uses
-    // specifically when the reduction is along the last dimension and
-    // that dimension is contiguous. Is there any particular reason we
-    // shouldn't just always use this strategy since we aren't
-    // otherwise capable of parallelizing reductions?
-    const int64_t reduction_size = get_reduced_dim_product(in, dim);
-    const auto grain_size = std::max(
-        static_cast<int64_t>(1),
-        executorch::extension::internal::GRAIN_SIZE / reduction_size);
-    const bool success = executorch::extension::parallel_for(
-        0, out.numel(), grain_size, [&](const auto begin, const auto end) {
+    const bool success = parallel_for_each_reduce_over_dim_output_index(
+        in, dim, out, [&](const auto begin, const auto end) {
           for (const auto out_ix : c10::irange(begin, end)) {
             std::tuple<CTYPE, long> acc = reduce_over_dim(
                 [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h
index 2160d9810ae..f9eeb6bebbc 100644
--- a/kernels/portable/cpu/util/reduce_util.h
+++ b/kernels/portable/cpu/util/reduce_util.h
@@ -10,6 +10,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -811,5 +812,23 @@ bool check_prod_out_args(
 
 #endif
 
+/**
+ * parallel_for wrapper for reductions that call reduce_over_dim or
+ * map_reduce_over_dim for each output element. Automatically
+ * calculates appropriate grain size.
+ */
+template <typename Func>
+[[nodiscard]] bool parallel_for_each_reduce_over_dim_output_index(
+    const Tensor& in,
+    executorch::aten::optional<int64_t> dim,
+    const Tensor& out,
+    const Func& func) {
+  const int64_t reduction_size = get_reduced_dim_product(in, dim);
+  const auto grain_size = std::max(
+      static_cast<int64_t>(1),
+      executorch::extension::internal::GRAIN_SIZE / reduction_size);
+  return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
+}
+
 } // namespace executor
 } // namespace torch
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index db8202a920a..95fd1734d8e 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -314,6 +314,9 @@ def define_common_targets():
             "//executorch/runtime/kernel:kernel_includes{}".format(suffix),
             "//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix),
         ],
+        exported_deps = [
+            "//executorch/runtime/kernel:thread_parallel_interface",
+        ],
         exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [],
         visibility = [
             "//executorch/extension/llm/custom_ops/...",
diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
index dd48da64c30..b56413b92f4 100644
--- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -284,7 +284,6 @@ ATEN_OPS = (
         name = "op_argmin",
         deps = [
             "//executorch/kernels/portable/cpu/util:reduce_util",
-            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
     ),
     op_target(
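
Note on the grain-size strategy discussed in the REVIEW comment of PATCH 1/2 and kept inside parallel_for_each_reduce_over_dim_output_index in PATCH 2/2: each parallel task is handed a contiguous range of output indices sized so that it covers roughly GRAIN_SIZE input elements, since every output index accounts for reduction_size reduced inputs. The standalone sketch below illustrates only that heuristic; it is not ExecuTorch code, and the GRAIN_SIZE value, the serial parallel_for stand-in, and the toy argmin-over-the-last-dimension loop are illustrative assumptions.

// Minimal sketch of the grain-size heuristic, under the assumptions stated
// above. Not ExecuTorch code.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

namespace sketch {

constexpr int64_t GRAIN_SIZE = 32768;  // assumed target amount of work per task

// Serial stand-in for a parallel_for(begin, end, grain, body) API: a real
// implementation would hand each [i, i + grain) chunk to a thread pool.
bool parallel_for(
    int64_t begin,
    int64_t end,
    int64_t grain,
    const std::function<void(int64_t, int64_t)>& body) {
  for (int64_t i = begin; i < end; i += grain) {
    body(i, std::min(i + grain, end));
  }
  return true;
}

}  // namespace sketch

int main() {
  // Toy "tensor": num_out rows, each reduced over a contiguous last
  // dimension of reduction_size elements.
  const int64_t num_out = 8;
  const int64_t reduction_size = 5;
  std::vector<float> in(num_out * reduction_size);
  for (size_t i = 0; i < in.size(); ++i) {
    in[i] = static_cast<float>((i * 7 + 3) % 11);  // arbitrary values
  }
  std::vector<int64_t> out(num_out);

  // Grain size: each task should touch about GRAIN_SIZE input elements, and
  // each output index accounts for reduction_size of them.
  const int64_t grain_size =
      std::max<int64_t>(1, sketch::GRAIN_SIZE / reduction_size);

  sketch::parallel_for(
      0, num_out, grain_size, [&](int64_t begin, int64_t end) {
        for (int64_t out_ix = begin; out_ix < end; ++out_ix) {
          // argmin over the contiguous last dimension of row out_ix.
          int64_t best_ix = 0;
          float best_val = in[out_ix * reduction_size];
          for (int64_t j = 1; j < reduction_size; ++j) {
            const float v = in[out_ix * reduction_size + j];
            if (v < best_val) {
              best_val = v;
              best_ix = j;
            }
          }
          out[out_ix] = best_ix;
        }
      });

  for (int64_t i = 0; i < num_out; ++i) {
    std::cout << "argmin(row " << i << ") = " << out[i] << "\n";
  }
  return 0;
}

With this heuristic, many small reductions get batched into reasonably sized tasks, while one very large reduction per output index drives the grain size down to 1 output element per task.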