Save some size in pattern/{bitwise,comparison}_op.h (#10489)

swolchok · web-flow · commit b98c3abc6a36 · 2025-05-09T10:34:43.000-07:00
bloaty told me that we were paying a noticeable size cost for the ::value members of these structs (at least after the PR in this stack that reapplies #9841) and now we're not.

Test Plan: bash test/build_optimized_size_test.sh

```
before:
adopt functionref
==========
ExecuTorch with no ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 153928 Apr 25 11:08 cmake-out/test/size_test
ExecuTorch with portable ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 2150960 Apr 25 11:08 cmake-out/test/size_test_all_ops
ExecuTorch with optimized ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 5927336 Apr 25 11:08 cmake-out/test/size_test_all_optimized_ops
(.venv) swolchok@swolchok-mac ~/src/executorch> size cmake-out/test/size_test*
__TEXT __DATA __OBJC others dec hex
81920 81920 0 4295049216 4295213056 10003c000 cmake-out/test/size_test
1474560 81920 0 4295655424 4297211904 100224000 cmake-out/test/size_test_all_ops
4505600 98304 0 4296376320 4300980224 1005bc000 cmake-out/test/size_test_all_optimized_ops

after:
ExecuTorch with no ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 153928 Apr 25 12:24 cmake-out/test/size_test
ExecuTorch with portable ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 2150960 Apr 25 12:24 cmake-out/test/size_test_all_ops
ExecuTorch with optimized ops binary size, unstripped:
-rwxr-xr-x 1 swolchok staff 5887368 Apr 25 12:24 cmake-out/test/size_test_all_optimized_ops
(.venv) swolchok@swolchok-mac ~/src/executorch> size cmake-out/test/size_test*
__TEXT __DATA __OBJC others dec hex
81920 81920 0 4295049216 4295213056 10003c000 cmake-out/test/size_test
1474560 81920 0 4295655424 4297211904 100224000 cmake-out/test/size_test_all_ops
4489216 98304 0 4296359936 4300947456 1005b4000 cmake-out/test/size_test_all_optimized_ops
```

(yes it's neutral; improves size results for further diffs)
diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -220,6 +220,13 @@ exclude_patterns = [
     'extension/**',
     'kernels/optimized/**',
     # Justified <functional> include.
+    'kernels/portable/cpu/op_bitwise*.cpp',
+    'kernels/portable/cpu/op_eq.cpp',
+    'kernels/portable/cpu/op_ge.cpp',
+    'kernels/portable/cpu/op_gt.cpp',
+    'kernels/portable/cpu/op_le.cpp',
+    'kernels/portable/cpu/op_lt.cpp',
+    'kernels/portable/cpu/op_ne.cpp',
     'runtime/kernel/thread_parallel_interface.h',
     'scripts/**',
     'third-party/**',
diff --git a/kernels/portable/cpu/op_bitwise_and.cpp b/kernels/portable/cpu/op_bitwise_and.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/bitwise_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,7 @@ Tensor& bitwise_and_Tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "bitwise_and.Tensor_out";
-  return internal::bitwise_tensor_out<op_name>(ctx, a, b, out);
+  return internal::bitwise_tensor_out<std::bit_and, op_name>(ctx, a, b, out);
 }
 
 Tensor& bitwise_and_Scalar_out(
@@ -29,7 +31,7 @@ Tensor& bitwise_and_Scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "bitwise_and.Scalar_out";
-  return internal::bitwise_scalar_out<op_name>(ctx, a, b, out);
+  return internal::bitwise_scalar_out<std::bit_and, op_name>(ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/op_bitwise_or.cpp b/kernels/portable/cpu/op_bitwise_or.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/bitwise_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,7 @@ Tensor& bitwise_or_Tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "bitwise_or.Tensor_out";
-  return internal::bitwise_tensor_out<op_name>(ctx, a, b, out);
+  return internal::bitwise_tensor_out<std::bit_or, op_name>(ctx, a, b, out);
 }
 
 Tensor& bitwise_or_Scalar_out(
@@ -29,7 +31,7 @@ Tensor& bitwise_or_Scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "bitwise_or.Scalar_out";
-  return internal::bitwise_scalar_out<op_name>(ctx, a, b, out);
+  return internal::bitwise_scalar_out<std::bit_or, op_name>(ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/op_bitwise_xor.cpp b/kernels/portable/cpu/op_bitwise_xor.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/bitwise_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,7 @@ Tensor& bitwise_xor_Tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "bitwise_xor.Tensor_out";
-  return internal::bitwise_tensor_out<op_name>(ctx, a, b, out);
+  return internal::bitwise_tensor_out<std::bit_xor, op_name>(ctx, a, b, out);
 }
 
 Tensor& bitwise_xor_Scalar_out(
@@ -29,7 +31,7 @@ Tensor& bitwise_xor_Scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "bitwise_xor.Scalar_out";
-  return internal::bitwise_scalar_out<op_name>(ctx, a, b, out);
+  return internal::bitwise_scalar_out<std::bit_xor, op_name>(ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/op_eq.cpp b/kernels/portable/cpu/op_eq.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,8 @@ Tensor& eq_tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "eq.Tensor_out";
-  return internal::comparison_tensor_out<op_name>(ctx, a, b, out);
+  return internal::comparison_tensor_out<std::equal_to, op_name>(
+      ctx, a, b, out);
 }
 
 Tensor& eq_scalar_out(
@@ -29,7 +32,8 @@ Tensor& eq_scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "eq.Scalar_out";
-  return internal::comparison_scalar_out<op_name>(ctx, a, b, out);
+  return internal::comparison_scalar_out<std::equal_to, op_name>(
+      ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/op_ge.cpp b/kernels/portable/cpu/op_ge.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,8 @@ Tensor& ge_tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "ge.Tensor_out";
-  return internal::comparison_tensor_out<op_name>(ctx, a, b, out);
+  return internal::comparison_tensor_out<std::greater_equal, op_name>(
+      ctx, a, b, out);
 }
 
 Tensor& ge_scalar_out(
@@ -29,7 +32,8 @@ Tensor& ge_scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "ge.Scalar_out";
-  return internal::comparison_scalar_out<op_name>(ctx, a, b, out);
+  return internal::comparison_scalar_out<std::greater_equal, op_name>(
+      ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/op_gt.cpp b/kernels/portable/cpu/op_gt.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,7 @@ Tensor& gt_tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "gt.Tensor_out";
-  return internal::comparison_tensor_out<op_name>(ctx, a, b, out);
+  return internal::comparison_tensor_out<std::greater, op_name>(ctx, a, b, out);
 }
 
 Tensor& gt_scalar_out(
@@ -29,7 +31,7 @@ Tensor& gt_scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "gt.Scalar_out";
-  return internal::comparison_scalar_out<op_name>(ctx, a, b, out);
+  return internal::comparison_scalar_out<std::greater, op_name>(ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/op_le.cpp b/kernels/portable/cpu/op_le.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,8 @@ Tensor& le_tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "le.Tensor_out";
-  return internal::comparison_tensor_out<op_name>(ctx, a, b, out);
+  return internal::comparison_tensor_out<std::less_equal, op_name>(
+      ctx, a, b, out);
 }
 
 Tensor& le_scalar_out(
@@ -29,7 +32,8 @@ Tensor& le_scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "le.Scalar_out";
-  return internal::comparison_scalar_out<op_name>(ctx, a, b, out);
+  return internal::comparison_scalar_out<std::less_equal, op_name>(
+      ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/op_lt.cpp b/kernels/portable/cpu/op_lt.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,7 @@ Tensor& lt_tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "lt.Tensor_out";
-  return internal::comparison_tensor_out<op_name>(ctx, a, b, out);
+  return internal::comparison_tensor_out<std::less, op_name>(ctx, a, b, out);
 }
 
 Tensor& lt_scalar_out(
@@ -29,7 +31,7 @@ Tensor& lt_scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "lt.Scalar_out";
-  return internal::comparison_scalar_out<op_name>(ctx, a, b, out);
+  return internal::comparison_scalar_out<std::less, op_name>(ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/op_ne.cpp b/kernels/portable/cpu/op_ne.cpp
@@ -8,6 +8,8 @@
 
 #include <executorch/kernels/portable/cpu/pattern/comparison_op.h>
 
+#include <functional>
+
 namespace torch {
 namespace executor {
 namespace native {
@@ -19,7 +21,8 @@ Tensor& ne_tensor_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "ne.Tensor_out";
-  return internal::comparison_tensor_out<op_name>(ctx, a, b, out);
+  return internal::comparison_tensor_out<std::not_equal_to, op_name>(
+      ctx, a, b, out);
 }
 
 Tensor& ne_scalar_out(
@@ -29,7 +32,8 @@ Tensor& ne_scalar_out(
     Tensor& out) {
   // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "ne.Scalar_out";
-  return internal::comparison_scalar_out<op_name>(ctx, a, b, out);
+  return internal::comparison_scalar_out<std::not_equal_to, op_name>(
+      ctx, a, b, out);
 }
 
 } // namespace native
diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h
@@ -47,11 +47,13 @@ constexpr bitwise_fn<T> get_bitwise_fn() {
 
 template <typename T, const char* op_name>
 struct BitwiseFnForOp {
-  static constexpr auto value = get_bitwise_fn<T, op_name>();
-  static_assert(value != nullptr, "unknown op_name!");
+  static constexpr auto get_value() {
+    return get_bitwise_fn<T, op_name>();
+  }
+  static_assert(get_value() != nullptr, "unknown op_name!");
 };
 
-template <const char* op_name>
+template <template <typename> class BitOp, const char* op_name>
 Tensor& bitwise_tensor_out(
     RuntimeContext& ctx,
     const Tensor& a,
@@ -81,7 +83,7 @@ Tensor& bitwise_tensor_out(
   ET_SWITCH_INT_TYPES_AND(
       Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
         utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-            BitwiseFnForOp<CTYPE_COMPUTE, op_name>::value,
+            BitOp<CTYPE_COMPUTE>(),
             ctx,
             a,
             utils::SupportedTensorDtypes::INTB,
@@ -94,7 +96,7 @@ Tensor& bitwise_tensor_out(
   return out;
 }
 
-template <const char* op_name>
+template <template <typename> class BitOp, const char* op_name>
 Tensor& bitwise_scalar_out(
     RuntimeContext& ctx,
     const Tensor& a,
@@ -123,8 +125,7 @@ Tensor& bitwise_scalar_out(
         const CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
         utils::apply_unitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
             [val_b](const CTYPE_COMPUTE val_a) {
-              return BitwiseFnForOp<CTYPE_COMPUTE, op_name>::value(
-                  val_a, val_b);
+              return BitOp<CTYPE_COMPUTE>()(val_a, val_b);
             },
             ctx,
             a,
diff --git a/kernels/portable/cpu/pattern/comparison_op.h b/kernels/portable/cpu/pattern/comparison_op.h
@@ -17,53 +17,7 @@ namespace executor {
 namespace native {
 namespace internal {
 
-#define DEFINE_BINARY_OPERATOR_TEMPLATE(name, op) \
-  template <typename T>                           \
-  T name(const T val_a, const T val_b) {          \
-    return val_a op val_b;                        \
-  }
-
-DEFINE_BINARY_OPERATOR_TEMPLATE(eq, ==)
-DEFINE_BINARY_OPERATOR_TEMPLATE(ne, !=)
-DEFINE_BINARY_OPERATOR_TEMPLATE(ge, >=)
-DEFINE_BINARY_OPERATOR_TEMPLATE(le, <=)
-DEFINE_BINARY_OPERATOR_TEMPLATE(gt, >)
-DEFINE_BINARY_OPERATOR_TEMPLATE(lt, <)
-
-template <typename T>
-using comparison_fn = T (*)(const T, const T);
-
-template <typename T, const char* op_name>
-constexpr comparison_fn<T> get_comparison_fn() {
-  std::string_view op = op_name;
-  if (op == "eq.Tensor_out" || op == "eq.Scalar_out") {
-    return eq;
-  }
-  if (op == "ne.Tensor_out" || op == "ne.Scalar_out") {
-    return ne;
-  }
-  if (op == "ge.Tensor_out" || op == "ge.Scalar_out") {
-    return ge;
-  }
-  if (op == "le.Tensor_out" || op == "le.Scalar_out") {
-    return le;
-  }
-  if (op == "gt.Tensor_out" || op == "gt.Scalar_out") {
-    return gt;
-  }
-  if (op == "lt.Tensor_out" || op == "lt.Scalar_out") {
-    return lt;
-  }
-  return nullptr;
-};
-
-template <typename T, const char* op_name>
-struct ComparisonFnForOp {
-  static constexpr auto value = get_comparison_fn<T, op_name>();
-  static_assert(value != nullptr, "unknown op_name!");
-};
-
-template <const char* op_name>
+template <template <typename> class Comparison, const char* op_name>
 Tensor& comparison_tensor_out(
     KernelRuntimeContext& ctx,
     const Tensor& a,
@@ -92,7 +46,7 @@ Tensor& comparison_tensor_out(
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
     utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-        ComparisonFnForOp<CTYPE_COMPUTE, op_name>::value,
+        Comparison<CTYPE_COMPUTE>(),
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,
@@ -105,7 +59,7 @@ Tensor& comparison_tensor_out(
   return out;
 }
 
-template <const char* op_name>
+template <template <typename> class Comparison, const char* op_name>
 Tensor& comparison_scalar_out(
     KernelRuntimeContext& ctx,
     const Tensor& a,
@@ -129,7 +83,7 @@ Tensor& comparison_scalar_out(
     const CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
     utils::apply_unitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
         [val_b](const CTYPE_COMPUTE val_a) {
-          return ComparisonFnForOp<CTYPE_COMPUTE, op_name>::value(val_a, val_b);
+          return Comparison<CTYPE_COMPUTE>()(val_a, val_b);
         },
         ctx,
         a,