Commit d49b147

elementwise_util: don't cast the result of compute_fun back to the common type
The compute function might return an entirely different type than the common type. For example, if we apply a trigonometric function like acos to an input of type bool and expect an output of type float, we would get bad results: acos(0) ≈ 1.57, but casting that through bool truncates it to 1. Note that we no longer need the pair of ET_CHECK_MSG calls I removed, because we already check tensor dtypes on entry to the elementwise util functions; the checks also became inconvenient because we now call get_store_common_to_tensor_fn without the actual common type.

ghstack-source-id: 9f73cfa
ghstack-comment-id: 2735017325
Pull Request resolved: #9385
1 parent 77b624b commit d49b147
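
To make the truncation concrete, here is a minimal standalone C++ sketch (not ExecuTorch code) contrasting the old behavior, which cast the compute result back through the common type (bool here), with the new behavior, which keeps the compute function's return type until the store step:

    #include <cmath>
    #include <cstdio>

    int main() {
      bool input = false;  // the common/compute input type is bool
      // Old behavior: the result of the compute function was cast back to the
      // common type (bool) before being stored to the float output, so
      // acos(0) ~= 1.5708 collapsed to true and then to 1.0f.
      float old_result = static_cast<bool>(std::acos(static_cast<float>(input)));
      // New behavior: the result keeps the compute function's return type
      // (float) all the way to the store step, so no truncation occurs.
      float new_result = std::acos(static_cast<float>(input));
      std::printf("old: %f  new: %f\n", old_result, new_result);  // old: 1.0, new: ~1.5708
      return 0;
    }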

File tree

2 files changed: +24 -35 lines

  kernels/portable/cpu/util/dtype_util.h
  kernels/portable/cpu/util/elementwise_util.h

kernels/portable/cpu/util/dtype_util.h

Lines changed: 5 additions & 31 deletions
@@ -86,12 +86,6 @@ load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_bool_or_byte(
 template <typename CTYPE_COMMON, const char* op_name>
 load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_same_as_compute(
     const Tensor& t) {
-  constexpr auto common_scalar_type = CppTypeToScalarType<CTYPE_COMMON>::value;
-  ET_CHECK_MSG(
-      t.scalar_type() == common_scalar_type,
-      "Unhandled dtype %s for %s",
-      ::executorch::runtime::toString(common_scalar_type),
-      op_name);
   return internal::load_and_convert<CTYPE_COMMON, CTYPE_COMMON>;
 }

@@ -179,33 +173,13 @@ get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) {
 template <typename CTYPE_COMMON, const char* op_name>
 store_common_to_tensor_fn<CTYPE_COMMON>
 get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) {
-  constexpr auto common_scalar_type = CppTypeToScalarType<CTYPE_COMMON>::value;
-  ET_CHECK_MSG(
-      t.scalar_type() == common_scalar_type,
-      "Unhandled dtype %s for %s",
-      ::executorch::runtime::toString(common_scalar_type),
-      op_name);
-  return internal::convert_and_store<CTYPE_COMMON, CTYPE_COMMON>;
+  // We already validate tensor types earlier in the process, so at
+  // this phase, treat same_as_compute the same as our widest
+  // SupportedTensorDtypes set.
+  return get_store_common_to_tensor_fn_realhbf16<CTYPE_COMMON, op_name>(t);
 }

-template <
-    typename CTYPE_COMMON,
-    const char* op_name,
-    std::enable_if_t<std::is_same_v<CTYPE_COMMON, float>, bool> = true>
-store_common_to_tensor_fn<CTYPE_COMMON>
-get_store_common_to_tensor_fn_same_as_common(const Tensor& t) {
-  void (*result)(CTYPE_COMMON, void*) = nullptr;
-  ET_SWITCH_THREE_TYPES(
-      Float, Half, BFloat16, t.scalar_type(), unused, op_name, CTYPE, [&]() {
-        result = internal::convert_and_store<CTYPE, CTYPE_COMMON>;
-      });
-  return result;
-}
-
-template <
-    typename CTYPE_COMMON,
-    const char* op_name,
-    std::enable_if_t<!std::is_same_v<CTYPE_COMMON, float>, bool> = true>
+template <typename CTYPE_COMMON, const char* op_name>
 store_common_to_tensor_fn<CTYPE_COMMON>
 get_store_common_to_tensor_fn_same_as_common(const Tensor& t) {
   return get_store_common_to_tensor_fn_same_as_compute<CTYPE_COMMON, op_name>(
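
For context on the new same_as_compute path above: get_store_common_to_tensor_fn_realhbf16 dispatches on the output tensor's dtype and returns a function that converts the compute-typed value and stores it. The following is a simplified, hypothetical sketch of that dispatch pattern (the enum, pick_store_fn, and the plain float/double cases are illustrative stand-ins; the real helper covers ExecuTorch's real + half + bfloat16 dtype set):

    #include <cstring>

    // Hypothetical, simplified illustration -- not the actual ExecuTorch helper.
    enum class ScalarType { Float, Double, Half, BFloat16 };

    // Convert a compute-typed value to the output dtype, then store it.
    template <typename CTYPE_OUT, typename CTYPE_COMMON>
    void convert_and_store(CTYPE_COMMON value, void* out) {
      CTYPE_OUT converted = static_cast<CTYPE_OUT>(value);
      std::memcpy(out, &converted, sizeof(converted));
    }

    template <typename CTYPE_COMMON>
    using store_fn = void (*)(CTYPE_COMMON, void*);

    // Pick a store function based on the output tensor's dtype; the compute
    // type (CTYPE_COMMON) no longer has to match the output dtype.
    template <typename CTYPE_COMMON>
    store_fn<CTYPE_COMMON> pick_store_fn(ScalarType out_dtype) {
      switch (out_dtype) {
        case ScalarType::Float:
          return convert_and_store<float, CTYPE_COMMON>;
        case ScalarType::Double:
          return convert_and_store<double, CTYPE_COMMON>;
        default:
          // Half/BFloat16 map to their storage types in the real helper.
          return convert_and_store<float, CTYPE_COMMON>;
      }
    }

    int main() {
      float out = 0.0f;
      // Compute type float, output dtype Float: stores ~1.5708f untruncated.
      pick_store_fn<float>(ScalarType::Float)(1.5708f, &out);
      return 0;
    }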

kernels/portable/cpu/util/elementwise_util.h

Lines changed: 19 additions & 4 deletions
@@ -51,6 +51,13 @@ inline int64_t scalar_to<int64_t>(const Scalar& s) {
 }

 namespace internal {
+template <typename Ignore, typename T>
+using ignore_first_yield_second = T;
+
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+using op_call_result =
+    std::invoke_result_t<Op, ignore_first_yield_second<Args, CTYPE_COMMON>...>;
+
 template <
     typename CTYPE_COMMON,
     const char* op_name,
@@ -89,9 +96,16 @@ inline void apply_elementwise_fn(
           inputs.first->element_size(),
       })...};

-  const auto store_common_to_out =
-      internal::get_store_common_to_tensor_fn<CTYPE_COMMON, op_name>(
-          out, out_dtypes);
+  // NOTE: the result of compute_fun is not necessarily CTYPE_COMMON!
+  // For example, consider the possibility that compute_fun is a
+  // trigonometric function like acos, the common input type is bool,
+  // and the output type is float -- we would truncate acos(0) ~= 1.57
+  // to just 1. Conveniently, it costs us nothing at runtime to handle
+  // this correctly.
+  const auto store_compute_result_to_out =
+      internal::get_store_common_to_tensor_fn<
+          op_call_result<CTYPE_COMMON, Op, Args...>,
+          op_name>(out, out_dtypes);
   char* const data_out = reinterpret_cast<char*>(out.mutable_data_ptr());
   const auto out_element_size = out.element_size();

@@ -114,7 +128,8 @@ inline void apply_elementwise_fn(
               .data_ptr[indexes[idx + 1] * input_info.element_size]);
     }
     auto result = std::apply(compute_fun, loaded_inputs);
-    store_common_to_out(result, &data_out[indexes[0] * out_element_size]);
+    store_compute_result_to_out(
+        result, &data_out[indexes[0] * out_element_size]);
   }
 });
 }
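
The op_call_result alias added in the first hunk deduces the compute function's return type by invoking Op as if every argument had the common type. A small self-contained check of that deduction (the lambda is only an example, not code from this commit):

    #include <type_traits>

    template <typename Ignore, typename T>
    using ignore_first_yield_second = T;

    template <typename CTYPE_COMMON, typename Op, typename... Args>
    using op_call_result =
        std::invoke_result_t<Op, ignore_first_yield_second<Args, CTYPE_COMMON>...>;

    int main() {
      // A compute function that takes two operands and returns float,
      // e.g. something acos-like applied to a narrow input type.
      auto compute = [](float a, float b) { return a + b; };
      using Op = decltype(compute);
      // Even if the common type is bool, the deduced result type is float,
      // so the store step no longer truncates through bool.
      static_assert(
          std::is_same_v<op_call_result<bool, Op, int, int>, float>, "");
      return 0;
    }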
