
Commit 2978ac8

Add vectorization in elementwise_util (not working yet)
This works with op_mul, which is vectorization-friendly, but doesn't work when we roll it out to pattern.h, because those ops will not work with Vectorized yet. See TODO in elementwise_util.h.

ghstack-source-id: 66f9d9d
ghstack-comment-id: 2738665976
Pull Request resolved: #9432
1 parent d8ac866 commit 2978ac8

33 files changed: +686 -67 lines
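The thread running through the kernel diffs below: lambdas passed to the elementwise utilities take auto parameters, so the same functor can be instantiated with a plain scalar or with ATen's at::vec::Vectorized<T> (the ATen include is gated behind ET_USE_PYTORCH_HEADERS, per the .lintrunner.toml and CMakeLists.txt changes). A minimal standalone sketch of that idea, not taken from this commit:

#include <ATen/cpu/vec/vec.h> // gated behind ET_USE_PYTORCH_HEADERS in the real code
#include <array>
#include <cstdio>

int main() {
  // The same generic lambda serves both the scalar and the SIMD path,
  // because Vectorized<T> overloads the usual arithmetic operators.
  auto mul = [](const auto val_a, const auto val_b) { return val_a * val_b; };

  // Scalar instantiation.
  float s = mul(2.0f, 3.0f);

  // Vectorized instantiation: one SIMD lane-width at a time.
  using Vec = at::vec::Vectorized<float>;
  std::array<float, Vec::size()> a{}, b{}, out{};
  a.fill(2.0f);
  b.fill(3.0f);
  mul(Vec::loadu(a.data()), Vec::loadu(b.data())).store(out.data());

  std::printf("%f %f\n", s, out[0]); // both print 6.000000
  return 0;
}

Ops whose lambda bodies use only operators that Vectorized overloads (op_mul below) get this for free; bodies built on std:: math calls or ternaries do not, which is what the pattern.h caveat in the commit message is about.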

.lintrunner.toml

Lines changed: 4 additions & 0 deletions
@@ -271,6 +271,10 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
+    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
+    'kernels/portable/cpu/util/elementwise_util.h',
+    'kernels/portable/cpu/util/math_util.h',
+    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.

kernels/portable/CMakeLists.txt

Lines changed: 8 additions & 1 deletion
@@ -69,8 +69,15 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
   target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
   target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
+  gen_selected_ops(LIB_NAME "optimized_portable_ops_lib" OPS_SCHEMA_YAML "${_yaml}")
+  generate_bindings_for_kernels(
+    LIB_NAME "optimized_portable_ops_lib" FUNCTIONS_YAML "${_yaml}"
+  )
+  gen_operators_lib(
+    LIB_NAME "optimized_portable_ops_lib" KERNEL_LIBS optimized_portable_kernels DEPS executorch_core
+  )
   install(
-    TARGETS optimized_portable_kernels
+    TARGETS optimized_portable_kernels optimized_portable_ops_lib
     DESTINATION lib
   )
 endif()

kernels/portable/cpu/op_add.cpp

Lines changed: 8 additions & 4 deletions
@@ -102,14 +102,18 @@ Tensor& add_scalar_out(
   static constexpr const char op_name[] = "add.Scalar_out";
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
+    CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
+    auto val_alpha_times_b = val_alpha * val_b;
     utils::apply_unitensor_elementwise_fn<
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [b, alpha](const auto val_a) {
-          CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
-          CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-          return val_a + val_alpha * val_b;
+        [val_alpha_times_b](const auto val_a) {
+          // Cast here supports vectorization; either it does nothing
+          // or it casts from CTYPE_COMPUTE to
+          // Vectorized<CTYPE_COMPUTE>.
+          return val_a + decltype(val_a)(val_alpha_times_b);
         },
         ctx,
         a,
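Why capturing val_alpha_times_b and casting through decltype(val_a) is enough: Vectorized<T> has a broadcasting constructor from T, so the cast is an identity conversion on the scalar path and a splat on the SIMD path. A hypothetical reduction of the pattern, not from this commit:

#include <ATen/cpu/vec/vec.h>

template <typename T>
auto add_scaled(T val_a, float val_alpha_times_b) {
  // decltype(val_a) names T: float in the scalar instantiation,
  // at::vec::Vectorized<float> in the vectorized one, where T(float)
  // is the broadcasting (splat) constructor.
  return val_a + decltype(val_a)(val_alpha_times_b);
}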

kernels/portable/cpu/op_atan2.cpp

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ Tensor& atan2_out(
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
       [](const auto val_a, const auto val_b) {
-        return std::atan2(val_a, val_b);
+        return executorch::math::atan2(val_a, val_b);
       },
       ctx,
       a,
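executorch::math::atan2 comes from the new vectorized_math.h, which this page does not show. A plausible sketch of the overload pair such a wrapper needs, assuming it simply dispatches on argument type (ATen's Vectorized<T> does provide an atan2 member):

#include <cmath>
#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>
#endif

namespace executorch {
namespace math {

// Scalar fallback: defer to the standard library.
template <typename T>
T atan2(T a, T b) {
  return std::atan2(a, b);
}

#ifdef ET_USE_PYTORCH_HEADERS
// SIMD overload: forward to ATen's per-lane implementation.
template <typename T>
at::vec::Vectorized<T> atan2(at::vec::Vectorized<T> a, at::vec::Vectorized<T> b) {
  return a.atan2(b);
}
#endif

} // namespace math
} // namespace executorch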

kernels/portable/cpu/op_clamp.cpp

Lines changed: 2 additions & 3 deletions
@@ -138,9 +138,8 @@ Tensor& clamp_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-      [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) {
-        // TODO: rewrite this to be vectorization-capable.
-        CTYPE_COMPUTE val_out = val_in;
+      [has_min, min_opt, has_max, max_opt](const auto val_in) {
+        auto val_out = val_in;
         if (has_min) {
           val_out = utils::max_override(
               val_out, utils::scalar_to<CTYPE_COMPUTE>(min_opt.value()));
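The clamp lambda now mixes an auto-typed running value with scalar CTYPE_COMPUTE bounds, so instantiating it with Vectorized inputs also requires utils::max_override/min_override overloads that accept Vectorized (including mixed vector/scalar forms). A sketch of one such overload, assuming at::vec::maximum matches the NaN-propagating semantics of the scalar max_override:

#ifdef ET_USE_PYTORCH_HEADERS
#include <ATen/cpu/vec/vec.h>

template <typename T>
at::vec::Vectorized<T> max_override(
    at::vec::Vectorized<T> a,
    at::vec::Vectorized<T> b) {
  // at::vec::maximum propagates NaN lanes, mirroring the scalar overload.
  return at::vec::maximum(a, b);
}
#endif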

kernels/portable/cpu/op_elu.cpp

Lines changed: 1 addition & 2 deletions
@@ -48,8 +48,7 @@ Tensor& elu_out(
       CTYPE,
       op_name,
       utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-      [negcoef, math_scale, math_input_scale](const auto x) {
-        // TODO: rewrite this to be vectorization-capable.
+      [negcoef, math_scale, math_input_scale](const CTYPE x) {
         return MathT(x) <= MathT(0)
             ? std::expm1(MathT(x) * math_input_scale) * negcoef
             : MathT(x) * math_scale;

kernels/portable/cpu/op_fmod.cpp

Lines changed: 3 additions & 5 deletions
@@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out(
       utils::SupportedTensorDtypes::REALHBF16>(
       [&div_by_zero_error](
           const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
+        // TODO: rewrite this to be vectorization-capable?
         CTYPE_COMPUTE value = 0;
         if (is_integral_type<CTYPE_COMPUTE, /*includeBool=*/true>::value) {
           if (val_b == 0) {
@@ -138,10 +138,8 @@ Tensor& fmod_Scalar_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      [val_b](const CTYPE_COMPUTE val_a) {
-        // TODO: rewrite this to be vectorization-capable.
-        CTYPE_COMPUTE value = std::fmod(val_a, val_b);
-        return value;
+      [val_b](const auto val_a) {
+        return executorch::math::fmod(val_a, (decltype(val_a))val_b);
       },
       ctx,
       a,

kernels/portable/cpu/op_maximum.cpp

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ Tensor& maximum_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+      [](const auto val_a, const auto val_b) {
         return utils::max_override(val_a, val_b);
       },
       ctx,

kernels/portable/cpu/op_minimum.cpp

Lines changed: 1 addition & 2 deletions
@@ -49,8 +49,7 @@ Tensor& minimum_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
+      [](const auto val_a, const auto val_b) {
         return utils::min_override(val_a, val_b);
       },
       ctx,

kernels/portable/cpu/op_mul.cpp

Lines changed: 1 addition & 3 deletions
@@ -72,9 +72,7 @@ Tensor& mul_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        return val_a * val_b;
-      },
+      [](const auto val_a, const auto val_b) { return val_a * val_b; },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,
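op_mul is the commit message's vectorization-friendly case: the lambda body is a single overloaded operator, so it instantiates cleanly with Vectorized arguments. elementwise_util.h itself is not shown on this page; roughly, the dispatch it adds would run a Vectorized main loop with a scalar tail over contiguous data, along these lines (names in this sketch are hypothetical):

#include <ATen/cpu/vec/vec.h>
#include <cstddef>

template <typename T, typename Op>
void apply_elementwise(const Op& op, const T* a, const T* b, T* out, size_t n) {
  using Vec = at::vec::Vectorized<T>;
  size_t i = 0;
  // Main loop: instantiate op with Vectorized<T> arguments.
  for (; i + Vec::size() <= n; i += Vec::size()) {
    op(Vec::loadu(a + i), Vec::loadu(b + i)).store(out + i);
  }
  // Tail: the same op, instantiated with plain scalars.
  for (; i < n; ++i) {
    out[i] = op(a[i], b[i]);
  }
}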

kernels/portable/cpu/op_native_dropout.cpp

Lines changed: 6 additions & 4 deletions
@@ -57,8 +57,11 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
   }
   ET_SWITCH_FLOATHBF16_TYPES(
       input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() {
-        utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-            [](const auto val, const auto mask_val) {
+        utils::apply_bitensor_elementwise_fn<
+            CTYPE_COMPUTE,
+            op_name,
+            utils::SupportedTensorDtypes::SAME_AS_COMMON>(
+            [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) {
               if (!mask_val) {
                 return static_cast<decltype(val)>(0);
               }
@@ -70,8 +73,7 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
             mask,
             // TODO: should really be just BOOL
             utils::SupportedTensorDtypes::BOOL_OR_BYTE,
-            out,
-            utils::SupportedTensorDtypes::SAME_AS_COMMON);
+            out);
       });
   } else if (input.numel() > 0) {
     std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes());

kernels/portable/cpu/op_pow.cpp

Lines changed: 16 additions & 7 deletions
@@ -57,9 +57,8 @@ Tensor& pow_Tensor_Tensor_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-        // TODO: rewrite this to be vectorization-capable.
-        return std::pow(val_a, val_b);
+      [](const auto val_a, const auto val_b) {
+        return executorch::math::pow(val_a, val_b);
       },
       ctx,
       a,
@@ -111,8 +110,13 @@ Tensor& pow_Tensor_Scalar_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      // TODO: rewrite this to be vectorization-capable.
-      [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); },
+      // Casting val_b here supports vectorization; it does
+      // nothing if we are not vectorizing (casts to
+      // CTYPE_COMPUTE) and casts to a vectorized type
+      // otherwise.
+      [val_b](const auto val_a) {
+        return executorch::math::pow(val_a, decltype(val_a)(val_b));
+      },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,
@@ -161,8 +165,13 @@ Tensor& pow_Scalar_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::REALHBF16>(
-      // TODO: rewrite this to be vectorization-capable.
-      [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); },
+      // Casting val_a here supports vectorization; it does
+      // nothing if we are not vectorizing (casts to
+      // CTYPE_COMPUTE) and casts to a vectorized type
+      // otherwise.
+      [val_a](const auto val_b) {
+        return executorch::math::pow(decltype(val_b)(val_a), val_b);
+      },
       ctx,
       b,
       utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/op_sigmoid.cpp

Lines changed: 3 additions & 4 deletions
@@ -49,10 +49,9 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::FLOATHBF16>(
-      [](const auto val_in) -> CTYPE_COMPUTE {
-        // TODO: rewrite this to be vectorization-capable
-        CTYPE_COMPUTE out_val = static_cast<CTYPE_COMPUTE>(1.0) /
-            (static_cast<CTYPE_COMPUTE>(1.0) + exp(-val_in));
+      [](const auto val_in) {
+        const auto one = static_cast<decltype(val_in)>(1.0);
+        auto out_val = one / (one + executorch::math::exp(-val_in));
         return out_val;
       },
       ctx,

kernels/portable/cpu/op_where.cpp

Lines changed: 3 additions & 3 deletions
@@ -47,9 +47,9 @@ Tensor& where_out(
       CTYPE_COMPUTE,
       op_name,
       utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-      [](const auto val_a, const auto val_b, const auto val_c) {
-        return val_c ? val_a : val_b;
-      },
+      [](const CTYPE_COMPUTE val_a,
+         const CTYPE_COMPUTE val_b,
+         const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; },
       ctx,
       a,
       utils::SupportedTensorDtypes::REALHBBF16,
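where_out moves in the opposite direction, pinning the parameters back to CTYPE_COMPUTE: val_c ? val_a : val_b cannot instantiate with Vectorized arguments, since Vectorized has no contextual conversion to bool. This is exactly the kind of op the commit message flags as not Vectorized-ready. A SIMD select would instead go through ATen's blendv; the sketch below is an assumption about a possible future direction, not part of this commit:

#include <ATen/cpu/vec/vec.h>

template <typename T>
at::vec::Vectorized<T> where_vec(
    at::vec::Vectorized<T> cond_mask, // all-ones lanes where true, zeros otherwise
    at::vec::Vectorized<T> val_a,
    at::vec::Vectorized<T> val_b) {
  // blendv(a, b, mask) picks b where mask bits are set and a elsewhere,
  // so this selects val_a on true lanes and val_b on false lanes.
  return at::vec::Vectorized<T>::blendv(val_b, val_a, cond_mask);
}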
