Closed
Description
The following code uses libstdc++'s experimental `simd` to detect several all-zero patterns, each of which can easily be implemented with the `vptest` instruction. All the code is available at https://godbolt.org/z/Kx68E1T6v.
#include <experimental/simd>
#include <cstdint>
// Shorthand for the namespace that hosts libstdc++'s experimental simd types.
namespace stdx = std::experimental;

// A simd vector holding N elements of type T, using whichever ABI tag
// simd_abi::deduce_t selects for that element type and width.
template <typename T, std::size_t N>
using simd_of =
    stdx::simd<T, stdx::simd_abi::deduce_t<T, N>>;

// The vector type used by every example below: four 32-bit signed integers.
using data_t = simd_of<std::int32_t, 4>;
// Returns true iff every lane of x is zero.
bool simple_ptest(data_t x) {
    // Per-lane comparison against zero yields a mask; all_of reduces it.
    const auto lane_is_zero = (x == 0);
    return stdx::all_of(lane_is_zero);
}
// Returns true iff a and b have no set bit in common, i.e. every lane of
// (a & b) is zero.
bool ptest_and(data_t a, data_t b) {
    const data_t common_bits = a & b;
    return stdx::all_of(common_bits == 0);
}
// Returns true iff every bit set in a is also set in b, i.e. every lane of
// (a & ~b) is zero.
bool ptest_andn(data_t a, data_t b) {
    const data_t bits_only_in_a = a & ~b;
    return stdx::all_of(bits_only_in_a == 0);
}
Equivalent assembly (hand-written, AT&T syntax):
simple_ptest:
vptest %xmm0, %xmm0
sete %al
ret
ptest_and:
vptest %xmm0, %xmm1
sete %al
ret
ptest_andn:
vptest %xmm0, %xmm1
setc %al
ret
But clang++ generates the following code (Intel syntax) at `-O3 -march=x86-64-v3`:
simple_ptest(std::experimental::parallelism_v2::simd<int, std::experimental::parallelism_v2::simd_abi::_VecBuiltin<16>>):
vpxor xmm1, xmm1, xmm1
vpcmpeqd xmm0, xmm0, xmm1
vpcmpeqd xmm1, xmm1, xmm1
vptest xmm0, xmm1
setb al
ret
ptest_and(std::experimental::parallelism_v2::simd<int, std::experimental::parallelism_v2::simd_abi::_VecBuiltin<16>>, std::experimental::parallelism_v2::simd<int, std::experimental::parallelism_v2::simd_abi::_VecBuiltin<16>>):
vpand xmm0, xmm1, xmm0
vpxor xmm1, xmm1, xmm1
vpcmpeqd xmm0, xmm0, xmm1
vpcmpeqd xmm1, xmm1, xmm1
vptest xmm0, xmm1
setb al
ret
ptest_andn(std::experimental::parallelism_v2::simd<int, std::experimental::parallelism_v2::simd_abi::_VecBuiltin<16>>, std::experimental::parallelism_v2::simd<int, std::experimental::parallelism_v2::simd_abi::_VecBuiltin<16>>):
vpandn xmm0, xmm1, xmm0
vpxor xmm1, xmm1, xmm1
vpcmpeqd xmm0, xmm0, xmm1
vpcmpeqd xmm1, xmm1, xmm1
vptest xmm0, xmm1
setb al
ret
Reference: the same issue in the GCC Bugzilla (identified as a duplicate of another missed optimization).