Skip to content

Commit e366cb1

Browse files
committed
Fix endianness for the mismatch algorithm
1 parent 47ed2bf commit e366cb1

File tree

1 file changed

+42
-5
lines changed

1 file changed

+42
-5
lines changed

libcxx/include/__algorithm/mismatch.h

+42-5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,39 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
5656

5757
#if _LIBCPP_VECTORIZE_ALGORITHMS
5858

59+
template <class _Tp,
60+
__enable_if_t<is_integral<_Tp>::value, int> = 0>
61+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 8>
62+
__reverse_vector(__simd_vector<_Tp, 8>& __cmp_res) {
63+
#if defined(_LIBCPP_BIG_ENDIAN)
64+
static_assert(__native_vector_size<_Tp> == 8, "The __native_vector_size has to be 8");
65+
__cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
66+
#endif
67+
return __cmp_res;
68+
}
69+
70+
template <class _Tp,
71+
__enable_if_t<is_integral<_Tp>::value, int> = 0>
72+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 16>
73+
__reverse_vector(__simd_vector<_Tp, 16> __cmp_res) {
74+
#if defined(_LIBCPP_BIG_ENDIAN)
75+
static_assert(__native_vector_size<_Tp> == 16, "The __native_vector_size has to be 16");
76+
__cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
77+
#endif
78+
return __cmp_res;
79+
}
80+
81+
template <class _Tp,
82+
__enable_if_t<is_integral<_Tp>::value, int> = 0>
83+
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 32>
84+
__reverse_vector(__simd_vector<_Tp, 32> __cmp_res) {
85+
#if defined(_LIBCPP_BIG_ENDIAN)
86+
static_assert(__native_vector_size<_Tp> == 32, "The __native_vector_size has to be 32");
87+
__cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
88+
#endif
89+
return __cmp_res;
90+
}
91+
5992
template <class _Iter>
6093
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter>
6194
__mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
@@ -77,7 +110,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
77110
}
78111

79112
for (size_t __i = 0; __i != __unroll_count; ++__i) {
80-
if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
113+
auto __cmp_res = __lhs[__i] == __rhs[__i];
114+
__cmp_res = __reverse_vector<_Tp>(__cmp_res);
115+
if (!std::__all_of(__cmp_res)) {
81116
auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
82117
return {__first1 + __offset, __first2 + __offset};
83118
}
@@ -89,8 +124,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
89124

90125
// check the remaining 0-3 vectors
91126
while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
92-
if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
93-
!std::__all_of(__cmp_res)) {
127+
auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
128+
__cmp_res = __reverse_vector<_Tp>(__cmp_res);
129+
if (!std::__all_of(__cmp_res)) {
94130
auto __offset = std::__find_first_not_set(__cmp_res);
95131
return {__first1 + __offset, __first2 + __offset};
96132
}
@@ -106,8 +142,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
106142
if (static_cast<size_t>(__first1 - __orig_first1) >= __vec_size) {
107143
__first1 = __last1 - __vec_size;
108144
__first2 = __last2 - __vec_size;
109-
auto __offset =
110-
std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
145+
auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
146+
__cmp_res = __reverse_vector<_Tp>(__cmp_res);
147+
auto __offset = std::__find_first_not_set(__cmp_res);
111148
return {__first1 + __offset, __first2 + __offset};
112149
} // else loop over the elements individually
113150
}

0 commit comments

Comments
 (0)