Skip to content

Commit 07b18c5

Browse files
authored
[libc++] Optimize ranges::fill{,_n} for vector<bool>::iterator (#84642)
```
------------------------------------------------------
Benchmark                           old            new
------------------------------------------------------
bm_ranges_fill_n/1              1.64 ns        3.06 ns
bm_ranges_fill_n/2              3.45 ns        3.06 ns
bm_ranges_fill_n/3              4.88 ns        3.06 ns
bm_ranges_fill_n/4              6.46 ns        3.06 ns
bm_ranges_fill_n/5              8.03 ns        3.06 ns
bm_ranges_fill_n/6              9.65 ns        3.07 ns
bm_ranges_fill_n/7              11.5 ns        3.06 ns
bm_ranges_fill_n/8              13.0 ns        3.06 ns
bm_ranges_fill_n/16             25.9 ns        3.06 ns
bm_ranges_fill_n/64              103 ns        4.62 ns
bm_ranges_fill_n/512             711 ns        4.40 ns
bm_ranges_fill_n/4096           5642 ns        9.86 ns
bm_ranges_fill_n/32768         45135 ns        33.6 ns
bm_ranges_fill_n/262144       360818 ns         243 ns
bm_ranges_fill_n/1048576     1442828 ns         982 ns
bm_ranges_fill/1                1.63 ns        3.17 ns
bm_ranges_fill/2                3.43 ns        3.28 ns
bm_ranges_fill/3                4.97 ns        3.31 ns
bm_ranges_fill/4                6.53 ns        3.27 ns
bm_ranges_fill/5                8.12 ns        3.33 ns
bm_ranges_fill/6                9.76 ns        3.32 ns
bm_ranges_fill/7                11.6 ns        3.29 ns
bm_ranges_fill/8                13.2 ns        3.26 ns
bm_ranges_fill/16               26.3 ns        3.26 ns
bm_ranges_fill/64                104 ns        4.92 ns
bm_ranges_fill/512               716 ns        4.47 ns
bm_ranges_fill/4096             5772 ns        8.21 ns
bm_ranges_fill/32768           45778 ns        33.1 ns
bm_ranges_fill/262144         351422 ns         241 ns
bm_ranges_fill/1048576       1404710 ns         965 ns
```
1 parent 601e102 commit 07b18c5

File tree

9 files changed

+192
-105
lines changed

9 files changed

+192
-105
lines changed

libcxx/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ set(BENCHMARK_TESTS
176176
algorithms/count.bench.cpp
177177
algorithms/equal.bench.cpp
178178
algorithms/find.bench.cpp
179+
algorithms/fill.bench.cpp
179180
algorithms/for_each.bench.cpp
180181
algorithms/lower_bound.bench.cpp
181182
algorithms/make_heap.bench.cpp
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <algorithm>
10+
#include <benchmark/benchmark.h>
11+
#include <vector>
12+
13+
// Benchmarks std::fill_n writing `false` to every element of a std::vector<bool>
// whose size is the benchmark argument (state.range()).
static void bm_fill_n(benchmark::State& state) {
14+
std::vector<bool> vec1(state.range());
15+
for (auto _ : state) {
16+
benchmark::DoNotOptimize(vec1);
17+
// std::fill_n returns an iterator; it is handed to DoNotOptimize as well.
benchmark::DoNotOptimize(std::fill_n(vec1.begin(), vec1.size(), false));
18+
}
19+
}
20+
// Registered for every size from 1 to 8, then for Range(16, 1 << 20).
BENCHMARK(bm_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
21+
22+
// Benchmarks std::ranges::fill_n writing `false` to every element of a std::vector<bool>
// whose size is the benchmark argument (state.range()).
static void bm_ranges_fill_n(benchmark::State& state) {
23+
std::vector<bool> vec1(state.range());
24+
for (auto _ : state) {
25+
benchmark::DoNotOptimize(vec1);
26+
// The value returned by ranges::fill_n is handed to DoNotOptimize as well.
benchmark::DoNotOptimize(std::ranges::fill_n(vec1.begin(), vec1.size(), false));
27+
}
28+
}
29+
// Registered for every size from 1 to 8, then for Range(16, 1 << 20).
BENCHMARK(bm_ranges_fill_n)->DenseRange(1, 8)->Range(16, 1 << 20);
30+
31+
// Benchmarks std::fill writing `false` over [begin, end) of a std::vector<bool>
// whose size is the benchmark argument (state.range()).
static void bm_fill(benchmark::State& state) {
32+
std::vector<bool> vec1(state.range());
33+
for (auto _ : state) {
34+
benchmark::DoNotOptimize(vec1);
35+
// std::fill returns void, so unlike the fill_n benchmarks there is no result to pass on.
std::fill(vec1.begin(), vec1.end(), false);
36+
}
37+
}
38+
// Registered for every size from 1 to 8, then for Range(16, 1 << 20).
BENCHMARK(bm_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
39+
40+
// Benchmarks the range overload std::ranges::fill(vec1, false) on a std::vector<bool>
// whose size is the benchmark argument (state.range()).
static void bm_ranges_fill(benchmark::State& state) {
41+
std::vector<bool> vec1(state.range());
42+
for (auto _ : state) {
43+
benchmark::DoNotOptimize(vec1);
44+
// The value returned by ranges::fill is handed to DoNotOptimize as well.
benchmark::DoNotOptimize(std::ranges::fill(vec1, false));
45+
}
46+
}
47+
// Registered for every size from 1 to 8, then for Range(16, 1 << 20).
BENCHMARK(bm_ranges_fill)->DenseRange(1, 8)->Range(16, 1 << 20);
48+
49+
BENCHMARK_MAIN();

libcxx/docs/ReleaseNotes/19.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ Improvements and New Features
4949
-----------------------------
5050

5151
- The performance of growing ``std::vector`` has been improved for trivially relocatable types.
52+
- The performance of ``ranges::fill`` and ``ranges::fill_n`` has been improved for ``vector<bool>::iterator``\s,
53+
resulting in a performance increase of up to 1400x.
5254

5355
Deprecations and Removals
5456
-------------------------

libcxx/include/__algorithm/fill_n.h

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,74 @@
99
#ifndef _LIBCPP___ALGORITHM_FILL_N_H
1010
#define _LIBCPP___ALGORITHM_FILL_N_H
1111

12+
#include <__algorithm/min.h>
1213
#include <__config>
14+
#include <__fwd/bit_reference.h>
1315
#include <__iterator/iterator_traits.h>
16+
#include <__memory/pointer_traits.h>
1417
#include <__utility/convert_to_integral.h>
1518

1619
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
1720
# pragma GCC system_header
1821
#endif
1922

23+
_LIBCPP_PUSH_MACROS
24+
#include <__undef_macros>
25+
2026
_LIBCPP_BEGIN_NAMESPACE_STD
2127

2228
// fill_n isn't specialized for std::memset, because the compiler already optimizes the loop to a call to std::memset.
2329

30+
// Forward declaration of the generic __fill_n (its definition follows later in this header).
// __fill_n_bool below calls std::__fill_n on raw storage words, so the name must be declared first.
template <class _OutputIterator, class _Size, class _Tp>
31+
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
32+
__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value);
33+
34+
// Sets (_FillVal == true) or clears (_FillVal == false) __n consecutive bits starting at __first.
// The work is split into three steps: a leading partial storage word, a run of whole words,
// and a trailing partial word. __first is taken by value, so the caller's iterator is not advanced.
template <bool _FillVal, class _Cp>
35+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
36+
__fill_n_bool(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
37+
using _It = __bit_iterator<_Cp, false>;
38+
using __storage_type = typename _It::__storage_type;
39+
40+
const int __bits_per_word = _It::__bits_per_word;
41+
// do first partial word
// (__first.__ctz_ is used here as the bit offset of __first inside the word *__first.__seg_)
42+
if (__first.__ctz_ != 0) {
43+
// Number of bits from the starting offset to the end of this word.
__storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
44+
// Bits actually written in this word: the rest of the word, or fewer if __n is smaller.
__storage_type __dn = std::min(__clz_f, __n);
45+
// Mask with exactly __dn one-bits starting at bit __ctz_: the left shift drops the bits below
// the offset, the right shift drops the bits above the last one to be written.
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
46+
if (_FillVal)
47+
*__first.__seg_ |= __m;
48+
else
49+
*__first.__seg_ &= ~__m;
50+
__n -= __dn;
51+
++__first.__seg_;
52+
}
53+
// do middle whole words
54+
__storage_type __nw = __n / __bits_per_word;
55+
// Whole words are written through the raw address using the generic __fill_n,
// with an all-ones or all-zeros word depending on _FillVal.
std::__fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
56+
__n -= __nw * __bits_per_word;
57+
// do last partial word
58+
if (__n > 0) {
59+
// __seg_ was not advanced by the whole-word fill above, so skip past those words here.
__first.__seg_ += __nw;
60+
// Mask covering the lowest __n bits of the final word.
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
61+
if (_FillVal)
62+
*__first.__seg_ |= __m;
63+
else
64+
*__first.__seg_ &= ~__m;
65+
}
66+
}
67+
68+
// Overload of __fill_n for non-const __bit_iterator. Turns the runtime __value into the
// compile-time _FillVal parameter of __fill_n_bool and returns the iterator __first + __n.
// Nothing is written unless __n > 0.
// NOTE(review): when __n is negative the write is skipped but the function still returns
// __first + __n, i.e. an iterator before __first. Confirm this matches the return value
// required of fill_n for non-positive counts.
template <class _Cp, class _Size>
69+
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
70+
__fill_n(__bit_iterator<_Cp, false> __first, _Size __n, const bool& __value) {
71+
if (__n > 0) {
72+
if (__value)
73+
std::__fill_n_bool<true>(__first, __n);
74+
else
75+
std::__fill_n_bool<false>(__first, __n);
76+
}
77+
return __first + __n;
78+
}
79+
2480
template <class _OutputIterator, class _Size, class _Tp>
2581
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
2682
__fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
@@ -37,4 +93,6 @@ fill_n(_OutputIterator __first, _Size __n, const _Tp& __value) {
3793

3894
_LIBCPP_END_NAMESPACE_STD
3995

96+
_LIBCPP_POP_MACROS
97+
4098
#endif // _LIBCPP___ALGORITHM_FILL_N_H

libcxx/include/__bit_reference

Lines changed: 3 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -171,61 +171,6 @@ private:
171171
__bit_const_reference& operator=(const __bit_const_reference&) = delete;
172172
};
173173

174-
// fill_n
175-
176-
template <bool _FillVal, class _Cp>
177-
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void
178-
__fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) {
179-
using _It = __bit_iterator<_Cp, false>;
180-
using __storage_type = typename _It::__storage_type;
181-
182-
const int __bits_per_word = _It::__bits_per_word;
183-
// do first partial word
184-
if (__first.__ctz_ != 0) {
185-
__storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
186-
__storage_type __dn = std::min(__clz_f, __n);
187-
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
188-
if (_FillVal)
189-
*__first.__seg_ |= __m;
190-
else
191-
*__first.__seg_ &= ~__m;
192-
__n -= __dn;
193-
++__first.__seg_;
194-
}
195-
// do middle whole words
196-
__storage_type __nw = __n / __bits_per_word;
197-
std::fill_n(std::__to_address(__first.__seg_), __nw, _FillVal ? static_cast<__storage_type>(-1) : 0);
198-
__n -= __nw * __bits_per_word;
199-
// do last partial word
200-
if (__n > 0) {
201-
__first.__seg_ += __nw;
202-
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
203-
if (_FillVal)
204-
*__first.__seg_ |= __m;
205-
else
206-
*__first.__seg_ &= ~__m;
207-
}
208-
}
209-
210-
template <class _Cp>
211-
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
212-
fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value) {
213-
if (__n > 0) {
214-
if (__value)
215-
std::__fill_n<true>(__first, __n);
216-
else
217-
std::__fill_n<false>(__first, __n);
218-
}
219-
}
220-
221-
// fill
222-
223-
template <class _Cp>
224-
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
225-
fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value) {
226-
std::fill_n(__first, static_cast<typename _Cp::size_type>(__last - __first), __value);
227-
}
228-
229174
// copy
230175

231176
template <class _Cp, bool _IsConst>
@@ -1007,8 +952,10 @@ private:
1007952
friend class __bit_iterator<_Cp, true>;
1008953
template <class _Dp>
1009954
friend struct __bit_array;
955+
1010956
template <bool _FillVal, class _Dp>
1011-
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend void __fill_n(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
957+
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
958+
__fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
1012959

1013960
template <class _Dp, bool _IC>
1014961
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned(

libcxx/test/libcxx/containers/sequences/vector/robust_against_adl.pass.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ struct MyAlloc {
3131
int main(int, char**)
3232
{
3333
std::vector<bool, MyAlloc<bool>> vb;
34-
std::vector<bool, MyAlloc<bool>> wb(100);
34+
// std::fill_n triggers ADL because __bit_iterator has the container type as a template argument
35+
// std::vector<bool, MyAlloc<bool>> wb(100);
3536

3637
std::vector<int, MyAlloc<int>> v;
3738
std::vector<int, MyAlloc<int>> w(100);

libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp

Lines changed: 75 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -14,62 +14,91 @@
1414
// fill(Iter first, Iter last, const T& value);
1515

1616
#include <algorithm>
17+
#include <array>
1718
#include <cassert>
19+
#include <vector>
1820

1921
#include "test_macros.h"
2022
#include "test_iterators.h"
2123

22-
#if TEST_STD_VER > 17
23-
TEST_CONSTEXPR bool test_constexpr() {
24-
int ia[] = {0, 1, 2, 3, 4};
25-
26-
std::fill(std::begin(ia), std::end(ia), 5);
24+
// Test helper: fills the sub-range [from, to) of `in` with `value` via std::fill, addressing the
// elements through Iter wrapped around in.data(), then asserts that `in` equals `expected`.
// `in` is taken by value, so the caller's container is left untouched.
template <class Iter, class Container>
25+
TEST_CONSTEXPR_CXX20 void
26+
test(Container in, size_t from, size_t to, typename Container::value_type value, Container expected) {
27+
std::fill(Iter(in.data() + from), Iter(in.data() + to), value);
28+
assert(in == expected);
29+
}
2730

28-
return std::all_of(std::begin(ia), std::end(ia), [](int a) {return a == 5; })
29-
;
31+
template <class T>
32+
struct Test {
33+
template <class Iter>
34+
TEST_CONSTEXPR_CXX20 void operator()() {
35+
{
36+
std::array<T, 4> in = {1, 2, 3, 4};
37+
std::array<T, 4> expected = {5, 5, 5, 5};
38+
test<Iter>(in, 0, 4, 5, expected);
3039
}
31-
#endif
32-
33-
template <class Iter>
34-
void
35-
test_char()
36-
{
37-
const unsigned n = 4;
38-
char ca[n] = {0};
39-
std::fill(Iter(ca), Iter(ca+n), char(1));
40-
assert(ca[0] == 1);
41-
assert(ca[1] == 1);
42-
assert(ca[2] == 1);
43-
assert(ca[3] == 1);
44-
}
40+
{
41+
std::array<T, 4> in = {1, 2, 3, 4};
42+
std::array<T, 4> expected = {1, 5, 5, 4};
43+
test<Iter>(in, 1, 3, 5, expected);
44+
}
45+
}
46+
};
4547

46-
template <class Iter>
47-
void
48-
test_int()
49-
{
50-
const unsigned n = 4;
51-
int ia[n] = {0};
52-
std::fill(Iter(ia), Iter(ia+n), 1);
53-
assert(ia[0] == 1);
54-
assert(ia[1] == 1);
55-
assert(ia[2] == 1);
56-
assert(ia[3] == 1);
48+
// Test driver: runs the Test functor over the forward_iterator_list for char* and int*, then
// checks std::fill on std::vector<bool> iterators for several start/end offsets.
// Returns true so that it can also be evaluated inside a static_assert.
// NOTE(review): every vector<bool> below holds at most 16 bits and is only ever filled with
// `true`. If a storage word is wider than 16 bits, the whole-word path is never reached, and the
// clearing (`false`) path is not covered here either - consider adding a multi-word case and a
// fill-with-false case.
TEST_CONSTEXPR_CXX20 bool test() {
49+
types::for_each(types::forward_iterator_list<char*>(), Test<char>());
50+
types::for_each(types::forward_iterator_list<int*>(), Test<int>());
51+
{ // test vector<bool>::iterator optimization
52+
{ // simple case
53+
std::vector<bool> in(4, false);
54+
std::vector<bool> expected(4, true);
55+
std::fill(in.begin(), in.end(), true);
56+
assert(in == expected);
57+
}
58+
{ // partial byte in the front is not filled
59+
std::vector<bool> in(8, false);
60+
std::vector<bool> expected(8, true);
61+
expected[0] = false;
62+
expected[1] = false;
63+
std::fill(in.begin() + 2, in.end(), true);
64+
assert(in == expected);
65+
}
66+
{ // partial byte in the back is not filled
67+
std::vector<bool> in(8, false);
68+
std::vector<bool> expected(8, true);
69+
expected[6] = false;
70+
expected[7] = false;
71+
std::fill(in.begin(), in.end() - 2, true);
72+
assert(in == expected);
73+
}
74+
{ // partial byte in the front and back is not filled
75+
std::vector<bool> in(16, false);
76+
std::vector<bool> expected(16, true);
77+
expected[0] = false;
78+
expected[1] = false;
79+
expected[14] = false;
80+
expected[15] = false;
81+
std::fill(in.begin() + 2, in.end() - 2, true);
82+
assert(in == expected);
83+
}
84+
{ // only a few bits of a byte are set
85+
std::vector<bool> in(8, false);
86+
std::vector<bool> expected(8, true);
87+
expected[0] = false;
88+
expected[1] = false;
89+
expected[6] = false;
90+
expected[7] = false;
91+
std::fill(in.begin() + 2, in.end() - 2, true);
92+
assert(in == expected);
93+
}
94+
}
95+
return true;
5796
}
5897

59-
int main(int, char**)
60-
{
61-
test_char<forward_iterator<char*> >();
62-
test_char<bidirectional_iterator<char*> >();
63-
test_char<random_access_iterator<char*> >();
64-
test_char<char*>();
65-
66-
test_int<forward_iterator<int*> >();
67-
test_int<bidirectional_iterator<int*> >();
68-
test_int<random_access_iterator<int*> >();
69-
test_int<int*>();
70-
71-
#if TEST_STD_VER > 17
72-
static_assert(test_constexpr());
98+
int main(int, char**) {
99+
test();
100+
#if TEST_STD_VER >= 20
101+
static_assert(test());
73102
#endif
74103

75104
return 0;

libcxx/test/std/algorithms/alg.nonmodifying/alg.count/count.pass.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
// count(Iter first, Iter last, const T& value);
1515

1616
// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
17-
// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000
17+
// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
1818

1919
#include <algorithm>
2020
#include <cassert>

libcxx/test/std/algorithms/alg.nonmodifying/alg.count/ranges.count.pass.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
// UNSUPPORTED: c++03, c++11, c++14, c++17
1212

1313
// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
14-
// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=70000000
14+
// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
1515

1616
// template<input_iterator I, sentinel_for<I> S, class T, class Proj = identity>
1717
// requires indirect_binary_predicate<ranges::equal_to, projected<I, Proj>, const T*>

0 commit comments

Comments
 (0)