Skip to content

Commit b962212

Browse files
committed
ParallelFor with compile time optimization of kernels with run time parameters
Branches inside ParallelFor can be very expensive. If a branch uses a lot of resources (e.g., registers), it can significantly affect the performance even if at run time the branch is never executed because it affects the GPU occupancy. For CPUs, it can affect vectorization of the kernel. The new ParallelFor functions use C++17 fold expressions to generate kernel launches for all run time variants. The kernel function can use constexpr if to discard unused code blocks for better run time performance. Here are two examples of how to use them. int runtime_option = ...; enum All_options : int { A0, A1, A2, A3}; // Four ParallelFors will be generated. ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k, auto control) { ... if constexpr (control.value == A0) { ... } else if constexpr (control.value == A1) { ... } else if constexpr (control.value == A2) { ... } else { ... } ... }, TypeList<CompileTimeOptions<A0,A1,A2,A3>>{}, {runtime_option}); and int A_runtime_option = ...; int B_runtime_option = ...; enum A_options : int { A0, A1, A2, A3}; enum B_options : int { B0, B1 }; // 4*2=8 ParallelFors will be generated. ParallelFor(N, [=] AMREX_GPU_DEVICE (int i, auto A_control, auto B_control) { ... if constexpr (A_control.value == A0) { ... } else if constexpr (A_control.value == A1) { ... } else if constexpr (A_control.value == A2) { ... } else { ... } if constexpr (A_control.value != A3 && B_control.value == B1) { ... } ... }, TypeList<CompileTimeOptions<A0,A1,A2,A3>, CompileTimeOptions<B0,B1> > {}, {A_runtime_option, B_runtime_option}); Note that due to a limitation of CUDA's extended device lambda, the constexpr if block cannot be the one that captures a variable first. If nvcc complains about it, you will have to manually capture it outside constexpr if. The data type for the parameters is int. Thanks to Maikel Nadolski and Alex Sinn for showing us the meta-programming techniques used here.
1 parent 826cd37 commit b962212

File tree

9 files changed

+414
-1
lines changed

9 files changed

+414
-1
lines changed
Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
#ifndef AMREX_CTO_PARALLEL_FOR_H_
2+
#define AMREX_CTO_PARALLEL_FOR_H_
3+
4+
/* This header is not for the users to include directly. It's meant to be
5+
* included in AMReX_GpuLaunch.H, which has included the headers needed
6+
* here. */
7+
8+
/* Thank Maikel Nadolski and Alex Sinn for the techniques used here! */
9+
10+
namespace amrex {
11+
12+
template <int... ctr>
13+
struct CompileTimeOptions {
14+
// TypeList is defined in AMReX_Tuple.H
15+
using list_type = TypeList<std::integral_constant<int, ctr>...>;
16+
};
17+
18+
#if (__cplusplus >= 201703L)
19+
20+
namespace meta
21+
{
22+
template <typename... As, typename... Bs>
23+
constexpr auto operator+ (TypeList<As...>, TypeList<Bs...>) {
24+
return TypeList<As..., Bs...>{};
25+
}
26+
27+
template <typename... Ls, typename A>
28+
constexpr auto single_product (TypeList<Ls...>, A) {
29+
return TypeList<decltype(Ls{} + TypeList<A>{})...>{};
30+
}
31+
32+
template <typename LLs, typename... As>
33+
constexpr auto operator* (LLs, TypeList<As...>) {
34+
return (TypeList<>{} + ... + single_product(LLs{}, As{}));
35+
}
36+
37+
template <typename... Ls>
38+
constexpr auto cartesian_product_n (TypeList<Ls...>) {
39+
return (TypeList<TypeList<>>{} * ... * Ls{});
40+
}
41+
}
42+
43+
namespace detail
44+
{
45+
template <int MT, typename T, class F, typename... As>
46+
std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value, bool>
47+
ParallelFor_helper2 (T const& N, F&& f, TypeList<As...>,
48+
std::array<int,sizeof...(As)> const& runtime_options)
49+
{
50+
if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
51+
if constexpr (std::is_integral<T>::value) {
52+
ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (T i) noexcept
53+
{
54+
f(i, As{}...);
55+
});
56+
} else {
57+
ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
58+
{
59+
f(i, j, k, As{}...);
60+
});
61+
}
62+
return true;
63+
} else {
64+
return false;
65+
}
66+
}
67+
68+
template <int MT, typename T, class F, typename... As>
69+
std::enable_if_t<std::is_integral<T>::value, bool>
70+
ParallelFor_helper2 (Box const& box, T ncomp, F&& f, TypeList<As...>,
71+
std::array<int,sizeof...(As)> const& runtime_options)
72+
{
73+
if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
74+
ParallelFor<MT>(box, ncomp, [f] AMREX_GPU_DEVICE (int i, int j, int k, T n) noexcept
75+
{
76+
f(i, j, k, n, As{}...);
77+
});
78+
return true;
79+
} else {
80+
return false;
81+
}
82+
}
83+
84+
template <int MT, typename T, class F, typename... PPs, typename RO>
85+
std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value>
86+
ParallelFor_helper1 (T const& N, F&& f, TypeList<PPs...>,
87+
RO const& runtime_options)
88+
{
89+
bool found_option = (false || ... ||
90+
ParallelFor_helper2<MT>(N, std::forward<F>(f),
91+
PPs{}, runtime_options));
92+
amrex::ignore_unused(found_option);
93+
AMREX_ASSERT(found_option);
94+
}
95+
96+
template <int MT, typename T, class F, typename... PPs, typename RO>
97+
std::enable_if_t<std::is_integral<T>::value>
98+
ParallelFor_helper1 (Box const& box, T ncomp, F&& f, TypeList<PPs...>,
99+
RO const& runtime_options)
100+
{
101+
bool found_option = (false || ... ||
102+
ParallelFor_helper2<MT>(box, ncomp, std::forward<F>(f),
103+
PPs{}, runtime_options));
104+
amrex::ignore_unused(found_option);
105+
AMREX_ASSERT(found_option);
106+
}
107+
}
108+
109+
#endif
110+
111+
template <int MT, typename T, class F, typename... CTOs>
112+
std::enable_if_t<std::is_integral<T>::value>
113+
ParallelFor (T N, F&& f, TypeList<CTOs...> /*list_of_compile_time_options*/,
114+
std::array<int,sizeof...(CTOs)> const& runtime_options)
115+
{
116+
#if (__cplusplus >= 201703L)
117+
using OptionsListList = TypeList<typename CTOs::list_type...>;
118+
detail::ParallelFor_helper1<MT>(N, std::forward<F>(f),
119+
meta::cartesian_product_n(OptionsListList{}),
120+
runtime_options);
121+
#else
122+
amrex::ignore_unused(N, f, runtime_options);
123+
static_assert(std::is_integral<F>::value, "This requires C++17");
124+
#endif
125+
}
126+
127+
template <int MT, class F, typename... CTOs>
128+
void ParallelFor (Box const& box, F&& f, TypeList<CTOs...> /*list_of_compile_time_options*/,
129+
std::array<int,sizeof...(CTOs)> const& runtime_options)
130+
{
131+
#if (__cplusplus >= 201703L)
132+
using OptionsListList = TypeList<typename CTOs::list_type...>;
133+
detail::ParallelFor_helper1<MT>(box, std::forward<F>(f),
134+
meta::cartesian_product_n(OptionsListList{}),
135+
runtime_options);
136+
#else
137+
amrex::ignore_unused(box, f, runtime_options);
138+
static_assert(std::is_integral<F>::value, "This requires C++17");
139+
#endif
140+
}
141+
142+
template <int MT, typename T, class F, typename... CTOs>
143+
std::enable_if_t<std::is_integral<T>::value>
144+
ParallelFor (Box const& box, T ncomp, F&& f, TypeList<CTOs...> /*list_of_compile_time_options*/,
145+
std::array<int,sizeof...(CTOs)> const& runtime_options)
146+
{
147+
#if (__cplusplus >= 201703L)
148+
using OptionsListList = TypeList<typename CTOs::list_type...>;
149+
detail::ParallelFor_helper1<MT>(box, ncomp, std::forward<F>(f),
150+
meta::cartesian_product_n(OptionsListList{}),
151+
runtime_options);
152+
#else
153+
amrex::ignore_unused(box, ncomp, f, runtime_options);
154+
static_assert(std::is_integral<F>::value, "This requires C++17");
155+
#endif
156+
}
157+
158+
/**
159+
* \brief ParallelFor with compile time optimization of kernels with run time options.
160+
*
161+
* It uses fold expression to generate kernel launches for all combinations
162+
* of the run time options. The kernel function can use constexpr if to
163+
* discard unused code blocks for better run time performance. In the
164+
* example below, the code will be expanded into 4*2=8 normal ParallelFors
165+
* for all combinations of the run time parameters.
166+
\verbatim
167+
int A_runtime_option = ...;
168+
int B_runtime_option = ...;
169+
enum A_options : int { A0, A1, A2, A3};
170+
enum B_options : int { B0, B1 };
171+
ParallelFor(N, [=] AMREX_GPU_DEVICE (int i, auto A_control, auto B_control)
172+
{
173+
...
174+
if constexpr (A_control.value == A0) {
175+
...
176+
} else if constexpr (A_control.value == A1) {
177+
...
178+
} else if constexpr (A_control.value == A2) {
179+
...
180+
} else {
181+
...
182+
}
183+
if constexpr (A_control.value != A3 && B_control.value == B1) {
184+
...
185+
}
186+
...
187+
}, TypeList<CompileTimeOptions<A0,A1,A2,A3>,
188+
CompileTimeOptions<B0,B1>>{},
189+
{A_runtime_option, B_runtime_option});
190+
\endverbatim
191+
* Note that due to a limitation of CUDA's extended device lambda, the
192+
* constexpr if block cannot be the one that captures a variable first.
193+
* If nvcc complains about it, you will have to manually capture it outside
194+
* constexpr if. The data type for the parameters is int.
195+
*
196+
* \param N an integer specifying the 1D for loop's range.
197+
* \param f a callable object taking an integer and working on that iteration.
198+
* \param ctos list of all possible values of the parameters.
199+
* \param option the run time parameters.
200+
*/
201+
template <typename T, class F, typename... CTOs>
202+
std::enable_if_t<std::is_integral<T>::value>
203+
ParallelFor (T N, F&& f, TypeList<CTOs...> ctos,
204+
std::array<int,sizeof...(CTOs)> const& option)
205+
{
206+
ParallelFor<AMREX_GPU_MAX_THREADS>(N, std::forward<F>(f), ctos, option);
207+
}
208+
209+
/**
210+
* \brief ParallelFor with compile time optimization of kernels with run time options.
211+
*
212+
* It uses fold expression to generate kernel launches for all combinations
213+
* of the run time options. The kernel function can use constexpr if to
214+
* discard unused code blocks for better run time performance. In the
215+
* example below, the code will be expanded into 4*2=8 normal ParallelFors
216+
* for all combinations of the run time parameters.
217+
\verbatim
218+
int A_runtime_option = ...;
219+
int B_runtime_option = ...;
220+
enum A_options : int { A0, A1, A2, A3};
221+
enum B_options : int { B0, B1 };
222+
ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k, auto A_control, auto B_control)
223+
{
224+
...
225+
if constexpr (A_control.value == A0) {
226+
...
227+
} else if constexpr (A_control.value == A1) {
228+
...
229+
} else if constexpr (A_control.value == A2) {
230+
...
231+
} else {
232+
...
233+
}
234+
if constexpr (A_control.value != A3 && B_control.value == B1) {
235+
...
236+
}
237+
...
238+
}, TypeList<CompileTimeOptions<A0,A1,A2,A3>,
239+
CompileTimeOptions<B0,B1>>{},
240+
{A_runtime_option, B_runtime_option});
241+
\endverbatim
242+
* Note that due to a limitation of CUDA's extended device lambda, the
243+
* constexpr if block cannot be the one that captures a variable first.
244+
* If nvcc complains about it, you will have to manually capture it outside
245+
* constexpr if. The data type for the parameters is int.
246+
*
247+
* \param box a Box specifying the 3D for loop's range.
248+
* \param f a callable object taking three integers and working on the given cell.
249+
* \param ctos list of all possible values of the parameters,
250+
*        one CompileTimeOptions per parameter.
251+
* \param option the run time parameters,
252+
*        one value per CompileTimeOptions entry, in the same order.
253+
*/
254+
template <class F, typename... CTOs>
255+
void ParallelFor (Box const& box, F&& f, TypeList<CTOs...> ctos,
256+
std::array<int,sizeof...(CTOs)> const& option)
257+
{
258+
ParallelFor<AMREX_GPU_MAX_THREADS>(box, std::forward<F>(f), ctos, option);
259+
}
260+
261+
/**
262+
* \brief ParallelFor with compile time optimization of kernels with run time options.
263+
*
264+
* It uses fold expression to generate kernel launches for all combinations
265+
* of the run time options. The kernel function can use constexpr if to
266+
* discard unused code blocks for better run time performance. In the
267+
* example below, the code will be expanded into 4*2=8 normal ParallelFors
268+
* for all combinations of the run time parameters.
269+
\verbatim
270+
int A_runtime_option = ...;
271+
int B_runtime_option = ...;
272+
enum A_options : int { A0, A1, A2, A3};
273+
enum B_options : int { B0, B1 };
274+
ParallelFor(box, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n,
275+
auto A_control, auto B_control)
276+
{
277+
...
278+
if constexpr (A_control.value == A0) {
279+
...
280+
} else if constexpr (A_control.value == A1) {
281+
...
282+
} else if constexpr (A_control.value == A2) {
283+
...
284+
} else {
285+
...
286+
}
287+
if constexpr (A_control.value != A3 && B_control.value == B1) {
288+
...
289+
}
290+
...
291+
}, TypeList<CompileTimeOptions<A0,A1,A2,A3>,
292+
CompileTimeOptions<B0,B1>>{},
293+
{A_runtime_option, B_runtime_option});
294+
\endverbatim
295+
* Note that due to a limitation of CUDA's extended device lambda, the
296+
* constexpr if block cannot be the one that captures a variable first.
297+
* If nvcc complains about it, you will have to manually capture it outside
298+
* constexpr if. The data type for the parameters is int.
299+
*
300+
* \param box a Box specifying the iteration in 3D space.
301+
* \param ncomp an integer specifying the range for iteration over components.
302+
* \param f a callable object taking four integers (i, j, k and the component index n) and working on the given cell and component.
303+
* \param ctos list of all possible values of the parameters.
304+
* \param option the run time parameters.
305+
*/
306+
template <typename T, class F, typename... CTOs>
307+
std::enable_if_t<std::is_integral<T>::value>
308+
ParallelFor (Box const& box, T ncomp, F&& f, TypeList<CTOs...> ctos,
309+
std::array<int,sizeof...(CTOs)> const& option)
310+
{
311+
ParallelFor<AMREX_GPU_MAX_THREADS>(box, ncomp, std::forward<F>(f), ctos, option);
312+
}
313+
314+
}
315+
316+
#endif

Src/Base/AMReX_GpuLaunch.H

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,4 +443,6 @@ namespace Gpu {
443443

444444
#endif
445445

446+
#include <AMReX_CTOParallelForImpl.H>
447+
446448
#endif

Src/Base/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@ target_sources( amrex
223223
AMReX_MFParallelForC.H
224224
AMReX_MFParallelForG.H
225225
AMReX_TagParallelFor.H
226+
AMReX_CTOParallelForImpl.H
226227
AMReX_ParReduce.H
227228
# CUDA --------------------------------------------------------------------
228229
AMReX_CudaGraph.H

Src/Base/Make.package

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ C$(AMREX_BASE)_headers += AMReX_MFParallelForC.H
100100
C$(AMREX_BASE)_headers += AMReX_MFParallelForG.H
101101

102102
C$(AMREX_BASE)_headers += AMReX_TagParallelFor.H
103+
C$(AMREX_BASE)_headers += AMReX_CTOParallelForImpl.H
103104

104105
C$(AMREX_BASE)_headers += AMReX_ParReduce.H
105106

Tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
22
# List of subdirectories to search for CMakeLists.
33
#
4-
set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Amr CLZ Parser)
4+
set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Amr CLZ Parser CTOParFor)
55

66
if (AMReX_PARTICLES)
77
list(APPEND AMREX_TESTS_SUBDIRS Particles)

Tests/CTOParFor/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
set(_sources main.cpp)
2+
set(_input_files)
3+
4+
setup_test(_sources _input_files)
5+
6+
unset(_sources)
7+
unset(_input_files)

Tests/CTOParFor/GNUmakefile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
AMREX_HOME = ../../
2+
3+
DEBUG = FALSE
4+
DIM = 3
5+
COMP = gcc
6+
7+
USE_MPI = FALSE
8+
USE_OMP = FALSE
9+
USE_CUDA = FALSE
10+
11+
TINY_PROFILE = FALSE
12+
13+
CXXSTD = c++17
14+
15+
include $(AMREX_HOME)/Tools/GNUMake/Make.defs
16+
17+
include ./Make.package
18+
include $(AMREX_HOME)/Src/Base/Make.package
19+
20+
include $(AMREX_HOME)/Tools/GNUMake/Make.rules

Tests/CTOParFor/Make.package

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
CEXE_sources += main.cpp
2+
3+
4+

0 commit comments

Comments
 (0)