ParallelFor with compile time optimization of kernels with run time parameters #2954
Merged
AMReX_CTOParallelForImpl.H (new file)
@@ -0,0 +1,331 @@
#ifndef AMREX_CTO_PARALLEL_FOR_H_
#define AMREX_CTO_PARALLEL_FOR_H_

#include <AMReX_BLassert.H>
#include <AMReX_Box.H>
#include <AMReX_Tuple.H>

#include <array>
#include <type_traits>

/* This header is not for users to include directly. It is meant to be
 * included by AMReX_GpuLaunch.H, which includes the headers needed here. */

/* Thanks to Maikel Nadolski and Alex Sinn for the techniques used here! */

namespace amrex {

template <int... ctr>
struct CompileTimeOptions {
    // TypeList is defined in AMReX_Tuple.H
    using list_type = TypeList<std::integral_constant<int, ctr>...>;
};
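
// Illustration (not part of the patch): an instantiation such as
// CompileTimeOptions<0,1,2>::list_type is
//     TypeList<std::integral_constant<int,0>,
//              std::integral_constant<int,1>,
//              std::integral_constant<int,2>>,
// i.e. every allowed value of a run time parameter becomes a distinct
// compile-time type that can later be passed to the kernel as a tag.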

#if (__cplusplus >= 201703L)

namespace meta
{
    template <typename... As, typename... Bs>
    constexpr auto operator+ (TypeList<As...>, TypeList<Bs...>) {
        return TypeList<As..., Bs...>{};
    }

    template <typename... Ls, typename A>
    constexpr auto single_product (TypeList<Ls...>, A) {
        return TypeList<decltype(Ls{} + TypeList<A>{})...>{};
    }

    template <typename LLs, typename... As>
    constexpr auto operator* (LLs, TypeList<As...>) {
        return (TypeList<>{} + ... + single_product(LLs{}, As{}));
    }

    template <typename... Ls>
    constexpr auto cartesian_product_n (TypeList<Ls...>) {
        return (TypeList<TypeList<>>{} * ... * Ls{});
    }
}
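
// Illustration (not part of the patch): the fold in cartesian_product_n
// expands a list of option lists into all combinations. For example,
//     cartesian_product_n(TypeList<TypeList<A0,A1>, TypeList<B0,B1>>{})
// yields
//     TypeList<TypeList<A0,B0>, TypeList<A1,B0>,
//              TypeList<A0,B1>, TypeList<A1,B1>>,
// one inner TypeList per combination of compile-time options.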

namespace detail
{
    template <int MT, typename T, class F, typename... As>
    std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value, bool>
    ParallelFor_helper2 (T const& N, F&& f, TypeList<As...>,
                         std::array<int,sizeof...(As)> const& runtime_options)
    {
        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
            if constexpr (std::is_integral<T>::value) {
                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (T i) noexcept
                {
                    f(i, As{}...);
                });
            } else {
                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                {
                    f(i, j, k, As{}...);
                });
            }
            return true;
        } else {
            return false;
        }
    }

    template <int MT, typename T, class F, typename... As>
    std::enable_if_t<std::is_integral<T>::value, bool>
    ParallelFor_helper2 (Box const& box, T ncomp, F&& f, TypeList<As...>,
                         std::array<int,sizeof...(As)> const& runtime_options)
    {
        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
            ParallelFor<MT>(box, ncomp, [f] AMREX_GPU_DEVICE (int i, int j, int k, T n) noexcept
            {
                f(i, j, k, n, As{}...);
            });
            return true;
        } else {
            return false;
        }
    }

    template <int MT, typename T, class F, typename... PPs, typename RO>
    std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value>
    ParallelFor_helper1 (T const& N, F&& f, TypeList<PPs...>,
                         RO const& runtime_options)
    {
        bool found_option = (false || ... ||
                             ParallelFor_helper2<MT>(N, std::forward<F>(f),
                                                     PPs{}, runtime_options));
        amrex::ignore_unused(found_option);
        AMREX_ASSERT(found_option);
    }

    template <int MT, typename T, class F, typename... PPs, typename RO>
    std::enable_if_t<std::is_integral<T>::value>
    ParallelFor_helper1 (Box const& box, T ncomp, F&& f, TypeList<PPs...>,
                         RO const& runtime_options)
    {
        bool found_option = (false || ... ||
                             ParallelFor_helper2<MT>(box, ncomp, std::forward<F>(f),
                                                     PPs{}, runtime_options));
        amrex::ignore_unused(found_option);
        AMREX_ASSERT(found_option);
    }
}
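
// Illustration (not part of the patch): ParallelFor_helper1 receives the full
// cartesian product of the option lists (e.g. 4*2 = 8 inner TypeLists for the
// example in the documentation below). The fold over PPs tries
// ParallelFor_helper2 for each combination; only the one whose compile-time
// values equal runtime_options launches a kernel, the rest return false.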

#endif

template <int MT, typename T, class F, typename... CTOs>
std::enable_if_t<std::is_integral<T>::value>
ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
             std::array<int,sizeof...(CTOs)> const& runtime_options,
             T N, F&& f)
{
#if (__cplusplus >= 201703L)
    using OptionsListList = TypeList<typename CTOs::list_type...>;
    detail::ParallelFor_helper1<MT>(N, std::forward<F>(f),
                                    meta::cartesian_product_n(OptionsListList{}),
                                    runtime_options);
#else
    amrex::ignore_unused(N, f, runtime_options);
    static_assert(std::is_integral<F>::value, "This requires C++17");
#endif
}

template <int MT, class F, typename... CTOs>
void ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
                  std::array<int,sizeof...(CTOs)> const& runtime_options,
                  Box const& box, F&& f)
{
#if (__cplusplus >= 201703L)
    using OptionsListList = TypeList<typename CTOs::list_type...>;
    detail::ParallelFor_helper1<MT>(box, std::forward<F>(f),
                                    meta::cartesian_product_n(OptionsListList{}),
                                    runtime_options);
#else
    amrex::ignore_unused(box, f, runtime_options);
    static_assert(std::is_integral<F>::value, "This requires C++17");
#endif
}

template <int MT, typename T, class F, typename... CTOs>
std::enable_if_t<std::is_integral<T>::value>
ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
             std::array<int,sizeof...(CTOs)> const& runtime_options,
             Box const& box, T ncomp, F&& f)
{
#if (__cplusplus >= 201703L)
    using OptionsListList = TypeList<typename CTOs::list_type...>;
    detail::ParallelFor_helper1<MT>(box, ncomp, std::forward<F>(f),
                                    meta::cartesian_product_n(OptionsListList{}),
                                    runtime_options);
#else
    amrex::ignore_unused(box, ncomp, f, runtime_options);
    static_assert(std::is_integral<F>::value, "This requires C++17");
#endif
}
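
// Illustration (not part of the patch): these overloads can be called with an
// explicit max-threads template argument, e.g. (hypothetical values)
//
//     ParallelFor<256>(TypeList<CompileTimeOptions<0,1>>{}, {flag},
//                      n, [=] AMREX_GPU_DEVICE (int i, auto use_flag) { ... });
//
// The convenience overloads documented below forward with
// MT = AMREX_GPU_MAX_THREADS.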

/**
 * \brief ParallelFor with compile time optimization of kernels with run time options.
 *
 * It uses fold expressions to generate kernel launches for all combinations
 * of the run time options. The kernel function can use constexpr if to
 * discard unused code blocks for better run time performance. In the
 * example below, the code will be expanded into 4*2=8 normal ParallelFors
 * for all combinations of the run time parameters.
 \verbatim
     int A_runtime_option = ...;
     int B_runtime_option = ...;
     enum A_options : int { A0, A1, A2, A3 };
     enum B_options : int { B0, B1 };
     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                          CompileTimeOptions<B0,B1>>{},
                 {A_runtime_option, B_runtime_option},
                 N, [=] AMREX_GPU_DEVICE (int i, auto A_control, auto B_control)
     {
         ...
         if constexpr (A_control.value == A0) {
             ...
         } else if constexpr (A_control.value == A1) {
             ...
         } else if constexpr (A_control.value == A2) {
             ...
         } else {
             ...
         }
         if constexpr (A_control.value != A3 && B_control.value == B1) {
             ...
         }
         ...
     });
 \endverbatim
 * Note that due to a limitation of CUDA's extended device lambda, the
 * constexpr if block cannot be the one that captures a variable first.
 * If nvcc complains about it, you will have to manually capture it outside
 * constexpr if. The data type for the parameters is int.
 *
 * \param ctos list of all possible values of the parameters.
 * \param option the run time parameters.
 * \param N an integer specifying the 1D for loop's range.
 * \param f a callable object taking an integer and working on that iteration.
 */
template <typename T, class F, typename... CTOs>
std::enable_if_t<std::is_integral<T>::value>
ParallelFor (TypeList<CTOs...> ctos,
             std::array<int,sizeof...(CTOs)> const& option,
             T N, F&& f)
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, N, std::forward<F>(f));
}

/**
 * \brief ParallelFor with compile time optimization of kernels with run time options.
 *
 * It uses fold expressions to generate kernel launches for all combinations
 * of the run time options. The kernel function can use constexpr if to
 * discard unused code blocks for better run time performance. In the
 * example below, the code will be expanded into 4*2=8 normal ParallelFors
 * for all combinations of the run time parameters.
 \verbatim
     int A_runtime_option = ...;
     int B_runtime_option = ...;
     enum A_options : int { A0, A1, A2, A3 };
     enum B_options : int { B0, B1 };
     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                          CompileTimeOptions<B0,B1>>{},
                 {A_runtime_option, B_runtime_option},
                 box, [=] AMREX_GPU_DEVICE (int i, int j, int k,
                                            auto A_control, auto B_control)
     {
         ...
         if constexpr (A_control.value == A0) {
             ...
         } else if constexpr (A_control.value == A1) {
             ...
         } else if constexpr (A_control.value == A2) {
             ...
         } else {
             ...
         }
         if constexpr (A_control.value != A3 && B_control.value == B1) {
             ...
         }
         ...
     });
 \endverbatim
 * Note that due to a limitation of CUDA's extended device lambda, the
 * constexpr if block cannot be the one that captures a variable first.
 * If nvcc complains about it, you will have to manually capture it outside
 * constexpr if. The data type for the parameters is int.
 *
 * \param ctos list of all possible values of the parameters.
 * \param option the run time parameters.
 * \param box a Box specifying the 3D for loop's range.
 * \param f a callable object taking three integers and working on the given cell.
 */
template <class F, typename... CTOs>
void ParallelFor (TypeList<CTOs...> ctos,
                  std::array<int,sizeof...(CTOs)> const& option,
                  Box const& box, F&& f)
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, std::forward<F>(f));
}

/**
 * \brief ParallelFor with compile time optimization of kernels with run time options.
 *
 * It uses fold expressions to generate kernel launches for all combinations
 * of the run time options. The kernel function can use constexpr if to
 * discard unused code blocks for better run time performance. In the
 * example below, the code will be expanded into 4*2=8 normal ParallelFors
 * for all combinations of the run time parameters.
 \verbatim
     int A_runtime_option = ...;
     int B_runtime_option = ...;
     enum A_options : int { A0, A1, A2, A3 };
     enum B_options : int { B0, B1 };
     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                          CompileTimeOptions<B0,B1>>{},
                 {A_runtime_option, B_runtime_option},
                 box, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n,
                                                   auto A_control, auto B_control)
     {
         ...
         if constexpr (A_control.value == A0) {
             ...
         } else if constexpr (A_control.value == A1) {
             ...
         } else if constexpr (A_control.value == A2) {
             ...
         } else {
             ...
         }
         if constexpr (A_control.value != A3 && B_control.value == B1) {
             ...
         }
         ...
     });
 \endverbatim
 * Note that due to a limitation of CUDA's extended device lambda, the
 * constexpr if block cannot be the one that captures a variable first.
 * If nvcc complains about it, you will have to manually capture it outside
 * constexpr if. The data type for the parameters is int.
 *
 * \param ctos list of all possible values of the parameters.
 * \param option the run time parameters.
 * \param box a Box specifying the iteration in 3D space.
 * \param ncomp an integer specifying the range for iteration over components.
 * \param f a callable object taking four integers and working on the given cell and component.
 */
template <typename T, class F, typename... CTOs>
std::enable_if_t<std::is_integral<T>::value>
ParallelFor (TypeList<CTOs...> ctos,
             std::array<int,sizeof...(CTOs)> const& option,
             Box const& box, T ncomp, F&& f)
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, ncomp, std::forward<F>(f));
}

}

#endif
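
A minimal sketch of how user code might call the new overloads. This example is illustrative only and is not taken from the PR; the function name apply, the flags, and the arithmetic are made up, and on GPU builds p is assumed to point to device-accessible memory (e.g. allocated with The_Arena).

#include <AMReX.H>
#include <AMReX_Gpu.H>

using namespace amrex;

// Scale or zero each element, and optionally add one. Both flags are run
// time values, but each generated kernel contains only the selected branches.
void apply (Real* p, int n, int do_scale, int add_one)
{
    ParallelFor(TypeList<CompileTimeOptions<0,1>,
                         CompileTimeOptions<0,1>>{},
                {do_scale, add_one},
                n, [=] AMREX_GPU_DEVICE (int i, auto scale_c, auto add_c)
    {
        Real v = p[i];  // use the captured pointer outside constexpr if
                        // (see the nvcc caveat in the documentation above)
        if constexpr (scale_c.value == 1) {
            v *= Real(2.0);
        } else {
            v = Real(0.0);
        }
        if constexpr (add_c.value == 1) {
            v += Real(1.0);
        }
        p[i] = v;
    });
}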

AMReX_GpuLaunch.H
@@ -443,4 +443,6 @@ namespace Gpu {

#endif

#include <AMReX_CTOParallelForImpl.H>

#endif

CMakeLists.txt (new test)
@@ -0,0 +1,7 @@

set(_sources main.cpp)
set(_input_files)

setup_test(_sources _input_files)

unset(_sources)
unset(_input_files)

GNUmakefile (new test)
@@ -0,0 +1,20 @@

AMREX_HOME = ../../

DEBUG = FALSE
DIM = 3
COMP = gcc

USE_MPI = FALSE
USE_OMP = FALSE
USE_CUDA = FALSE

TINY_PROFILE = FALSE

CXXSTD = c++17

include $(AMREX_HOME)/Tools/GNUMake/Make.defs

include ./Make.package
include $(AMREX_HOME)/Src/Base/Make.package

include $(AMREX_HOME)/Tools/GNUMake/Make.rules

Make.package (new test)
@@ -0,0 +1,4 @@

CEXE_sources += main.cpp