ParallelFor with compile time optimization of kernels with run time parameters #2954
Merged
AMReX_CTOParallelForImpl.H (new file)
@@ -0,0 +1,331 @@
#ifndef AMREX_CTO_PARALLEL_FOR_H_
#define AMREX_CTO_PARALLEL_FOR_H_

#include <AMReX_BLassert.H>
#include <AMReX_Box.H>
#include <AMReX_Tuple.H>

#include <array>
#include <type_traits>

/* This header is not for users to include directly. It is meant to be
 * included by AMReX_GpuLaunch.H, which includes the headers needed here. */

/* Thanks to Maikel Nadolski and Alex Sinn for the techniques used here! */

namespace amrex {

template <int... ctr>
struct CompileTimeOptions {
    // TypeList is defined in AMReX_Tuple.H
    using list_type = TypeList<std::integral_constant<int, ctr>...>;
};
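
// Illustration (not part of the patch): an instantiation such as
// CompileTimeOptions<0,1,2>::list_type is
//     TypeList<std::integral_constant<int,0>,
//              std::integral_constant<int,1>,
//              std::integral_constant<int,2>>,
// i.e. every allowed value of a run time parameter becomes a distinct
// compile-time type that can later be passed to the kernel as a tag.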

#if (__cplusplus >= 201703L)

namespace meta
{
    template <typename... As, typename... Bs>
    constexpr auto operator+ (TypeList<As...>, TypeList<Bs...>) {
        return TypeList<As..., Bs...>{};
    }

    template <typename... Ls, typename A>
    constexpr auto single_product (TypeList<Ls...>, A) {
        return TypeList<decltype(Ls{} + TypeList<A>{})...>{};
    }

    template <typename LLs, typename... As>
    constexpr auto operator* (LLs, TypeList<As...>) {
        return (TypeList<>{} + ... + single_product(LLs{}, As{}));
    }

    template <typename... Ls>
    constexpr auto cartesian_product_n (TypeList<Ls...>) {
        return (TypeList<TypeList<>>{} * ... * Ls{});
    }
}
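
// Illustration (not part of the patch): the fold in cartesian_product_n
// expands a list of option lists into all combinations. For example,
//     cartesian_product_n(TypeList<TypeList<A0,A1>, TypeList<B0,B1>>{})
// yields
//     TypeList<TypeList<A0,B0>, TypeList<A1,B0>,
//              TypeList<A0,B1>, TypeList<A1,B1>>,
// one inner TypeList per combination of compile-time options.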

namespace detail
{
    template <int MT, typename T, class F, typename... As>
    std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value, bool>
    ParallelFor_helper2 (T const& N, F&& f, TypeList<As...>,
                         std::array<int,sizeof...(As)> const& runtime_options)
    {
        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
            if constexpr (std::is_integral<T>::value) {
                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (T i) noexcept
                {
                    f(i, As{}...);
                });
            } else {
                ParallelFor<MT>(N, [f] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                {
                    f(i, j, k, As{}...);
                });
            }
            return true;
        } else {
            return false;
        }
    }

    template <int MT, typename T, class F, typename... As>
    std::enable_if_t<std::is_integral<T>::value, bool>
    ParallelFor_helper2 (Box const& box, T ncomp, F&& f, TypeList<As...>,
                         std::array<int,sizeof...(As)> const& runtime_options)
    {
        if (runtime_options == std::array<int,sizeof...(As)>{As::value...}) {
            ParallelFor<MT>(box, ncomp, [f] AMREX_GPU_DEVICE (int i, int j, int k, T n) noexcept
            {
                f(i, j, k, n, As{}...);
            });
            return true;
        } else {
            return false;
        }
    }

    template <int MT, typename T, class F, typename... PPs, typename RO>
    std::enable_if_t<std::is_integral<T>::value || std::is_same<T,Box>::value>
    ParallelFor_helper1 (T const& N, F&& f, TypeList<PPs...>,
                         RO const& runtime_options)
    {
        bool found_option = (false || ... ||
                             ParallelFor_helper2<MT>(N, std::forward<F>(f),
                                                     PPs{}, runtime_options));
        amrex::ignore_unused(found_option);
        AMREX_ASSERT(found_option);
    }

    template <int MT, typename T, class F, typename... PPs, typename RO>
    std::enable_if_t<std::is_integral<T>::value>
    ParallelFor_helper1 (Box const& box, T ncomp, F&& f, TypeList<PPs...>,
                         RO const& runtime_options)
    {
        bool found_option = (false || ... ||
                             ParallelFor_helper2<MT>(box, ncomp, std::forward<F>(f),
                                                     PPs{}, runtime_options));
        amrex::ignore_unused(found_option);
        AMREX_ASSERT(found_option);
    }
}
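
// Illustration (not part of the patch): ParallelFor_helper1 receives the full
// cartesian product of the option lists (e.g. 4*2 = 8 inner TypeLists for the
// example in the documentation below). The fold over PPs tries
// ParallelFor_helper2 for each combination; only the one whose compile-time
// values equal runtime_options launches a kernel, the rest return false.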

#endif

template <int MT, typename T, class F, typename... CTOs>
std::enable_if_t<std::is_integral<T>::value>
ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
             std::array<int,sizeof...(CTOs)> const& runtime_options,
             T N, F&& f)
{
#if (__cplusplus >= 201703L)
    using OptionsListList = TypeList<typename CTOs::list_type...>;
    detail::ParallelFor_helper1<MT>(N, std::forward<F>(f),
                                    meta::cartesian_product_n(OptionsListList{}),
                                    runtime_options);
#else
    amrex::ignore_unused(N, f, runtime_options);
    static_assert(std::is_integral<F>::value, "This requires C++17");
#endif
}

template <int MT, class F, typename... CTOs>
void ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
                  std::array<int,sizeof...(CTOs)> const& runtime_options,
                  Box const& box, F&& f)
{
#if (__cplusplus >= 201703L)
    using OptionsListList = TypeList<typename CTOs::list_type...>;
    detail::ParallelFor_helper1<MT>(box, std::forward<F>(f),
                                    meta::cartesian_product_n(OptionsListList{}),
                                    runtime_options);
#else
    amrex::ignore_unused(box, f, runtime_options);
    static_assert(std::is_integral<F>::value, "This requires C++17");
#endif
}

template <int MT, typename T, class F, typename... CTOs>
std::enable_if_t<std::is_integral<T>::value>
ParallelFor (TypeList<CTOs...> /*list_of_compile_time_options*/,
             std::array<int,sizeof...(CTOs)> const& runtime_options,
             Box const& box, T ncomp, F&& f)
{
#if (__cplusplus >= 201703L)
    using OptionsListList = TypeList<typename CTOs::list_type...>;
    detail::ParallelFor_helper1<MT>(box, ncomp, std::forward<F>(f),
                                    meta::cartesian_product_n(OptionsListList{}),
                                    runtime_options);
#else
    amrex::ignore_unused(box, ncomp, f, runtime_options);
    static_assert(std::is_integral<F>::value, "This requires C++17");
#endif
}
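
// Illustration (not part of the patch): these overloads can be called with an
// explicit max-threads template argument, e.g. (hypothetical values)
//
//     ParallelFor<256>(TypeList<CompileTimeOptions<0,1>>{}, {flag},
//                      n, [=] AMREX_GPU_DEVICE (int i, auto use_flag) { ... });
//
// The convenience overloads documented below forward with
// MT = AMREX_GPU_MAX_THREADS.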

/**
 * \brief ParallelFor with compile time optimization of kernels with run time options.
 *
 * It uses fold expressions to generate kernel launches for all combinations
 * of the run time options. The kernel function can use constexpr if to
 * discard unused code blocks for better run time performance. In the
 * example below, the code will be expanded into 4*2=8 normal ParallelFors
 * for all combinations of the run time parameters.
 \verbatim
     int A_runtime_option = ...;
     int B_runtime_option = ...;
     enum A_options : int { A0, A1, A2, A3 };
     enum B_options : int { B0, B1 };
     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                          CompileTimeOptions<B0,B1>>{},
                 {A_runtime_option, B_runtime_option},
                 N, [=] AMREX_GPU_DEVICE (int i, auto A_control, auto B_control)
     {
         ...
         if constexpr (A_control.value == A0) {
             ...
         } else if constexpr (A_control.value == A1) {
             ...
         } else if constexpr (A_control.value == A2) {
             ...
         } else {
             ...
         }
         if constexpr (A_control.value != A3 && B_control.value == B1) {
             ...
         }
         ...
     });
 \endverbatim
 * Note that due to a limitation of CUDA's extended device lambda, the
 * constexpr if block cannot be the one that captures a variable first.
 * If nvcc complains about it, you will have to manually capture it outside
 * constexpr if. The data type for the parameters is int.
 *
 * \param ctos list of all possible values of the parameters.
 * \param option the run time parameters.
 * \param N an integer specifying the 1D for loop's range.
 * \param f a callable object taking an integer and working on that iteration.
 */
template <typename T, class F, typename... CTOs>
std::enable_if_t<std::is_integral<T>::value>
ParallelFor (TypeList<CTOs...> ctos,
             std::array<int,sizeof...(CTOs)> const& option,
             T N, F&& f)
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, N, std::forward<F>(f));
}

/**
 * \brief ParallelFor with compile time optimization of kernels with run time options.
 *
 * It uses fold expressions to generate kernel launches for all combinations
 * of the run time options. The kernel function can use constexpr if to
 * discard unused code blocks for better run time performance. In the
 * example below, the code will be expanded into 4*2=8 normal ParallelFors
 * for all combinations of the run time parameters.
 \verbatim
     int A_runtime_option = ...;
     int B_runtime_option = ...;
     enum A_options : int { A0, A1, A2, A3 };
     enum B_options : int { B0, B1 };
     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                          CompileTimeOptions<B0,B1>>{},
                 {A_runtime_option, B_runtime_option},
                 box, [=] AMREX_GPU_DEVICE (int i, int j, int k,
                                            auto A_control, auto B_control)
     {
         ...
         if constexpr (A_control.value == A0) {
             ...
         } else if constexpr (A_control.value == A1) {
             ...
         } else if constexpr (A_control.value == A2) {
             ...
         } else {
             ...
         }
         if constexpr (A_control.value != A3 && B_control.value == B1) {
             ...
         }
         ...
     });
 \endverbatim
 * Note that due to a limitation of CUDA's extended device lambda, the
 * constexpr if block cannot be the one that captures a variable first.
 * If nvcc complains about it, you will have to manually capture it outside
 * constexpr if. The data type for the parameters is int.
 *
 * \param ctos list of all possible values of the parameters.
 * \param option the run time parameters.
 * \param box a Box specifying the 3D for loop's range.
 * \param f a callable object taking three integers and working on the given cell.
 */
template <class F, typename... CTOs>
void ParallelFor (TypeList<CTOs...> ctos,
                  std::array<int,sizeof...(CTOs)> const& option,
                  Box const& box, F&& f)
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, std::forward<F>(f));
}

/**
 * \brief ParallelFor with compile time optimization of kernels with run time options.
 *
 * It uses fold expressions to generate kernel launches for all combinations
 * of the run time options. The kernel function can use constexpr if to
 * discard unused code blocks for better run time performance. In the
 * example below, the code will be expanded into 4*2=8 normal ParallelFors
 * for all combinations of the run time parameters.
 \verbatim
     int A_runtime_option = ...;
     int B_runtime_option = ...;
     enum A_options : int { A0, A1, A2, A3 };
     enum B_options : int { B0, B1 };
     ParallelFor(TypeList<CompileTimeOptions<A0,A1,A2,A3>,
                          CompileTimeOptions<B0,B1>>{},
                 {A_runtime_option, B_runtime_option},
                 box, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n,
                                                   auto A_control, auto B_control)
     {
         ...
         if constexpr (A_control.value == A0) {
             ...
         } else if constexpr (A_control.value == A1) {
             ...
         } else if constexpr (A_control.value == A2) {
             ...
         } else {
             ...
         }
         if constexpr (A_control.value != A3 && B_control.value == B1) {
             ...
         }
         ...
     });
 \endverbatim
 * Note that due to a limitation of CUDA's extended device lambda, the
 * constexpr if block cannot be the one that captures a variable first.
 * If nvcc complains about it, you will have to manually capture it outside
 * constexpr if. The data type for the parameters is int.
 *
 * \param ctos list of all possible values of the parameters.
 * \param option the run time parameters.
 * \param box a Box specifying the iteration in 3D space.
 * \param ncomp an integer specifying the range for iteration over components.
 * \param f a callable object taking four integers and working on the given cell and component.
 */
template <typename T, class F, typename... CTOs>
std::enable_if_t<std::is_integral<T>::value>
ParallelFor (TypeList<CTOs...> ctos,
             std::array<int,sizeof...(CTOs)> const& option,
             Box const& box, T ncomp, F&& f)
{
    ParallelFor<AMREX_GPU_MAX_THREADS>(ctos, option, box, ncomp, std::forward<F>(f));
}

}

#endif
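
A minimal sketch of how user code might call the new overloads. This example is illustrative only and is not taken from the PR; the function name apply, the flags, and the arithmetic are made up, and on GPU builds p is assumed to point to device-accessible memory (e.g. allocated with The_Arena).

#include <AMReX.H>
#include <AMReX_Gpu.H>

using namespace amrex;

// Scale or zero each element, and optionally add one. Both flags are run
// time values, but each generated kernel contains only the selected branches.
void apply (Real* p, int n, int do_scale, int add_one)
{
    ParallelFor(TypeList<CompileTimeOptions<0,1>,
                         CompileTimeOptions<0,1>>{},
                {do_scale, add_one},
                n, [=] AMREX_GPU_DEVICE (int i, auto scale_c, auto add_c)
    {
        Real v = p[i];  // use the captured pointer outside constexpr if
                        // (see the nvcc caveat in the documentation above)
        if constexpr (scale_c.value == 1) {
            v *= Real(2.0);
        } else {
            v = Real(0.0);
        }
        if constexpr (add_c.value == 1) {
            v += Real(1.0);
        }
        p[i] = v;
    });
}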

AMReX_GpuLaunch.H
@@ -443,4 +443,6 @@ namespace Gpu {

#endif

#include <AMReX_CTOParallelForImpl.H>

#endif

CMakeLists.txt (new test)
@@ -0,0 +1,7 @@

set(_sources main.cpp)
set(_input_files)

setup_test(_sources _input_files)

unset(_sources)
unset(_input_files)

GNUmakefile (new test)
@@ -0,0 +1,20 @@

AMREX_HOME = ../../

DEBUG = FALSE
DIM = 3
COMP = gcc

USE_MPI = FALSE
USE_OMP = FALSE
USE_CUDA = FALSE

TINY_PROFILE = FALSE

CXXSTD = c++17

include $(AMREX_HOME)/Tools/GNUMake/Make.defs

include ./Make.package
include $(AMREX_HOME)/Src/Base/Make.package

include $(AMREX_HOME)/Tools/GNUMake/Make.rules

Make.package (new test)
@@ -0,0 +1,4 @@

CEXE_sources += main.cpp