diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index bbdd4b3e70930..303f28106cac1 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -226,7 +226,8 @@ This remapping consists on an inter-procedural pass replacing each built-in quer First of all, work-item remapping will always be performed when the list of input nd-ranges is heterogeneous. Additional remapping conditions are present for the following work-item components. For each input kernel: - `num_work_groups` and `local_size`: Only performed if the input nd-range has an explicit local size, may result in better performance, as this replaces built-in calls with constants; -- `global_id`, `local_id` and `group_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs. +- `global_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs. +- `local_id` and `group_id`: Never needed as per [kernel fusion restrictions](#restrictions). These are invariant after fusion. Once this rules are set, also taking into account remapping constraints, the remapping is performed as follows for each input kernel: @@ -234,10 +235,6 @@ Once this rules are set, also taking into account remapping constraints, the rem - `global_id(0) = GLID / (global_size(1) * global_size(2))` - `global_id(1) = (GLID / global_size(2)) % global_size(1)` - `global_id(2) = GLID % global_size(2)` -- `local_id`: - - `local_id(x) = global_id(x) % local_size(x)` -- `group_id`: - - `group_id(x) = global_id(x) / local_size(x)` - `num_work_groups`: - `num_work_groups(x) = global_size(x) / local_size(x)` - `global_size`: @@ -348,6 +345,13 @@ q.submit([&](sycl::handler &cgh) { sycl::detail::strategy::group_reduce_and_last_wg_detection>(...); }); ``` +### Group Algorithms and Functions + +Kernel fusion supports group algorithms and functions. As per [remapping +rules](#work-item-remapping), group ID and local ID are invariant after fusion +even when different ND-ranges are involved. This way, group functions and +algorithms conceptually executed for a given group and using a given local ID +as, e.g., the `group_broadcast` local ID, will keep semantics after fusion. ### Unsupported SYCL constructs diff --git a/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp b/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp new file mode 100644 index 0000000000000..2eb7b9774cca8 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp @@ -0,0 +1,108 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +// Test fusion works with permute and remapping. + +#include + +#include "../helpers.hpp" +#include "sycl/group_algorithm.hpp" + +using namespace sycl; + +class FillKernel; +class Kernel0; +class Kernel1; + +int main() { + constexpr size_t dataSize = 512; + constexpr size_t localSize = 16; + std::array in; + std::array out0; + std::array out1; + // Needed to check results + size_t sg_size = 0; + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + { + buffer buff_in{in}; + buffer buff_out0{out0}; + buffer buff_out1{out1}; + buffer buff_sg_size{&sg_size, 1}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + + q.submit([&](handler &cgh) { + accessor in(buff_in, cgh, write_only, no_init); + cgh.parallel_for(nd_range<1>{{dataSize}, {localSize}}, + [=](nd_item<1> i) { + if (i.get_local_id() == 0) { + auto j = i.get_group(0); + in[j] = static_cast(j); + } + }); + }); + + fw.start_fusion(); + + q.submit([&](handler &cgh) { + accessor sg_size(buff_sg_size, cgh, write_only, no_init); + accessor in(buff_in, cgh, read_only); + accessor out(buff_out0, cgh, write_only, no_init); + cgh.parallel_for( + nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) { + sub_group group = i.get_sub_group(); + int gid = i.get_global_id(); + int sgid = group.get_group_id(); + sg_size[0] = group.get_max_local_range()[0]; + out[gid] = permute_group_by_xor( + group, gid, sgid % group.get_max_local_range()[0]); + }); + }); + + q.submit([&](handler &cgh) { + accessor in(buff_in, cgh, read_only); + accessor out(buff_out1, cgh, write_only, no_init); + cgh.parallel_for( + nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) { + sub_group group = i.get_sub_group(); + int gid = i.get_global_id(); + int sgid = group.get_group_id(); + out[gid] = permute_group_by_xor( + group, gid, sgid % group.get_max_local_range()[0]); + }); + }); + + complete_fusion_with_check(fw); + } + + // Check the results + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + int j = 0; + const auto check = [sg_size, &SGid, &SGLid, &SGBeginGid, &out0, + &out1](int j, bool checkSmall) { + if (j % localSize % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % localSize == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + assert(out0[j] == SGBeginGid + (SGLid ^ (SGid % sg_size))); + assert(!checkSmall || (out1[j] == SGBeginGid + (SGLid ^ (SGid % sg_size)))); + SGLid++; + }; + for (int end = dataSize / 2; j < end; j++) { + check(j, true); + } + for (int end = dataSize; j < end; j++) { + check(j, false); + } + + return 0; +} diff --git a/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp b/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp new file mode 100644 index 0000000000000..af220956a9b29 --- /dev/null +++ b/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp @@ -0,0 +1,85 @@ +// RUN: %{build} -fsycl-embed-ir -o %t.out +// RUN: %{run} %t.out + +// Test fusion works with group_broadcast and remapping. + +#include + +#include "../helpers.hpp" + +using namespace sycl; + +class FillKernel; +class Kernel0; +class Kernel1; + +int main() { + constexpr size_t dataSize = 512; + constexpr size_t localSize = 16; + std::array in; + std::array out0; + std::array out1; + + queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; + { + buffer buff_in{in}; + buffer buff_out0{out0}; + buffer buff_out1{out1}; + + ext::codeplay::experimental::fusion_wrapper fw{q}; + + q.submit([&](handler &cgh) { + accessor in(buff_in, cgh, write_only, no_init); + cgh.parallel_for(nd_range<1>{{dataSize}, {localSize}}, + [=](nd_item<1> i) { + if (i.get_local_id() == 0) { + auto j = i.get_group(0); + in[j] = static_cast(j); + } + }); + }); + + fw.start_fusion(); + + q.submit([&](handler &cgh) { + accessor in(buff_in, cgh, read_only); + accessor out(buff_out0, cgh, write_only, no_init); + cgh.parallel_for( + nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) { + auto group = i.get_group(); + out[i.get_global_id()] = group_broadcast( + group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1, + 1); + }); + }); + + q.submit([&](handler &cgh) { + accessor in(buff_in, cgh, read_only); + accessor out(buff_out1, cgh, write_only, no_init); + cgh.parallel_for( + nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) { + auto group = i.get_group(); + out[i.get_global_id()] = group_broadcast( + group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1, + 1); + }); + }); + + complete_fusion_with_check(fw); + } + + // Check the results + int i = 0; + for (int end = dataSize / 2; i < end; ++i) { + int group_id = i / static_cast(localSize); + assert(out0[i] == group_id && "Computation error"); + assert(out1[i] == group_id && "Computation error"); + } + + for (int end = dataSize; i < end; ++i) { + int group_id = i / static_cast(localSize); + assert(out0[i] == group_id && "Computation error"); + } + + return 0; +}