diff --git a/SYCL/Reduction/regression_after_pr_6343.cpp b/SYCL/Reduction/regression_after_pr_6343.cpp
new file mode 100644
index 0000000000..c9422cee03
--- /dev/null
+++ b/SYCL/Reduction/regression_after_pr_6343.cpp
@@ -0,0 +1,50 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  device d(default_selector{});
+  context ctx{d};
+  queue q{ctx, d};
+
+  int WGSize = 256;
+  // Reduction implementation would spawn several other kernels to reduce
+  // partial sums. At some point the number of partial sums won't be divisible
+  // by the WG size and the code needs to adjust it for that. Ensure that is
+  // done.
+  int N = 22500 * 256;
+
+  auto *data = malloc_device<int>(N, q);
+  auto *r1 = malloc_device<int>(1, q);
+  buffer<int> r2buf(1);
+
+  q.fill(data, 1, N).wait();
+  q.fill(r1, 0, 1).wait();
+
+  q.submit([&](handler &cgh) {
+     auto r2 = r2buf.get_access(cgh);
+     cgh.parallel_for(nd_range(range(N), range(WGSize)),
+                      sycl::reduction(r1, std::plus<int>()),
+                      ext::oneapi::reduction(r2, std::plus<int>()),
+                      [=](auto id, auto &r1, auto &r2) {
+                        r1 += 1;
+                        r2 += 2;
+                      });
+   }).wait();
+
+  int res1, res2;
+  q.copy(r1, &res1, 1).wait();
+  auto r2acc = host_accessor{r2buf};
+  res2 = r2acc[0];
+  assert(res1 == N && res2 == 2 * N);
+
+  free(r1, q);
+  free(data, q);
+
+  return 0;
+}
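
For context on the `N = 22500 * 256` choice: the comment in the test above is the rationale, and the arithmetic behind it can be checked standalone. Below is a minimal sketch (plain C++, illustrative only, assuming a tree reduction that emits one partial sum per work-group, as the test comment describes; it is not part of the test or the SYCL runtime):

    #include <cassert>

    int main() {
      const long long WGSize = 256;
      const long long N = 22500LL * 256; // 5,760,000 input elements

      // First pass: one partial sum per work-group of 256 work-items.
      long long partials = N / WGSize; // 22,500 partial sums

      // 22,500 = 87 * 256 + 228, so the follow-up kernel's element count is
      // not a multiple of the WG size. The implementation must adjust for
      // that (e.g. round the ND-range up and pad with the identity), which
      // is exactly the case this test is designed to hit.
      assert(partials % WGSize != 0);
      return 0;
    }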