From ac6d74329280e13d2719d304adce49cf425bc980 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Wed, 20 Jul 2022 16:10:39 -0700 Subject: [PATCH 1/4] [SYCL] Add regression test for https://github.com/intel/llvm/pull/6460 --- SYCL/Reduction/regression_after_pr_6343.cpp | 45 +++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 SYCL/Reduction/regression_after_pr_6343.cpp diff --git a/SYCL/Reduction/regression_after_pr_6343.cpp b/SYCL/Reduction/regression_after_pr_6343.cpp new file mode 100644 index 0000000000..d4ca8f054b --- /dev/null +++ b/SYCL/Reduction/regression_after_pr_6343.cpp @@ -0,0 +1,45 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +using namespace sycl; + +int main () { + device d(default_selector{}); + context ctx{d}; + queue q{ctx, d}; + + // Non-uniform WG. + int WGSize = 256; + int N = 22500*256; + + auto *data = malloc_device(N, q); + auto *r1 = malloc_device(1, q); + buffer r2buf(1); + // auto *r2 = malloc_device(1, q); + + q.fill(data, 1, N).wait(); + q.fill(r1, 0, N).wait(); + + q.submit([&](handler &cgh) { + auto r2 = r2buf.get_access(cgh); + cgh.parallel_for(nd_range(range(N), range(WGSize)), + sycl::reduction(r1, std::plus()), + ext::oneapi::reduction(r2, std::plus()), + [=](auto id, auto &r1, auto &r2) { + r1 += 1; + r2 += 2; + }); + }).wait(); + + int res1, res2; + q.copy(r1, &res1, 1).wait(); + auto r2acc = host_accessor{r2buf}; + res2 = r2acc[0]; + assert(res1 == N && res2 == 2 * N); + + return 0; +} From 0ec1f85d2b36c4b5c08c80cec6fc80e4f3c0a06b Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Wed, 20 Jul 2022 16:25:57 -0700 Subject: [PATCH 2/4] clang-format --- SYCL/Reduction/regression_after_pr_6343.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/Reduction/regression_after_pr_6343.cpp b/SYCL/Reduction/regression_after_pr_6343.cpp index d4ca8f054b..574af6ebb0 100644 --- a/SYCL/Reduction/regression_after_pr_6343.cpp +++ b/SYCL/Reduction/regression_after_pr_6343.cpp @@ -7,14 +7,14 @@ using namespace sycl; -int main () { +int main() { device d(default_selector{}); context ctx{d}; queue q{ctx, d}; // Non-uniform WG. int WGSize = 256; - int N = 22500*256; + int N = 22500 * 256; auto *data = malloc_device(N, q); auto *r1 = malloc_device(1, q); From cb0be28513f8c187ee2e1a5353a5dba55ad9c410 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Thu, 21 Jul 2022 11:25:27 -0700 Subject: [PATCH 3/4] Fix issues --- SYCL/Reduction/regression_after_pr_6343.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/SYCL/Reduction/regression_after_pr_6343.cpp b/SYCL/Reduction/regression_after_pr_6343.cpp index 574af6ebb0..419a048559 100644 --- a/SYCL/Reduction/regression_after_pr_6343.cpp +++ b/SYCL/Reduction/regression_after_pr_6343.cpp @@ -19,10 +19,9 @@ int main() { auto *data = malloc_device(N, q); auto *r1 = malloc_device(1, q); buffer r2buf(1); - // auto *r2 = malloc_device(1, q); q.fill(data, 1, N).wait(); - q.fill(r1, 0, N).wait(); + q.fill(r1, 0, 1).wait(); q.submit([&](handler &cgh) { auto r2 = r2buf.get_access(cgh); @@ -41,5 +40,8 @@ int main() { res2 = r2acc[0]; assert(res1 == N && res2 == 2 * N); + free(r1, q); + free(data, q); + return 0; } From 70b347f560f8e15a2d477d7beafdd361fe306813 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 22 Jul 2022 11:10:10 -0700 Subject: [PATCH 4/4] Clarify comment --- SYCL/Reduction/regression_after_pr_6343.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/SYCL/Reduction/regression_after_pr_6343.cpp b/SYCL/Reduction/regression_after_pr_6343.cpp index 419a048559..c9422cee03 100644 --- a/SYCL/Reduction/regression_after_pr_6343.cpp +++ b/SYCL/Reduction/regression_after_pr_6343.cpp @@ -12,8 +12,11 @@ int main() { context ctx{d}; queue q{ctx, d}; - // Non-uniform WG. int WGSize = 256; + // Reduction implementation would spawn several other kernels to reduce + // partial sums. At some point the number of partial sums won't be divisible + // by the WG size and the code needs to adjust it for that. Ensure that is + // done. int N = 22500 * 256; auto *data = malloc_device(N, q);