diff --git a/SYCL/Reduction/reduction_big_data.cpp b/SYCL/Reduction/reduction_big_data.cpp index 1c8d67068b..e7fd13d1a5 100644 --- a/SYCL/Reduction/reduction_big_data.cpp +++ b/SYCL/Reduction/reduction_big_data.cpp @@ -4,7 +4,7 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUNx: %HOST_RUN_PLACEHOLDER %t.out -// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and +// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and // barrier() // This test performs basic checks of parallel_for(nd_range, reduction, func) @@ -12,9 +12,8 @@ // errors. #include "reduction_utils.hpp" -#include + #include -#include using namespace cl::sycl; @@ -27,14 +26,14 @@ size_t getSafeMaxWGSize(size_t MaxWGSize, size_t MemSize, size_t OneElemSize) { return std::min(MaxNumElems / 2, MaxWGSize); } -template -void test(queue &Q, T Identity) { - device Device = Q.get_device(); +template +int test(queue &Q, T Identity) { + // It seems enough to test just one case - SYCL2020 reduction. + constexpr bool IsSYCL2020 = true; + device Device = Q.get_device(); std::size_t MaxWGSize = Device.get_info(); std::size_t LocalMemSize = Device.get_info(); - std::cout << "Detected device::max_work_group_size = " << MaxWGSize << "\n"; - std::cout << "Detected device::local_mem_size = " << LocalMemSize << "\n"; size_t WGSize = getSafeMaxWGSize(MaxWGSize, LocalMemSize, sizeof(T)); @@ -44,15 +43,12 @@ void test(queue &Q, T Identity) { size_t NWorkItems = std::min(WGSize * MaxWGSize + 1, MaxGlobalRange); size_t NWorkGroups = (NWorkItems - 1) / WGSize + 1; - range<1> GlobalRange(NWorkGroups * WGSize); - range<1> LocalRange(WGSize); - nd_range<1> NDRange(GlobalRange, LocalRange); - std::cout << "Running the test with: GlobalRange = " << (NWorkGroups * WGSize) - << ", LocalRange = " << WGSize << ", NWorkItems = " << NWorkItems - << "\n"; + nd_range<1> NDRange(range<1>{NWorkGroups * WGSize}, range<1>{WGSize}); + printTestLabel(IsSYCL2020, NDRange); buffer InBuf(NWorkItems); buffer OutBuf(1); + (OutBuf.template get_access())[0] = Identity; // Initialize. BinaryOperation BOp; @@ -62,25 +58,17 @@ void test(queue &Q, T Identity) { // Compute. Q.submit([&](handler &CGH) { auto In = InBuf.template get_access(CGH); - accessor - Out(OutBuf, CGH); - CGH.parallel_for(NDRange, ONEAPI::reduction(Out, Identity, BOp), - [=](nd_item<1> NDIt, auto &Sum) { - if (NDIt.get_global_linear_id() < NWorkItems) - Sum.combine( - In[NDIt.get_global_linear_id()]); - }); + CGH.parallel_for(NDRange, sycl::reduction(OutBuf, CGH, Identity, BOp), + [=](nd_item<1> NDIt, auto &Sum) { + if (NDIt.get_global_linear_id() < NWorkItems) + Sum.combine(In[NDIt.get_global_linear_id()]); + }); }); // Check correctness. auto Out = OutBuf.template get_access(); T ComputedOut = *(Out.get_pointer()); - if (ComputedOut != CorrectOut) { - std::cout << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } - std::cout << "Test case passed\n\n"; + return checkResults(Q, IsSYCL2020, BOp, NDRange, ComputedOut, CorrectOut); } template struct BigCustomVec : public CustomVec { @@ -99,11 +87,13 @@ template struct BigCustomVecPlus { int main() { queue Q; - test>(Q, getMinimumFPValue()); + printDeviceInfo(Q); + int NumErrors = test>( + Q, getMinimumFPValue()); using BCV = BigCustomVec; - test>(Q, BCV(0)); + NumErrors += test>(Q, BCV(0)); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_nd_N_vars.cpp b/SYCL/Reduction/reduction_nd_N_vars.cpp index 1fc8cfadc7..0f4b4ead79 100644 --- a/SYCL/Reduction/reduction_nd_N_vars.cpp +++ b/SYCL/Reduction/reduction_nd_N_vars.cpp @@ -4,23 +4,13 @@ // RUN: %ACC_RUN_PLACEHOLDER %t.out // TODO: The test irregularly reports incorrect. -// UNSUPPORTED: TEMPORARY_DISABLED +// REQUIRES: TEMPORARY_DISABLED // This test checks handling of parallel_for() accepting nd_range and // two or more reductions. #include "reduction_utils.hpp" -#include - -#include -#include -#include -#include -#include -#include -#include - using namespace cl::sycl; template class KNameGroup; @@ -29,22 +19,15 @@ template class KName; constexpr access::mode RW = access::mode::read_write; constexpr access::mode DW = access::mode::discard_write; -template -bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed, - bool IsSYCL2020) { - bool Success; - if (!std::is_floating_point::value) - Success = (Expected == Computed); - else - Success = std::abs((Expected / Computed) - 1) < 0.5; - - if (!Success) { - std::cerr << "Is SYCL2020 mode: " << IsSYCL2020 << std::endl; - std::cerr << TestCaseNum << ": Expected value = " << Expected - << ", Computed value = " << Computed << "\n"; - } - - return Success; +template +void printNVarsTestLabel(bool IsSYCL2020, const RangeT &Range, + bool ToCERR = false) { + std::ostream &OS = ToCERR ? std::cerr : std::cout; + std::string Mode = IsSYCL2020 ? "SYCL2020" : "ONEAPI "; + OS << (ToCERR ? "Error" : "Start") << ": Mode=" << Mode + << ", Range=" << Range; + if (!ToCERR) + OS << std::endl; } // Returns 0 if the test case passed. Otherwise, some non-zero value. @@ -58,6 +41,10 @@ int testOne(queue &Q, T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1, T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3, T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4, usm::alloc AllocType4, size_t NWorkItems, size_t WGSize) { + + auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}}; + printNVarsTestLabel<>(IsSYCL2020, NDR); + buffer InBuf1(NWorkItems); buffer InBuf2(NWorkItems); buffer InBuf3(NWorkItems); @@ -120,64 +107,33 @@ int testOne(queue &Q, T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1, } } - auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}}; - if constexpr (IsSYCL2020) { - Q.submit([&](handler &CGH) { - auto In1 = InBuf1.template get_access(CGH); - auto In2 = InBuf2.template get_access(CGH); - auto In3 = InBuf3.template get_access(CGH); - auto In4 = InBuf4.template get_access(CGH); - - auto Redu1 = sycl::reduction(OutBuf1, CGH, IdentityVal1, BOp1, - getPropertyList()); - auto Redu2 = sycl::reduction(OutBuf2, CGH, IdentityVal2, BOp2, - getPropertyList()); - auto Redu3 = sycl::reduction(OutBuf3, CGH, IdentityVal3, BOp3, - getPropertyList()); - auto Redu4 = - sycl::reduction(Out4, IdentityVal4, BOp4, getPropertyList()); - - auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3, - auto &Sum4) { - size_t I = NDIt.get_global_id(0); - Sum1.combine(In1[I]); - Sum2.combine(In2[I]); - Sum3.combine(In3[I]); - Sum4.combine(In4[I]); - }; - CGH.parallel_for(NDR, Redu1, Redu2, Redu3, Redu4, Lambda); - }).wait(); - } else { - // Test ONEAPI reductions - Q.submit([&](handler &CGH) { - auto In1 = InBuf1.template get_access(CGH); - auto In2 = InBuf2.template get_access(CGH); - auto In3 = InBuf3.template get_access(CGH); - auto In4 = InBuf4.template get_access(CGH); - - auto Out1 = OutBuf1.template get_access(CGH); - auto Out2 = OutBuf2.template get_access(CGH); - accessor Out3(OutBuf3, CGH); - - auto Redu1 = ONEAPI::reduction(Out1, IdentityVal1, BOp1); - auto Redu2 = ONEAPI::reduction(Out2, IdentityVal2, BOp2); - auto Redu3 = ONEAPI::reduction(Out3, IdentityVal3, BOp3); - auto Redu4 = ONEAPI::reduction(Out4, IdentityVal4, BOp4); - - auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3, - auto &Sum4) { - size_t I = NDIt.get_global_id(0); - Sum1.combine(In1[I]); - Sum2.combine(In2[I]); - Sum3.combine(In3[I]); - Sum4.combine(In4[I]); - }; - CGH.parallel_for(NDR, Redu1, Redu2, Redu3, Redu4, Lambda); - }).wait(); - } + Q.submit([&](handler &CGH) { + auto In1 = InBuf1.template get_access(CGH); + auto In2 = InBuf2.template get_access(CGH); + auto In3 = InBuf3.template get_access(CGH); + auto In4 = InBuf4.template get_access(CGH); + + auto Redu1 = + createReduction(OutBuf1, CGH, IdentityVal1, BOp1); + auto Redu2 = + createReduction(OutBuf2, CGH, IdentityVal2, BOp2); + auto Redu3 = + createReduction(OutBuf3, CGH, IdentityVal3, BOp3); + auto Redu4 = createReduction(Out4, IdentityVal4, BOp4); + + auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3, + auto &Sum4) { + size_t I = NDIt.get_global_id(0); + Sum1.combine(In1[I]); + Sum2.combine(In2[I]); + Sum3.combine(In3[I]); + Sum4.combine(In4[I]); + }; + CGH.parallel_for(NDR, Redu1, Redu2, Redu3, Redu4, Lambda); + }).wait(); // Check the results and free memory. - int Error = 0; + int NumErrors = 0; { auto Out1 = OutBuf1.template get_access(); auto Out2 = OutBuf2.template get_access(); @@ -195,22 +151,23 @@ int testOne(queue &Q, T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1, Out4Val = *Out4; } - Error += cherkResultIsExpected(1, CorrectOut1, Out1[0], IsSYCL2020) ? 0 : 1; - Error += cherkResultIsExpected(2, CorrectOut2, Out2[0], IsSYCL2020) ? 0 : 1; - Error += cherkResultIsExpected(3, CorrectOut3, Out3[0], IsSYCL2020) ? 0 : 1; - Error += cherkResultIsExpected(4, CorrectOut4, Out4Val, IsSYCL2020) ? 0 : 1; + std::string AddInfo = "TestCase="; + NumErrors += checkResults(Q, IsSYCL2020, BOp1, NDR, Out1[0], CorrectOut1, + AddInfo + std::to_string(1)); + NumErrors += checkResults(Q, IsSYCL2020, BOp2, NDR, Out2[0], CorrectOut2, + AddInfo + std::to_string(2)); + NumErrors += checkResults(Q, IsSYCL2020, BOp3, NDR, Out3[0], CorrectOut3, + AddInfo + std::to_string(3)); + NumErrors += checkResults(Q, IsSYCL2020, BOp4, NDR, Out4Val, CorrectOut4, + AddInfo + std::to_string(4)); free(Out4, Q.get_context()); } - if (Error) - std::cerr << "The test failed for nd_range(" << NWorkItems << "," << WGSize - << ")\n\n"; - - return Error; + return NumErrors; } // Tests both implementations of reduction: -// sycl::reduction and sycl::ONEAPI::reduction +// sycl::reduction and sycl::ext::oneapi::reduction template ( - Q, 0, 1000, std::plus{}, 0, 2000, std::plus<>{}, 0, 4000, + Q, 0, 1000, std::plus<>{}, 0, 2000, std::plus<>{}, 0, 4000, std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16, 16); auto Add = [](auto x, auto y) { return (x + y); }; Error += testBoth( - Q, 0, 1000, std::plus{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0, - 8000, std::plus<>{}, usm::alloc::device, 5 * (256 + 1), 5); + Q, 0, 1000, std::plus<>{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0, 8000, + std::plus<>{}, usm::alloc::device, 5 * (256 + 1), 5); - if (!Error) - std::cout << "Test passed\n"; - else - std::cerr << Error << " test-cases failed\n"; + printFinalStatus(Error); return Error; } diff --git a/SYCL/Reduction/reduction_nd_conditional.cpp b/SYCL/Reduction/reduction_nd_conditional.cpp index 48c2cf56f3..d1bf4c0f05 100644 --- a/SYCL/Reduction/reduction_nd_conditional.cpp +++ b/SYCL/Reduction/reduction_nd_conditional.cpp @@ -7,8 +7,7 @@ // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reduction and conditional increment of the reduction variable. -#include -#include +#include "reduction_utils.hpp" using namespace cl::sycl; @@ -18,10 +17,7 @@ void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, ExpectedOut = Identity; auto In = InBuf.template get_access(); for (int I = 0; I < N; ++I) { - if (std::is_same>::value) - In[I] = 1 + (((I % 37) == 0) ? 1 : 0); - else - In[I] = I + 1 + 1.1; + In[I] = I + 1; if (I < 2) ExpectedOut = BOp(ExpectedOut, 99); @@ -32,79 +28,51 @@ void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, } }; -template struct Vec { - Vec() : X(0), Y(0) {} - Vec(T X, T Y) : X(X), Y(Y) {} - Vec(T V) : X(V), Y(V) {} - bool operator==(const Vec &P) const { return P.X == X && P.Y == Y; } - bool operator!=(const Vec &P) const { return !(*this == P); } - T X; - T Y; -}; -template bool operator==(const Vec &A, const Vec &B) { - return A.X == B.X && A.Y == B.Y; -} -template -std::ostream &operator<<(std::ostream &OS, const Vec &P) { - return OS << "(" << P.X << ", " << P.Y << ")"; -} - -template struct VecPlus { - using P = Vec; - P operator()(const P &A, const P &B) const { return P(A.X + B.X, A.Y + B.Y); } -}; - -template -void test(queue &Q, T Identity, size_t WGSize, size_t NWItems) { +template +int test(queue &Q, T Identity, size_t WGSize, size_t NWItems) { buffer InBuf(NWItems); buffer OutBuf(1); + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + printTestLabel(true /*SYCL2020*/, NDRange); + // Initialize. BinaryOperation BOp; T CorrectOut; initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + (OutBuf.template get_access())[0] = Identity; // Compute. Q.submit([&](handler &CGH) { auto In = InBuf.template get_access(CGH); - accessor - Out(OutBuf, CGH); - auto Redu = ONEAPI::reduction(Out, Identity, BOp); - - range<1> GlobalRange(NWItems); - range<1> LocalRange(WGSize); - nd_range<1> NDRange(GlobalRange, LocalRange); - CGH.parallel_for( - NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - size_t I = NDIt.get_global_linear_id(); - if (I < 2) - Sum.combine(T(99)); - else if (I % 3) - Sum.combine(In[I]); - else - ; // do nothing. - }); + auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp); + CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + size_t I = NDIt.get_global_linear_id(); + if (I < 2) + Sum.combine(T(99)); + else if (I % 3) + Sum.combine(In[I]); + else + ; // do nothing. + }); }); // Check correctness. auto Out = OutBuf.template get_access(); T ComputedOut = *(Out.get_pointer()); - if (ComputedOut != CorrectOut) { - std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cout << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } + return checkResults(Q, true /*SYCL2020*/, BOp, NDRange, ComputedOut, + CorrectOut); } int main() { queue Q; - test>(Q, 0, 2, 2); - test>(Q, 0, 7, 7); - test>(Q, 0, 2, 64); - test>(Q, 0, 16, 256); + printDeviceInfo(Q); + + int NumErrors = 0; + NumErrors += test>(Q, 0, 2, 2); + NumErrors += test>(Q, 0, 7, 7); + NumErrors += test>(Q, 0, 1, 1025); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_nd_ext_double.cpp b/SYCL/Reduction/reduction_nd_ext_double.cpp index aacbce14cb..88b375ca16 100644 --- a/SYCL/Reduction/reduction_nd_ext_double.cpp +++ b/SYCL/Reduction/reduction_nd_ext_double.cpp @@ -11,4 +11,4 @@ #include "reduction_nd_ext_type.hpp" -int main() { return runTests("cl_khr_fp64"); } +int main() { return runTests(sycl::aspect::fp64); } diff --git a/SYCL/Reduction/reduction_nd_ext_half.cpp b/SYCL/Reduction/reduction_nd_ext_half.cpp index 7bb9f6f543..e0e88c05ca 100644 --- a/SYCL/Reduction/reduction_nd_ext_half.cpp +++ b/SYCL/Reduction/reduction_nd_ext_half.cpp @@ -16,4 +16,4 @@ #include "reduction_nd_ext_type.hpp" -int main() { return runTests("cl_khr_fp16"); } +int main() { return runTests(sycl::aspect::fp16); } diff --git a/SYCL/Reduction/reduction_nd_ext_type.hpp b/SYCL/Reduction/reduction_nd_ext_type.hpp index 0ee925a615..f30768c428 100644 --- a/SYCL/Reduction/reduction_nd_ext_type.hpp +++ b/SYCL/Reduction/reduction_nd_ext_type.hpp @@ -2,103 +2,47 @@ // with types that may require additional runtime checks for extensions // supported by the device, e.g. 'half' or 'double' -#include "reduction_utils.hpp" -#include -#include +#include "reduction_nd_range_scalar.hpp" using namespace cl::sycl; -template class KName; -constexpr access::mode RW = access::mode::read_write; -constexpr access::mode DW = access::mode::discard_write; +int NumErrors = 0; -template -void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - buffer InBuf(NWItems); - buffer OutBuf(1); - - // Initialize. - BinaryOperation BOp; - T CorrectOut; - initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); - if (Mode == access::mode::read_write) - CorrectOut = BOp(CorrectOut, Init); - - (OutBuf.template get_access())[0] = Init; - - // Compute. +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize, + size_t NWItems) { nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); - if constexpr (IsSYCL2020Mode) { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - auto Redu = - sycl::reduction(OutBuf, CGH, Identity, BOp, getPropertyList()); - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } else { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - accessor Out(OutBuf, CGH); - auto Redu = ONEAPI::reduction(Out, Identity, BOp); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } - - // Check correctness. - auto Out = OutBuf.template get_access(); - T ComputedOut = *(Out.get_pointer()); - T MaxDiff = 4 * std::numeric_limits::epsilon() * - std::fabs(ComputedOut + CorrectOut); - T CompDiff = std::fabs(static_cast(ComputedOut - CorrectOut)); - if (CompDiff > MaxDiff) { - std::cerr << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cerr << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << ", MaxDiff = " << MaxDiff - << ", ComputedDiff = " << CompDiff << std::endl; - assert(0 && "Wrong value."); - } -} - -template -void testBoth(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - test, false, T, Dim, Mode, BinaryOperation>( - Q, Identity, Init, WGSize, NWItems); - - test, true, T, Dim, Mode, BinaryOperation>( - Q, Identity, Init, WGSize, NWItems); + NumErrors += testBoth(Q, Identity, Init, BOp, NDRange); } -template int runTests(const string_class &ExtensionName) { +template int runTests(sycl::aspect ExtAspect) { queue Q; + printDeviceInfo(Q); device D = Q.get_device(); - if (!D.is_host() && !D.has_extension(ExtensionName)) { + if (!D.is_host() && !D.has(ExtAspect)) { std::cout << "Test skipped\n"; return 0; } - testBoth>(Q, 1, 77, 4, 4); + constexpr access::mode RW = access::mode::read_write; + constexpr access::mode DW = access::mode::discard_write; + + tests(Q, 1, 77, std::multiplies{}, 4, 4); + tests(Q, 1, 77, std::multiplies{}, 4, 8); + + tests(Q, 0, 77, std::plus{}, 4, 32); + tests(Q, 0, 33, std::plus{}, 3, 3 * 5); - testBoth>(Q, 0, 77, 4, 64); - testBoth>(Q, 0, 33, 3, 3 * 5); + tests(Q, getMaximumFPValue(), -10.0, + ext::oneapi::minimum{}, 7, 7 * 512); + tests(Q, getMaximumFPValue(), 99.0, + ext::oneapi::minimum{}, 7, 7); - testBoth>(Q, getMaximumFPValue(), - -10.0, 7, 7); - testBoth>(Q, getMaximumFPValue(), - 99.0, 7, 7); - testBoth>(Q, getMaximumFPValue(), - -99.0, 3, 3); + tests(Q, getMinimumFPValue(), 99.0, + ext::oneapi::maximum<>{}, 3, 3); + tests(Q, getMinimumFPValue(), 99.0, + ext::oneapi::maximum<>{}, 7, 7 * 5); - testBoth>(Q, getMinimumFPValue(), - 99.0, 3, 3); - testBoth>(Q, getMinimumFPValue(), - 99.0, 7, 7 * 5); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_nd_lambda.cpp b/SYCL/Reduction/reduction_nd_lambda.cpp index 0fe7b29779..334385b65b 100644 --- a/SYCL/Reduction/reduction_nd_lambda.cpp +++ b/SYCL/Reduction/reduction_nd_lambda.cpp @@ -6,72 +6,42 @@ // This test performs basic checks of parallel_for(nd_range, reduction, lambda) -#include "reduction_utils.hpp" -#include -#include +#include "reduction_nd_range_scalar.hpp" using namespace cl::sycl; -// Note that this function is created only to test that if the accessor -// object passed to ONEAPI::reduction is destroyed right after -// ONEAPI::reduction creation, then the reduction still works properly, -// i.e. it holds a COPY of user's accessor. -template -auto createReduction(sycl::buffer Buffer, handler &CGH, T Identity, - BOpT BOp) { - auto Acc = Buffer.template get_access(CGH); - return ONEAPI::reduction(Acc, Identity, BOp); -} - -template -void test(queue &Q, T Identity, BinaryOperation BOp, size_t WGSize, - size_t NWItems) { - buffer InBuf(NWItems); - buffer OutBuf(1); - - // Initialize. - T CorrectOut; - initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); - - // Compute. - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - auto Redu = createReduction(OutBuf, CGH, Identity, BOp); +template class MName; - nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); - CGH.parallel_for(NDRange, Redu, - [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); +int NumErrors = 0; - // Check correctness. - auto Out = OutBuf.template get_access(); - T ComputedOut = *(Out.get_pointer()); - if (ComputedOut != CorrectOut) { - std::cerr << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cerr << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } +template +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize, + size_t NWItems) { + constexpr access::mode DW = access::mode::discard_write; + constexpr access::mode RW = access::mode::read_write; + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + NumErrors += testBoth, DW>(Q, Identity, Init, BOp, NDRange); + NumErrors += testBoth, RW>(Q, Identity, Init, BOp, NDRange); } int main() { queue Q; - test( - Q, 0, [](auto x, auto y) { return (x + y); }, 1, 1024); - test( - Q, 0, [](auto x, auto y) { return (x * y); }, 8, 32); + printDeviceInfo(Q); + tests( + Q, 0, 9, [](auto x, auto y) { return (x + y); }, 1, 1024); + tests( + Q, 1, 2, [](auto x, auto y) { return (x * y); }, 8, 16); // Check with CUSTOM type. - test>( - Q, CustomVec(0), - [](auto x, auto y) { + using CV = CustomVec; + tests( + Q, CV(0), CV(2021), + [](auto X, auto Y) { CustomVecPlus BOp; - return BOp(x, y); + return BOp(X, Y); }, 4, 64); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_nd_range_scalar.hpp b/SYCL/Reduction/reduction_nd_range_scalar.hpp new file mode 100644 index 0000000000..cdfdc897dc --- /dev/null +++ b/SYCL/Reduction/reduction_nd_range_scalar.hpp @@ -0,0 +1,59 @@ +// This test performs basic checks of parallel_for(range, reduction, func) +// with reductions initialized with 1-dimensional buffer/accessor +// accessing a scalar holding the reduction result. + +#include "reduction_utils.hpp" + +using namespace cl::sycl; + +template class KName; + +template +int test(queue &Q, T Identity, T Init, BinaryOperation BOp, + const nd_range &Range) { + printTestLabel(IsSYCL2020, Range); + + // Skip the test for such big arrays now. + constexpr size_t TwoGB = 2LL * 1024 * 1024 * 1024; + range GlobalRange = Range.get_global_range(); + if (GlobalRange.size() > TwoGB) + return 0; + + buffer InBuf(GlobalRange); + buffer OutBuf(1); + + // Initialize. + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, GlobalRange); + if constexpr (Mode == access::mode::read_write) { + CorrectOut = BOp(CorrectOut, Init); + } + + // The value assigned here must be discarded (if IsReadWrite is true). + // Verify that it is really discarded and assign some value. + (OutBuf.template get_access())[0] = Init; + + // Compute. + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + auto Redu = + createReduction(OutBuf, CGH, Identity, BOp); + CGH.parallel_for(Range, Redu, [=](nd_item NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + return checkResults(Q, IsSYCL2020, BOp, Range, ComputedOut, CorrectOut); +} + +template +int testBoth(queue &Q, T Identity, T Init, BinaryOperation BOp, + const nd_range &Range) { + return test, false, Mode>(Q, Identity, Init, BOp, Range) + + test, true, Mode>(Q, Identity, Init, BOp, Range); +} diff --git a/SYCL/Reduction/reduction_nd_s0_dw.cpp b/SYCL/Reduction/reduction_nd_s0_dw.cpp index 43ce0f6566..6dda0ca243 100644 --- a/SYCL/Reduction/reduction_nd_s0_dw.cpp +++ b/SYCL/Reduction/reduction_nd_s0_dw.cpp @@ -6,104 +6,49 @@ // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with 0-dimensional discard_write accessor. -#include "reduction_utils.hpp" -#include -#include +#include "reduction_nd_range_scalar.hpp" using namespace cl::sycl; -// This allocator is needed only for the purpose of testing buffers -// with allocator that is not same_as sycl::buffer_allocator. -struct CustomAllocator : public sycl::buffer_allocator {}; - -template class KName; - -template -void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - buffer InBuf(NWItems); - buffer OutBuf(1); - - // Initialize. - BinaryOperation BOp; - T CorrectOut; - initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); - - // This assignment is discarded, but is still needed to verify - // that is discarded. - (OutBuf.template get_access())[0] = Init; - - // Compute. - - nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); - if constexpr (IsSYCL2020Mode) { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - property_list PropList(property::reduction::initialize_to_identity{}); - auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp, PropList); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } else { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - accessor - Out(OutBuf, CGH); - auto Redu = ONEAPI::reduction(Out, Identity, BOp); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } - - // Check correctness. - auto Out = OutBuf.template get_access(); - T ComputedOut = *(Out.get_pointer()); - if (ComputedOut != CorrectOut) { - std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cout << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } -} +int NumErrors = 0; template -void testBoth(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - test, false, T, BinaryOperation>(Q, Identity, Init, WGSize, - NWItems); - test, true, T, BinaryOperation>(Q, Identity, Init, WGSize, - NWItems); +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize, + size_t NWItems) { + constexpr access::mode DW = access::mode::discard_write; + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + NumErrors += + test(Q, Identity, Init, BOp, NDRange); } int main() { queue Q; + printDeviceInfo(Q); // Check some non power-of-two work-group sizes. - testBoth>(Q, 0, 99, 1, 7); - testBoth>(Q, 0, 99, 49, 49 * 5); + tests(Q, 0, 99, std::plus{}, 1, 7); + tests(Q, 0, 99, std::plus{}, 49, 49 * 5); // Try some power-of-two work-group sizes. - testBoth>(Q, 0, 99, 1, 32); - testBoth>(Q, 1, 99, 4, 32); - testBoth>(Q, 0, 99, 16, 256); - testBoth>(Q, ~0, 99, 32, 256); - testBoth>( - Q, (std::numeric_limits::max)(), -99, 64, 256); - testBoth>( - Q, (std::numeric_limits::min)(), 99, 128, 256); - testBoth>(Q, 0, 99, 256, 256); + tests(Q, 0, 99, std::plus<>{}, 1, 32); + tests(Q, 1, 99, std::multiplies<>{}, 4, 32); + tests(Q, 0, 99, std::bit_xor<>{}, 16, 256); + tests(Q, ~0, 99, std::bit_and<>{}, 32, 256); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, 64, 256); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 128, 256); + tests(Q, 0, 99, std::plus<>{}, 256, 256); // Check with various types. - testBoth>(Q, 1, 99, 8, 256); - testBoth>(Q, 0x7fff, -99, 8, 256); - testBoth>(Q, 0, 99, 8, 256); + tests(Q, 1, 99, std::multiplies<>{}, 8, 16); + tests(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256); + tests(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256); // Check with CUSTOM type. - testBoth, CustomVecPlus>( - Q, CustomVec(0), CustomVec(99), 8, 256); + using CV = CustomVec; + tests(Q, CV(0), CV(99), CustomVecPlus{}, 8, 256); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_nd_s0_rw.cpp b/SYCL/Reduction/reduction_nd_s0_rw.cpp index a5983f019f..a8b75b06a1 100644 --- a/SYCL/Reduction/reduction_nd_s0_rw.cpp +++ b/SYCL/Reduction/reduction_nd_s0_rw.cpp @@ -6,104 +6,50 @@ // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with 0-dimensional read_write accessor. -#include "reduction_utils.hpp" -#include -#include +#include "reduction_nd_range_scalar.hpp" using namespace cl::sycl; -// This allocator is needed only for the purpose of testing buffers -// with allocator that is not same_as sycl::buffer_allocator. -struct CustomAllocator : public sycl::buffer_allocator {}; - -template class KName; - -template -void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - buffer InBuf(NWItems); - buffer OutBuf(1); - - // Initialize. - BinaryOperation BOp; - T CorrectOut; - initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); - - // The final reduction sum after running parallel_for() must include - // the original value it was initialized with before the parallel_for(). - CorrectOut = BOp(CorrectOut, Init); - (OutBuf.template get_access())[0] = Init; - - // Compute. - nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); - if constexpr (IsSYCL2020Mode) { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } else { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - accessor - Out(OutBuf, CGH); - auto Redu = ONEAPI::reduction(Out, Identity, BOp); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } - - // Check correctness. - auto Out = OutBuf.template get_access(); - T ComputedOut = *(Out.get_pointer()); - if (ComputedOut != CorrectOut) { - std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cout << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } -} +int NumErrors = 0; template -void testBoth(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - test, false, T, BinaryOperation>(Q, Identity, Init, WGSize, - NWItems); - test, true, T, BinaryOperation>(Q, Identity, Init, WGSize, - NWItems); +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize, + size_t NWItems) { + constexpr access::mode RW = access::mode::read_write; + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + NumErrors += + test(Q, Identity, Init, BOp, NDRange); } int main() { queue Q; + printDeviceInfo(Q); // Check non power-of-two work-group sizes. - testBoth>(Q, 0, 99, 1, 7); - testBoth>(Q, 0, -99, 49, 49 * 5); + tests(Q, 0, 99, std::plus{}, 1, 7); + tests(Q, 0, -99, std::plus{}, 49, 49 * 5); // Try some power-of-two work-group sizes. - testBoth>(Q, 0, 99, 2, 32); - testBoth>(Q, 0, 199, 32, 32); - testBoth>(Q, 0, 299, 128, 256); - testBoth>(Q, 0, 399, 256, 256); + tests(Q, 0, 99, std::plus<>{}, 2, 32); + tests(Q, 0, 199, std::plus<>{}, 32, 32); + tests(Q, 0, 299, std::plus<>{}, 128, 256); + tests(Q, 0, 399, std::plus<>{}, 256, 256); // Check with various operations and types. - testBoth>(Q, 1, 2, 8, 256); - testBoth>(Q, 1, 1.2, 8, 32); - testBoth>(Q, 0, 0x3400, 4, 32); - testBoth>(Q, 0, 0x12340000, 4, 32); - testBoth>(Q, ~0, ~0, 4, 16); - testBoth>( - Q, (std::numeric_limits::max)(), -99, 8, 256); - testBoth>( - Q, (std::numeric_limits::min)(), 99, 8, 256); + tests(Q, 1, 2, std::multiplies<>{}, 8, 8); + tests(Q, 1, 1.2, std::multiplies<>{}, 8, 16); + tests(Q, 0, 0x3400, std::bit_or<>{}, 4, 32); + tests(Q, 0, 0x12340000, std::bit_xor<>{}, 4, 32); + tests(Q, ~0, ~0, std::bit_and<>{}, 4, 16); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, 8, 256); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 8, 256); // Check with CUSTOM type. - testBoth, CustomVecPlus>( - Q, CustomVec(0), CustomVec(199), 8, 256); + using CV = CustomVec; + tests(Q, CV(0), CV(99), CustomVecPlus{}, 8, 256); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_nd_s1_dw.cpp b/SYCL/Reduction/reduction_nd_s1_dw.cpp index 8745d054e3..3afbfb141b 100644 --- a/SYCL/Reduction/reduction_nd_s1_dw.cpp +++ b/SYCL/Reduction/reduction_nd_s1_dw.cpp @@ -8,100 +8,48 @@ // with reductions initialized with 1-dimensional discard_write accessor // accessing 1 element buffer. -#include "reduction_utils.hpp" -#include -#include +#include "reduction_nd_range_scalar.hpp" using namespace cl::sycl; -template class KName; - -template -void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - buffer InBuf(NWItems); - buffer OutBuf(1); - - // Initialize. - BinaryOperation BOp; - T CorrectOut; - initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); - - // This assignment is discarded, but is still needed to verify - // that is discarded. - (OutBuf.template get_access())[0] = Init; - - // Compute. - nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); - if constexpr (IsSYCL2020Mode) { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - property_list PropList(property::reduction::initialize_to_identity{}); - auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp, PropList); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } else { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - accessor - Out(OutBuf, CGH); - auto Redu = ONEAPI::reduction(Out, Identity, BOp); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } - - // Check correctness. - auto Out = OutBuf.template get_access(); - T ComputedOut = *(Out.get_pointer()); - if (ComputedOut != CorrectOut) { - std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cout << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } -} +int NumErrors = 0; template -void testBoth(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - test, false, T, BinaryOperation>(Q, Identity, Init, WGSize, - NWItems); - test, true, T, BinaryOperation>(Q, Identity, Init, WGSize, - NWItems); +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize, + size_t NWItems) { + constexpr access::mode DW = access::mode::discard_write; + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + NumErrors += testBoth(Q, Identity, Init, BOp, NDRange); } int main() { queue Q; + printDeviceInfo(Q); // Check some non power-of-two work-group sizes. - testBoth>(Q, 0, 99, 1, 7); - testBoth>(Q, 0, 99, 49, 49 * 5); + tests(Q, 0, 99, std::plus{}, 1, 7); + tests(Q, 0, 99, std::plus{}, 49, 49 * 5); // Try some power-of-two work-group sizes. - testBoth>(Q, 0, 99, 1, 32); - testBoth>(Q, 1, 99, 4, 64); - testBoth>(Q, 0, 99, 8, 128); - testBoth>(Q, 0, 99, 16, 256); - testBoth>(Q, ~0, 99, 32, 256); - testBoth>( - Q, (std::numeric_limits::max)(), -99, 64, 256); - testBoth>( - Q, (std::numeric_limits::min)(), 99, 128, 256); - testBoth>(Q, 0, 99, 256, 256); + tests(Q, 0, 99, std::plus<>{}, 1, 32); + tests(Q, 0, 99, std::bit_or<>{}, 8, 128); + tests(Q, 0, 99, std::bit_xor<>{}, 16, 256); + tests(Q, ~0, 99, std::bit_and<>{}, 32, 256); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, 64, 256); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 128, 256); + tests(Q, 0, 99, std::plus<>{}, 256, 256); // Check with various types. - testBoth>(Q, 1, 99, 8, 256); - testBoth>(Q, 0x7fff, -99, 8, 256); - testBoth>(Q, 0, 99, 8, 256); + tests(Q, 1, 99, std::multiplies<>{}, 8, 24); + tests(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256); + tests(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256); // Check with CUSTOM type. - testBoth, CustomVecPlus>( - Q, CustomVec(0), CustomVec(99), 8, 256); + using CV = CustomVec; + tests(Q, CV(0), CV(99), CustomVecPlus{}, 8, 256); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_nd_s1_rw.cpp b/SYCL/Reduction/reduction_nd_s1_rw.cpp index 2a40149555..654da943ec 100644 --- a/SYCL/Reduction/reduction_nd_s1_rw.cpp +++ b/SYCL/Reduction/reduction_nd_s1_rw.cpp @@ -7,104 +7,43 @@ // with reductions initialized with 1-dimensional read_write accessor // accessing 1 element buffer. -#include "reduction_utils.hpp" -#include -#include +#include "reduction_nd_range_scalar.hpp" using namespace cl::sycl; -// This allocator is needed only for the purpose of testing buffers -// with allocator that is not same_as sycl::buffer_allocator. -struct CustomAllocator : public sycl::buffer_allocator {}; - -template class KName; - -template -void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - buffer InBuf(NWItems); - buffer OutBuf(1); - - // Initialize. - BinaryOperation BOp; - T CorrectOut; - initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); - - // The final reduction sum after running parallel_for() must include - // the original value it was initialized with before the parallel_for(). - CorrectOut = BOp(CorrectOut, Init); - (OutBuf.template get_access())[0] = Init; - - // Compute. - nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); - if constexpr (IsSYCL2020Mode) { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } else { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - accessor - Out(OutBuf, CGH); - auto Redu = ONEAPI::reduction(Out, Identity, BOp); - - CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); - }); - } - - // Check correctness. - auto Out = OutBuf.template get_access(); - T ComputedOut = *(Out.get_pointer()); - if (ComputedOut != CorrectOut) { - std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cout << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } -} +int NumErrors = 0; template -void testBoth(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - test, false, T, BinaryOperation>(Q, Identity, Init, WGSize, - NWItems); - test, true, T, BinaryOperation>(Q, Identity, Init, WGSize, - NWItems); +void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize, + size_t NWItems) { + constexpr access::mode RW = access::mode::read_write; + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + NumErrors += testBoth(Q, Identity, Init, BOp, NDRange); } int main() { queue Q; + printDeviceInfo(Q); // Check non power-of-two work-group sizes. - testBoth>(Q, 0, 99, 1, 7); - testBoth>(Q, 0, -99, 49, 49 * 5); - - // Try some power-of-two work-group sizes. - testBoth>(Q, 0, 99, 2, 32); - testBoth>(Q, 0, 199, 32, 128); - testBoth>(Q, 0, 299, 128, 128); - testBoth>(Q, 0, 399, 256, 256); + tests(Q, 0, 99, std::plus{}, 1, 7); + tests(Q, 0, -99, std::plus{}, 49, 49 * 5); // Check with various operations and types. - testBoth>(Q, 1, 2, 8, 256); - testBoth>(Q, 1, 1.2, 8, 16); - testBoth>(Q, 0, 0x3400, 4, 32); - testBoth>(Q, 0, 0x12340000, 2, 16); - testBoth>(Q, ~0, ~0, 4, 16); - testBoth>( - Q, (std::numeric_limits::max)(), 99, 8, 256); - testBoth>( - Q, (std::numeric_limits::min)(), -99, 8, 256); + tests(Q, 0, 99, std::plus<>{}, 1, 32); + tests(Q, 0, 99, std::bit_or<>{}, 8, 128); + tests(Q, 0, 99, std::bit_xor<>{}, 16, 256); + tests(Q, ~0, 99, std::bit_and<>{}, 32, 256); + tests(Q, (std::numeric_limits::max)(), -99, + ext::oneapi::minimum<>{}, 64, 256); + tests(Q, (std::numeric_limits::min)(), 99, + ext::oneapi::maximum<>{}, 128, 256); + tests(Q, 0, 99, std::plus<>{}, 256, 256); // Check with CUSTOM type. - testBoth, CustomVecPlus>( - Q, CustomVec(0), CustomVec(-199), 8, 256); + using CV = CustomVec; + tests(Q, CV(0), CV(-199), CustomVecPlus{}, 8, 256); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_placeholder.cpp b/SYCL/Reduction/reduction_placeholder.cpp index e47b040497..f71bea0255 100644 --- a/SYCL/Reduction/reduction_placeholder.cpp +++ b/SYCL/Reduction/reduction_placeholder.cpp @@ -4,23 +4,24 @@ // RUN: %ACC_RUN_PLACEHOLDER %t.out // RUNx: %HOST_RUN_PLACEHOLDER %t.out -// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and +// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and // barrier() // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with a placeholder accessor. #include "reduction_utils.hpp" -#include -#include using namespace cl::sycl; -template class KNGroup; +template class KName; + +template +int test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + printTestLabel(false, NDRange); -template -void testOneCase(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { // Initialize. T CorrectOut; BinaryOperation BOp; @@ -39,57 +40,54 @@ void testOneCase(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { Q.submit([&](handler &CGH) { auto In = InBuf.template get_access(CGH); CGH.require(Out); - auto Redu = ONEAPI::reduction(Out, Identity, BinaryOperation()); - range<1> GlobalRange(NWItems); - range<1> LocalRange(WGSize); - nd_range<1> NDRange(GlobalRange, LocalRange); - CGH.parallel_for( - NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { - Sum.combine(In[NDIt.get_global_linear_id()]); - }); + auto Redu = ext::oneapi::reduction(Out, Identity, BOp); + CGH.parallel_for(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); }); Q.wait(); // Check correctness. T ReduVar = (OutBuf.template get_access())[0]; - if (ReduVar != CorrectOut) { - std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cout << "Computed value: " << ReduVar - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } + return checkResults(Q, false /*SYCL2020*/, BOp, NDRange, ReduVar, CorrectOut); } -template -void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - testOneCase, T, Dim, BinaryOperation, - access::mode::read_write>(Q, Identity, Init, WGSize, NWItems); - testOneCase, T, Dim, BinaryOperation, - access::mode::discard_write>(Q, Identity, Init, WGSize, NWItems); +int NumErrors = 0; + +template +void tests(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { + constexpr access::mode DW = access::mode::discard_write; + constexpr access::mode RW = access::mode::read_write; + + NumErrors += test, T, 0, BinaryOperation, DW>( + Q, Identity, Init, WGSize, NWItems); + NumErrors += test, T, 1, BinaryOperation, DW>( + Q, Identity, Init, WGSize, NWItems); + + NumErrors += test, T, 0, BinaryOperation, RW>( + Q, Identity, Init, WGSize, NWItems); + NumErrors += test, T, 1, BinaryOperation, RW>( + Q, Identity, Init, WGSize, NWItems); } int main() { queue Q; + printDeviceInfo(Q); + // fast atomics and fast reduce - test>(Q, 0, 77, 49, 49 * 5); - test>(Q, 0, -77, 8, 8); + tests>(Q, 0, 77, 49, 49 * 5); // fast atomics - test>(Q, 0, 233, 7, 7 * 3); - test>(Q, 0, 177, 4, 128); + tests>(Q, 0, 233, 7, 7 * 3); // fast reduce - test>( + tests>( Q, getMaximumFPValue(), -5.0, 5, 5 * 7); - test>( - Q, getMinimumFPValue(), -5.0, 4, 128); // generic algorithm - test>(Q, 1, 2, 7, 7 * 5); - test>(Q, 1, 3, 8, 16); - test, 0, CustomVecPlus>( + tests, CustomVecPlus>( Q, CustomVec(0), CustomVec(4), 8, 8 * 3); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_queue_parallel_for.cpp b/SYCL/Reduction/reduction_queue_parallel_for.cpp index f143378fb5..ec3bfdd723 100644 --- a/SYCL/Reduction/reduction_queue_parallel_for.cpp +++ b/SYCL/Reduction/reduction_queue_parallel_for.cpp @@ -4,47 +4,38 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUNx: %HOST_RUN_PLACEHOLDER %t.out -// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and +// TODO: Enable the test for HOST when it supports ext::oneapi::reduce() and // barrier() // This test only checks that the method queue::parallel_for() accepting // reduction, can be properly translated into queue::submit + parallel_for(). -#include +#include "reduction_utils.hpp" + using namespace sycl; template class KName; -template int test(queue &Q) { - const size_t NElems = 1024; - const size_t WGSize = 256; +template +int test(queue &Q, T Identity, size_t WGSize, size_t NElems) { + nd_range<1> NDRange(range<1>{NElems}, range<1>{WGSize}); + printTestLabel(IsSYCL2020, NDRange); - int *Data = malloc_shared(NElems, Q); + T *Data = malloc_shared(NElems, Q); for (int I = 0; I < NElems; I++) Data[I] = I; - int *Sum = malloc_shared(1, Q); - *Sum = 0; - - if constexpr (IsSYCL2020Mode) { - Q.parallel_for( - nd_range<1>{NElems, WGSize}, sycl::reduction(Sum, std::plus<>()), - [=](nd_item<1> It, auto &Sum) { Sum += Data[It.get_global_id(0)]; }) - .wait(); - } else { - Q.parallel_for( - nd_range<1>{NElems, WGSize}, ONEAPI::reduction(Sum, ONEAPI::plus<>()), - [=](nd_item<1> It, auto &Sum) { Sum += Data[It.get_global_id(0)]; }) - .wait(); - } - - int ExpectedSum = (NElems - 1) * NElems / 2; - int Error = 0; - if (*Sum != ExpectedSum) { - std::cerr << "Error: Expected = " << ExpectedSum << ", Computed = " << *Sum - << std::endl; - Error = 1; - } + T *Sum = malloc_shared(1, Q); + *Sum = Identity; + + BinaryOperation BOp; + auto Redu = createReduction(Sum, BOp); + Q.parallel_for(NDRange, Redu, [=](nd_item<1> It, auto &Sum) { + Sum += Data[It.get_global_id(0)]; + }).wait(); + + T ExpectedSum = (NElems - 1) * NElems / 2; + int Error = checkResults(Q, IsSYCL2020, BOp, NDRange, *Sum, ExpectedSum); free(Data, Q); free(Sum, Q); @@ -53,7 +44,11 @@ template int test(queue &Q) { int main() { queue Q; - int Error = test, true>(Q); - Error += test, false>(Q); - return Error; + printDeviceInfo(Q); + + int NumErrors = test>(Q, 0, 16, 32); + NumErrors += test>(Q, 0, 7, 14); + + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_1d_s0_dw.cpp b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp index a000c9cfe7..e84d764ced 100644 --- a/SYCL/Reduction/reduction_range_1d_s0_dw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s0_dw.cpp @@ -14,10 +14,13 @@ using namespace cl::sycl; +int NumErrors = 0; + template void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) { constexpr access::mode DW = access::mode::discard_write; - test(Q, Identity, Init, BOp, range<1>{NWItems}); + NumErrors += + test(Q, Identity, Init, BOp, range<1>{NWItems}); } int main() { @@ -38,19 +41,17 @@ int main() { tests(Q, ~0, ~0, std::bit_and<>{}, 8); tests(Q, 0, 0x12340000, std::bit_xor<>{}, 16); tests(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4); - tests(Q, 1, 2, std::multiplies<>{}, 256); - tests(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1); - tests(Q, (std::numeric_limits::max)(), -99, + tests(Q, 1, 2, std::multiplies<>{}, 31); + tests(Q, (std::numeric_limits::max)(), -99, ext::oneapi::minimum<>{}, MaxWGSize * 2); - tests(Q, (std::numeric_limits::min)(), 99, + tests(Q, (std::numeric_limits::min)(), 99, ext::oneapi::maximum<>{}, 8); - tests(Q, 1, 99, std::multiplies<>{}, 37); // Check with CUSTOM type. using CV = CustomVec; tests(Q, CV(0), CV(99), CustomVecPlus{}, 64); tests(Q, CV(0), CV(99), CustomVecPlus{}, MaxWGSize * 3); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_1d_s0_rw.cpp b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp index 8e23e9e26a..bc1e2befed 100644 --- a/SYCL/Reduction/reduction_range_1d_s0_rw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s0_rw.cpp @@ -14,10 +14,13 @@ using namespace cl::sycl; +int NumErrors = 0; + template void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) { constexpr access::mode RW = access::mode::read_write; - test(Q, Identity, Init, BOp, range<1>{NWItems}); + NumErrors += + test(Q, Identity, Init, BOp, range<1>{NWItems}); } int main() { @@ -38,13 +41,12 @@ int main() { tests(Q, ~0, ~0, std::bit_and<>{}, 8); tests(Q, 0, 0x12340000, std::bit_xor<>{}, 16); tests(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4); - tests(Q, 1, 2, std::multiplies<>{}, 256); - tests(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1); - tests(Q, (std::numeric_limits::max)(), -99, + tests(Q, 1, 2, std::multiplies<>{}, 17); + tests(Q, (std::numeric_limits::max)(), -99, ext::oneapi::minimum<>{}, MaxWGSize * 2); - tests(Q, (std::numeric_limits::min)(), 99, + tests(Q, (std::numeric_limits::min)(), 99, ext::oneapi::maximum<>{}, 8); - tests(Q, 1, 99, std::multiplies<>{}, MaxWGSize); + tests(Q, 1, 99, std::multiplies<>{}, 16); // Check with CUSTOM type. tests(Q, CustomVec(0), CustomVec(99), @@ -52,6 +54,6 @@ int main() { tests(Q, CustomVec(0), CustomVec(99), CustomVecPlus{}, MaxWGSize * 3); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp index e2924a0050..ff5c01898e 100644 --- a/SYCL/Reduction/reduction_range_1d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s1_dw.cpp @@ -14,10 +14,12 @@ using namespace cl::sycl; +int NumErrors = 0; + template void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) { constexpr access::mode DW = access::mode::discard_write; - testBoth(Q, Identity, Init, BOp, range<1>{NWItems}); + NumErrors += testBoth(Q, Identity, Init, BOp, range<1>{NWItems}); } int main() { @@ -27,24 +29,22 @@ int main() { Q.get_device().get_info(); // Fast-reduce and Fast-atomics. Try various range types/sizes. - tests(Q, 0, 99, std::plus<>{}, 1); - tests(Q, 0, 99, std::plus<>{}, 2); - tests(Q, 0, 99, std::plus<>{}, 7); - tests(Q, 0, 99, std::plus<>{}, 64); - tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2); - tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5); + tests(Q, 0, 99, std::plus{}, 1); + tests(Q, 0, 99, std::plus<>{}, 7); + tests(Q, 0, 99, std::plus<>{}, 64); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5); // Try various types & ranges. tests(Q, ~0, 99, std::bit_and<>{}, 7); tests(Q, 0, 0xff99, std::bit_xor<>{}, MaxWGSize); tests(Q, 0, 0xff99, std::bit_or<>{}, 3); - tests(Q, 1, 3, std::multiplies<>{}, 32); - tests(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4); - tests(Q, (std::numeric_limits::max)(), -99, + tests(Q, 1, 2, std::multiplies<>{}, 7); + tests(Q, (std::numeric_limits::max)(), -99, ext::oneapi::minimum<>{}, MaxWGSize * 2); - tests(Q, (std::numeric_limits::min)(), 99, + tests(Q, (std::numeric_limits::min)(), 99, ext::oneapi::maximum<>{}, 8); - tests(Q, 1, 99, std::multiplies<>{}, MaxWGSize); + tests(Q, 1, 99, std::multiplies<>{}, 10); // Check with CUSTOM type. tests(Q, CustomVec(0), CustomVec(99), @@ -52,6 +52,6 @@ int main() { tests(Q, CustomVec(0), CustomVec(99), CustomVecPlus{}, MaxWGSize * 3); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp index 01d55587b7..bc10d647ce 100644 --- a/SYCL/Reduction/reduction_range_1d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_1d_s1_rw.cpp @@ -18,10 +18,12 @@ using namespace cl::sycl; +int NumErrors = 0; + template void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) { constexpr access::mode RW = access::mode::read_write; - testBoth(Q, Identity, Init, BOp, range<1>{NWItems}); + NumErrors += testBoth(Q, Identity, Init, BOp, range<1>{NWItems}); } int main() { @@ -32,23 +34,21 @@ int main() { // Fast-reduce and Fast-atomics. Try various range types/sizes. tests(Q, 0, 99, std::plus{}, 1); - tests(Q, 0, 99, std::plus{}, 2); - tests(Q, 0, 99, std::plus<>{}, 7); - tests(Q, 0, 99, std::plus<>{}, 64); - tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2); - tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5); + tests(Q, 0, 99, std::plus<>{}, 7); + tests(Q, 0, 99, std::plus<>{}, 64); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2); + tests(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5); // Try various types & ranges. tests(Q, ~0, ~0, std::bit_and<>{}, 8); tests(Q, 0, 0x12340000, std::bit_xor<>{}, 16); tests(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4); - tests(Q, 1, 2, std::multiplies<>{}, 256); - tests(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4); + tests(Q, 1, 2, std::multiplies<>{}, 16); + tests(Q, 1, 3, std::multiplies<>{}, 11); tests(Q, (std::numeric_limits::max)(), -99, ext::oneapi::minimum<>{}, MaxWGSize * 2); tests(Q, (std::numeric_limits::min)(), 99, ext::oneapi::maximum<>{}, 8); - tests(Q, 1, 99, std::multiplies<>{}, MaxWGSize); // Check with CUSTOM type. tests(Q, CustomVec(0), CustomVec(99), @@ -56,6 +56,6 @@ int main() { tests(Q, CustomVec(0), CustomVec(99), CustomVecPlus{}, MaxWGSize * 3); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_2d_s1_dw.cpp b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp index 0846b0ac0e..79476a1f52 100644 --- a/SYCL/Reduction/reduction_range_2d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_2d_s1_dw.cpp @@ -14,10 +14,12 @@ using namespace cl::sycl; +int NumErrors = 0; + template void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) { constexpr access::mode DW = access::mode::discard_write; - testBoth(Q, Identity, Init, BOp, Range); + NumErrors += testBoth(Q, Identity, Init, BOp, Range); } int main() { @@ -38,9 +40,8 @@ int main() { tests(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3}); tests(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3}); tests(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2}); - tests(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3}); - tests(Q, 1, 3, std::multiplies<>{}, - range<2>{3, MaxWGSize}); + tests(Q, 1, 3, std::multiplies<>{}, range<2>{8, 3}); + tests(Q, 1, 3, std::multiplies<>{}, range<2>{3, 7}); tests(Q, (std::numeric_limits::max)(), -99, ext::oneapi::minimum<>{}, range<2>{8, 3}); tests(Q, (std::numeric_limits::min)(), 99, @@ -50,6 +51,6 @@ int main() { tests(Q, CustomVec(0), CustomVec(99), CustomVecPlus{}, range<2>{33, MaxWGSize}); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_2d_s1_rw.cpp b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp index 56a5a9e7af..4dea17a2b7 100644 --- a/SYCL/Reduction/reduction_range_2d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_2d_s1_rw.cpp @@ -14,10 +14,12 @@ using namespace cl::sycl; +int NumErrors = 0; + template void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) { constexpr access::mode RW = access::mode::read_write; - testBoth(Q, Identity, Init, BOp, Range); + NumErrors += testBoth(Q, Identity, Init, BOp, Range); } int main() { @@ -38,9 +40,8 @@ int main() { tests(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3}); tests(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3}); tests(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2}); - tests(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3}); - tests(Q, 1, 3, std::multiplies<>{}, - range<2>{3, MaxWGSize}); + tests(Q, 1, 3, std::multiplies<>{}, range<2>{8, 3}); + tests(Q, 1, 3, std::multiplies<>{}, range<2>{3, 8}); tests(Q, (std::numeric_limits::max)(), -99, ext::oneapi::minimum<>{}, range<2>{8, 3}); tests(Q, (std::numeric_limits::min)(), 99, @@ -50,6 +51,6 @@ int main() { tests(Q, CustomVec(0), CustomVec(99), CustomVecPlus{}, range<2>{33, MaxWGSize}); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_3d_s1_dw.cpp b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp index d0b11a98f5..26fb3b1c50 100644 --- a/SYCL/Reduction/reduction_range_3d_s1_dw.cpp +++ b/SYCL/Reduction/reduction_range_3d_s1_dw.cpp @@ -14,10 +14,12 @@ using namespace cl::sycl; +int NumErrors = 0; + template void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<3> Range) { constexpr access::mode DW = access::mode::discard_write; - testBoth(Q, Identity, Init, BOp, Range); + NumErrors += testBoth(Q, Identity, Init, BOp, Range); } int main() { @@ -50,19 +52,17 @@ int main() { range<3>{MaxWGSize * 3, 4, 3}); tests(Q, 0, 99, std::bit_or<>{}, range<3>{2, 2, MaxWGSize * 3}); - tests(Q, 1, 3, std::multiplies<>{}, range<3>{16, 3, 5}); - tests(Q, 1, 3, std::multiplies<>{}, - range<3>{2, 3, MaxWGSize}); + tests(Q, 1, 3, std::multiplies<>{}, range<3>{2, 3, 5}); + tests(Q, 1, 2, std::multiplies<>{}, range<3>{2, 3, 8}); tests(Q, (std::numeric_limits::max)(), -99, ext::oneapi::minimum<>{}, range<3>{MaxWGSize, 8, 3}); tests(Q, (std::numeric_limits::min)(), 99, ext::oneapi::maximum<>{}, range<3>{3, MaxWGSize, 3}); - tests(Q, 1, 99, std::multiplies<>{}, - range<3>{3, 3, MaxWGSize}); + tests(Q, 1, 99, std::multiplies<>{}, range<3>{3, 3, 5}); tests(Q, CustomVec(0), CustomVec(99), CustomVecPlus{}, range<3>{2, 33, MaxWGSize}); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_3d_s1_rw.cpp b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp index 963e9b4769..b5b8ddc0d3 100644 --- a/SYCL/Reduction/reduction_range_3d_s1_rw.cpp +++ b/SYCL/Reduction/reduction_range_3d_s1_rw.cpp @@ -14,10 +14,12 @@ using namespace cl::sycl; +int NumErrors = 0; + template void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<3> Range) { constexpr access::mode RW = access::mode::read_write; - testBoth(Q, Identity, Init, BOp, Range); + NumErrors += testBoth(Q, Identity, Init, BOp, Range); } int main() { @@ -50,19 +52,17 @@ int main() { range<3>{MaxWGSize * 3, 4, 3}); tests(Q, 0, 99, std::bit_or<>{}, range<3>{2, 2, MaxWGSize * 3}); - tests(Q, 1, 3, std::multiplies<>{}, range<3>{16, 3, 5}); - tests(Q, 1, 3, std::multiplies<>{}, - range<3>{2, 3, MaxWGSize}); + tests(Q, 1, 3, std::multiplies<>{}, range<3>{2, 3, 5}); + tests(Q, 1, 3, std::multiplies<>{}, range<3>{5, 2, 3}); tests(Q, (std::numeric_limits::max)(), -99, ext::oneapi::minimum<>{}, range<3>{MaxWGSize, 8, 3}); tests(Q, (std::numeric_limits::min)(), 99, ext::oneapi::maximum<>{}, range<3>{3, MaxWGSize, 3}); - tests(Q, 1, 99, std::multiplies<>{}, - range<3>{3, 3, MaxWGSize}); + tests(Q, 1, 99, std::multiplies<>{}, range<3>{3, 3, 4}); tests(Q, CustomVec(0), CustomVec(99), CustomVecPlus{}, range<3>{2, 33, MaxWGSize}); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_lambda.cpp b/SYCL/Reduction/reduction_range_lambda.cpp index ef7a1d46be..b677055609 100644 --- a/SYCL/Reduction/reduction_range_lambda.cpp +++ b/SYCL/Reduction/reduction_range_lambda.cpp @@ -25,26 +25,34 @@ int main() { auto LambdaSum = [](auto x, auto y) { return (x + y); }; - testBoth(Q, 0, 99, LambdaSum, range<1>{7}); - testBoth(Q, 0, 99, LambdaSum, range<1>{7}); + int NumErrors = 0; - testBoth(Q, 0, 99, LambdaSum, range<1>{MaxWGSize + 1}); - testBoth(Q, 0, 99, LambdaSum, range<1>{MaxWGSize + 1}); + NumErrors += testBoth(Q, 0, 99, LambdaSum, range<1>{7}); + NumErrors += testBoth(Q, 0, 99, LambdaSum, range<1>{7}); - testBoth(Q, 0, 99, LambdaSum, range<2>{3, 4}); - testBoth(Q, 0, 99, LambdaSum, range<2>{3, 4}); + NumErrors += + testBoth(Q, 0, 99, LambdaSum, range<1>{MaxWGSize + 1}); + NumErrors += + testBoth(Q, 0, 99, LambdaSum, range<1>{MaxWGSize + 1}); - testBoth(Q, 0, 99, LambdaSum, range<2>{3, MaxWGSize + 1}); - testBoth(Q, 0, 99, LambdaSum, range<2>{3, MaxWGSize + 1}); + NumErrors += testBoth(Q, 0, 99, LambdaSum, range<2>{3, 4}); + NumErrors += testBoth(Q, 0, 99, LambdaSum, range<2>{3, 4}); - testBoth(Q, 0, 99, LambdaSum, range<3>{2, 3, 4}); - testBoth(Q, 0, 99, LambdaSum, range<3>{2, 3, 4}); + NumErrors += testBoth(Q, 0, 99, LambdaSum, + range<2>{3, MaxWGSize + 1}); + NumErrors += testBoth(Q, 0, 99, LambdaSum, + range<2>{3, MaxWGSize + 1}); - testBoth(Q, 0, 99, LambdaSum, - range<3>{2, 3, MaxWGSize + 1}); - testBoth(Q, 0, 99, LambdaSum, - range<3>{2, 3, MaxWGSize + 1}); + NumErrors += + testBoth(Q, 0, 99, LambdaSum, range<3>{2, 3, 4}); + NumErrors += + testBoth(Q, 0, 99, LambdaSum, range<3>{2, 3, 4}); - std::cout << "Test passed\n"; - return 0; + NumErrors += testBoth(Q, 0, 99, LambdaSum, + range<3>{2, 3, MaxWGSize + 1}); + NumErrors += testBoth(Q, 0, 99, LambdaSum, + range<3>{2, 3, MaxWGSize + 1}); + + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_range_scalar.hpp b/SYCL/Reduction/reduction_range_scalar.hpp index 7ee833f9d1..ef03f2ace1 100644 --- a/SYCL/Reduction/reduction_range_scalar.hpp +++ b/SYCL/Reduction/reduction_range_scalar.hpp @@ -3,38 +3,21 @@ // accessing a scalar holding the reduction result. #include "reduction_utils.hpp" -#include -#include using namespace cl::sycl; template class KName; -template -std::ostream &operator<<(std::ostream &OS, const range &Range) { - OS << "{" << Range[0]; - if constexpr (Dims > 1) - OS << ", " << Range[1]; - if constexpr (Dims > 2) - OS << ", " << Range[2]; - OS << "}"; - return OS; -} - -template -void test(queue &Q, T Identity, T Init, BinaryOperation BOp, - range Range) { - std::string StdMode = IsSYCL2020Mode ? "SYCL2020" : "ONEAPI "; - std::cout << "Running the test case: " << StdMode - << " {T=" << typeid(T).name() - << ", BOp=" << typeid(BinaryOperation).name() << ", Range=" << Range - << std::endl; +template +int test(queue &Q, T Identity, T Init, BinaryOperation BOp, + const range &Range) { + printTestLabel(IsSYCL2020, Range); // Skip the test for such big arrays now. constexpr size_t TwoGB = 2LL * 1024 * 1024 * 1024; if (Range.size() > TwoGB) - return; + return 0; buffer InBuf(Range); buffer OutBuf(1); @@ -51,42 +34,24 @@ void test(queue &Q, T Identity, T Init, BinaryOperation BOp, (OutBuf.template get_access())[0] = Init; // Compute. - if constexpr (IsSYCL2020Mode) { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - property_list PropList = getPropertyList(); - auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp, PropList); - - CGH.parallel_for( - Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); }); - }); - } else { - Q.submit([&](handler &CGH) { - auto In = InBuf.template get_access(CGH); - accessor Out(OutBuf, CGH); - auto Redu = ext::oneapi::reduction(Out, Identity, BOp); - - CGH.parallel_for( - Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); }); - }); - } + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + auto Redu = + createReduction(OutBuf, CGH, Identity, BOp); + CGH.parallel_for( + Range, Redu, [=](id Id, auto &Sum) { Sum.combine(In[Id]); }); + }); // Check correctness. auto Out = OutBuf.template get_access(); T ComputedOut = *(Out.get_pointer()); - if (ComputedOut != CorrectOut) { - printDeviceInfo(Q, true); - std::cerr << "Error: Range = " << Range << ", " - << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut << "\n"; - assert(0 && "Wrong value."); - } + return checkResults(Q, IsSYCL2020, BOp, Range, ComputedOut, CorrectOut); } template -void testBoth(queue &Q, T Identity, T Init, BinaryOperation BOp, - range Range) { - test, false, Mode>(Q, Identity, Init, BOp, Range); - test, true, Mode>(Q, Identity, Init, BOp, Range); +int testBoth(queue &Q, T Identity, T Init, BinaryOperation BOp, + const range &Range) { + return test, false, Mode>(Q, Identity, Init, BOp, Range) + + test, true, Mode>(Q, Identity, Init, BOp, Range); } diff --git a/SYCL/Reduction/reduction_reducer_op_eq.cpp b/SYCL/Reduction/reduction_reducer_op_eq.cpp index 07c4bbce28..e6990c875e 100644 --- a/SYCL/Reduction/reduction_reducer_op_eq.cpp +++ b/SYCL/Reduction/reduction_reducer_op_eq.cpp @@ -6,9 +6,7 @@ // This test checks that operators ++, +=, *=, |=, &=, ^= are supported // whent the corresponding std::plus<>, std::multiplies, etc are defined. -#include -#include -#include +#include "reduction_utils.hpp" using namespace sycl; @@ -78,19 +76,13 @@ template <> struct bit_and { }; } // namespace std -template -auto createReduction(T *USMPtr, T Identity, BinaryOperation BOp) { - if constexpr (IsSYCL2020Mode) - return sycl::reduction(USMPtr, Identity, BOp); - else - return ONEAPI::reduction(USMPtr, Identity, BOp); -} - template int test(queue &Q, T Identity) { constexpr size_t N = 16; constexpr size_t L = 4; + nd_range<1> NDR{N, L}; + printTestLabel(IsSYCL2020Mode, NDR); T *Data = malloc_host(N, Q); T *Res = malloc_host(1, Q); @@ -107,7 +99,6 @@ int test(queue &Q, T Identity) { *Res = Identity; auto Red = createReduction(Res, Identity, BOp); - nd_range<1> NDR{N, L}; if constexpr (OpEq == PlusPlus) { auto Lambda = [=](nd_item<1> ID, auto &Sum) { ++Sum; }; Q.submit([&](handler &H) { H.parallel_for(NDR, Red, Lambda); }).wait(); @@ -189,11 +180,12 @@ template int testINTPack(queue &Q) { int main() { queue Q; - int Error = 0; - Error += testFPPack(Q); - Error += testINTPack(Q); - Error += testINTPack(Q); - - std::cout << (Error ? "Failed\n" : "Passed.\n"); - return Error; + printDeviceInfo(Q); + int NumErrors = 0; + NumErrors += testFPPack(Q); + NumErrors += testINTPack(Q); + NumErrors += testINTPack(Q); + + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_usm.cpp b/SYCL/Reduction/reduction_usm.cpp index 941c113881..e3c22b17d2 100644 --- a/SYCL/Reduction/reduction_usm.cpp +++ b/SYCL/Reduction/reduction_usm.cpp @@ -4,46 +4,33 @@ // RUN: %ACC_RUN_PLACEHOLDER %t.out // This test performs basic checks of parallel_for(nd_range, reduction, func) -// with reductions initialized with USM var. It tests both ONEAPI::reduction -// and SYCL-2020 reduction (sycl::reduction) assuming only read-write access, -// i.e. without using SYCL-2020 property::reduction::initialize_to_identity. +// with reductions initialized with USM var. It tests both +// ext::oneapi::reduction and SYCL-2020 reduction (sycl::reduction) assuming +// only read-write access, i.e. without using SYCL-2020 +// property::reduction::initialize_to_identity. #include "reduction_utils.hpp" -#include -#include using namespace cl::sycl; -template class KernelNameGroup; - -template -auto createReduction(T *USMPtr, T Identity, BinaryOperation BOp) { - if constexpr (IsSYCL2020) - return sycl::reduction(USMPtr, Identity, BOp); - else - return ONEAPI::reduction(USMPtr, Identity, BOp); -} +template class KName; template -void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, - usm::alloc AllocType) { +int test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, + usm::alloc AllocType) { + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + printTestLabel(IsSYCL2020, NDRange); + auto Dev = Q.get_device(); - if (AllocType == usm::alloc::shared && - !Dev.get_info()) - return; - if (AllocType == usm::alloc::host && - !Dev.get_info()) - return; - if (AllocType == usm::alloc::device && - !Dev.get_info()) - return; + if (!Dev.has(getUSMAspect(AllocType))) + return 0; T *ReduVarPtr = (T *)malloc(sizeof(T), Dev, Q.get_context(), AllocType); if (ReduVarPtr == nullptr) - return; + return 0; if (AllocType == usm::alloc::device) { Q.submit([&](handler &CGH) { - CGH.single_task>( + CGH.single_task>( [=]() { *ReduVarPtr = Init; }); }).wait(); } else { @@ -62,9 +49,7 @@ void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, Q.submit([&](handler &CGH) { auto In = InBuf.template get_access(CGH); auto Redu = createReduction(ReduVarPtr, Identity, BOp); - nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); - - CGH.parallel_for>( + CGH.parallel_for>( NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { Sum.combine(In[NDIt.get_global_linear_id()]); }); @@ -76,63 +61,65 @@ void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, buffer Buf(&ComputedOut, range<1>(1)); Q.submit([&](handler &CGH) { auto OutAcc = Buf.template get_access(CGH); - CGH.single_task>( + CGH.single_task>( [=]() { OutAcc[0] = *ReduVarPtr; }); }).wait(); ComputedOut = (Buf.template get_access())[0]; } else { ComputedOut = *ReduVarPtr; } - if (ComputedOut != CorrectOut) { - std::cerr << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cerr << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut - << ", AllocMode: " << static_cast(AllocType) << "\n"; - assert(0 && "Wrong value."); - } + std::string AllocStr = + "AllocMode=" + std::to_string(static_cast(AllocType)); + int Error = checkResults(Q, IsSYCL2020, BOp, NDRange, ComputedOut, CorrectOut, + AllocStr); free(ReduVarPtr, Q.get_context()); + return Error; } +int NumErrors = 0; + template void testUSM(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { // Test SYCL-2020 reductions - test, true, T, BinaryOperation>( + NumErrors += test, true, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::shared); - test, true, T, BinaryOperation>( + NumErrors += test, true, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::host); - test, true, T, BinaryOperation>( + NumErrors += test, true, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::device); - // Test ONEAPI reductions - test, false, T, BinaryOperation>( + // Test ext::oneapi reductions + NumErrors += test, false, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::shared); - test, false, T, BinaryOperation>( + NumErrors += test, false, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::host); - test, false, T, BinaryOperation>( + NumErrors += test, false, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::device); } int main() { queue Q; + printDeviceInfo(Q); + // fast atomics and fast reduce - testUSM>(Q, 0, 99, 49, 5 * 49); + testUSM>(Q, 0, 99, 49, 5 * 49); // fast atomics - testUSM>(Q, 0, 0xff00ff00, 7, 7); - testUSM>(Q, 0, 0x7f007f00, 4, 32); + testUSM>(Q, 0, 0xff00ff00, 7, 7); + testUSM>(Q, 0, 0x7f007f00, 4, 32); // fast reduce - testUSM>( + testUSM>( Q, getMaximumFPValue(), -100.0, 17, 17); - testUSM>( + testUSM>( Q, getMinimumFPValue(), 100.0, 4, 32); // generic algorithm - testUSM>(Q, 1, 5, 7, 7); + testUSM>(Q, 1, 5, 7, 7); testUSM, CustomVecPlus>( Q, CustomVec(0), CustomVec(77), 8, 8 * 3); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_usm_dw.cpp b/SYCL/Reduction/reduction_usm_dw.cpp index e3c9b0d6a1..42f4f41a59 100644 --- a/SYCL/Reduction/reduction_usm_dw.cpp +++ b/SYCL/Reduction/reduction_usm_dw.cpp @@ -12,33 +12,30 @@ // is created with property::reduction::initialize_to_identity. #include "reduction_utils.hpp" -#include -#include using namespace cl::sycl; -template class KernelNameGroup; +template class KName; + +// Discard-write access to USM reductions is available only in SYCL2020. +constexpr bool IsSYCL2020 = true; template -void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, - usm::alloc AllocType) { +int test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, + usm::alloc AllocType) { + nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); + printTestLabel(IsSYCL2020, NDRange); + auto Dev = Q.get_device(); - if (AllocType == usm::alloc::shared && - !Dev.get_info()) - return; - if (AllocType == usm::alloc::host && - !Dev.get_info()) - return; - if (AllocType == usm::alloc::device && - !Dev.get_info()) - return; + if (!Dev.has(getUSMAspect(AllocType))) + return 0; T *ReduVarPtr = (T *)malloc(sizeof(T), Dev, Q.get_context(), AllocType); if (ReduVarPtr == nullptr) - return; + return 0; if (AllocType == usm::alloc::device) { Q.submit([&](handler &CGH) { - CGH.single_task>( + CGH.single_task>( [=]() { *ReduVarPtr = Init; }); }).wait(); } else { @@ -55,11 +52,9 @@ void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, // Compute. Q.submit([&](handler &CGH) { auto In = InBuf.template get_access(CGH); - property_list PropList(property::reduction::initialize_to_identity{}); - auto Redu = sycl::reduction(ReduVarPtr, Identity, BOp, PropList); - nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize}); - - CGH.parallel_for>( + auto Redu = createReduction( + ReduVarPtr, Identity, BOp); + CGH.parallel_for>( NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { Sum.combine(In[NDIt.get_global_linear_id()]); }); @@ -71,54 +66,56 @@ void test(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems, buffer Buf(&ComputedOut, range<1>(1)); Q.submit([&](handler &CGH) { auto OutAcc = Buf.template get_access(CGH); - CGH.single_task>( + CGH.single_task>( [=]() { OutAcc[0] = *ReduVarPtr; }); }).wait(); ComputedOut = (Buf.template get_access())[0]; } else { ComputedOut = *ReduVarPtr; } - if (ComputedOut != CorrectOut) { - std::cerr << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; - std::cerr << "Computed value: " << ComputedOut - << ", Expected value: " << CorrectOut - << ", AllocMode: " << static_cast(AllocType) << "\n"; - assert(0 && "Wrong value."); - } + std::string AllocStr = + "AllocMode=" + std::to_string(static_cast(AllocType)); + int Error = checkResults(Q, IsSYCL2020, BOp, NDRange, ComputedOut, CorrectOut, + AllocStr); free(ReduVarPtr, Q.get_context()); + return Error; } +int NumErrors = 0; + template void testUSM(queue &Q, T Identity, T Init, size_t WGSize, size_t NWItems) { - test, T, BinaryOperation>( + NumErrors += test, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::shared); - test, T, BinaryOperation>( + NumErrors += test, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::host); - test, T, BinaryOperation>( + NumErrors += test, T, BinaryOperation>( Q, Identity, Init, WGSize, NWItems, usm::alloc::device); } int main() { queue Q; + printDeviceInfo(Q); + // fast atomics and fast reduce - testUSM>(Q, 0, 99, 49, 5 * 49); + testUSM>(Q, 0, 99, 49, 5 * 49); // fast atomics - testUSM>(Q, 0, 0xff00ff00, 7, 7); - testUSM>(Q, 0, 0x7f007f00, 4, 32); + testUSM>(Q, 0, 0xff00ff00, 7, 7); + testUSM>(Q, 0, 0x7f007f00, 4, 32); // fast reduce - testUSM>( + testUSM>( Q, getMaximumFPValue(), -100.0, 17, 17); - testUSM>( + testUSM>( Q, getMinimumFPValue(), 100.0, 4, 32); // generic algorithm - testUSM>(Q, 1, 5, 7, 7); + testUSM>(Q, 1, 5, 7, 7); testUSM, CustomVecPlus>( Q, CustomVec(0), CustomVec(77), 8, 8 * 3); - std::cout << "Test passed\n"; - return 0; + printFinalStatus(NumErrors); + return NumErrors; } diff --git a/SYCL/Reduction/reduction_utils.hpp b/SYCL/Reduction/reduction_utils.hpp index e51030f1ea..6303fe1d46 100644 --- a/SYCL/Reduction/reduction_utils.hpp +++ b/SYCL/Reduction/reduction_utils.hpp @@ -11,8 +11,9 @@ void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, size_t N = Range.size(); auto In = InBuf.template get_access(); for (int I = 0; I < N; ++I) { - if (std::is_same>::value) - In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + if (std::is_same_v> || + std::is_same_v>) + In[I] = 1.1 + (((I % 11) == 0) ? 1 : 0); else In[I] = ((I + 1) % 5) + 1.1; ExpectedOut = BOp(ExpectedOut, In[I]); @@ -28,8 +29,9 @@ void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, auto In = InBuf.template get_access(); for (int J = 0; J < Range[0]; ++J) { for (int I = 0; I < Range[1]; ++I) { - if (std::is_same>::value) - In[J][I] = 1 + ((((I * 2 + J * 3) % 37) == 0) ? 1 : 0); + if (std::is_same_v> || + std::is_same_v>) + In[J][I] = 1.1 + ((((I + J * 3) % 11) == 0) ? 1 : 0); else In[J][I] = ((I + 1 + J) % 5) + 1.1; ExpectedOut = BOp(ExpectedOut, In[J][I]); @@ -47,8 +49,9 @@ void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, for (int K = 0; K < Range[0]; ++K) { for (int J = 0; J < Range[1]; ++J) { for (int I = 0; I < Range[2]; ++I) { - if (std::is_same>::value) - In[K][J][I] = 1 + ((((I * 2 + J * 3 + K) % 37) == 0) ? 1 : 0); + if (std::is_same_v> || + std::is_same_v>) + In[K][J][I] = 1.1 + ((((I + J * 3 + K) % 11) == 0) ? 1 : 0); else In[K][J][I] = ((I + 1 + J + K * 3) % 5) + 1.1; ExpectedOut = BOp(ExpectedOut, In[K][J][I]); @@ -75,6 +78,23 @@ bool operator==(const CustomVec &A, const CustomVec &B) { return A.X == B.X && A.Y == B.Y; } template +bool operator<(const CustomVec &A, const CustomVec &B) { + return A.X < B.X && A.Y < B.Y; +} +template +CustomVec operator/(const CustomVec &A, const CustomVec &B) { + return {A.X / B.X && A.Y / B.Y}; +} +template +CustomVec operator-(const CustomVec &A, const CustomVec &B) { + return {A.X - B.X && A.Y - B.Y}; +} +namespace std { +template CustomVec abs(const CustomVec &A) { + return {std::abs(A.X), std::abs(A.Y)}; +} +} // namespace std +template std::ostream &operator<<(std::ostream &OS, const CustomVec &V) { return OS << "(" << V.X << ", " << V.Y << ")"; } @@ -105,14 +125,142 @@ template property_list getPropertyList() { } void printDeviceInfo(queue &Q, bool ToCERR = false) { + static int IsErrDeviceInfoPrinted = 0; + if (IsErrDeviceInfoPrinted >= 2) + return; + IsErrDeviceInfoPrinted++; + device D = Q.get_device(); auto Name = D.get_info(); size_t MaxWGSize = D.get_info(); size_t LocalMemSize = D.get_info(); - if (ToCERR) - std::cout << "Device: " << Name << ", MaxWGSize: " << MaxWGSize - << ", LocalMemSize: " << LocalMemSize << std::endl; + std::ostream &OS = ToCERR ? std::cerr : std::cout; + OS << "Device: " << Name << ", MaxWGSize: " << MaxWGSize + << ", LocalMemSize: " << LocalMemSize + << ", Driver: " << D.get_info() << std::endl; +} + +template +std::ostream &operator<<(std::ostream &OS, const range &Range) { + OS << "{" << Range[0]; + if constexpr (Dims > 1) + OS << ", " << Range[1]; + if constexpr (Dims > 2) + OS << ", " << Range[2]; + OS << "}"; + return OS; +} + +template +std::ostream &operator<<(std::ostream &OS, const nd_range &Range) { + OS << "{" << Range.get_global_range() << ", " << Range.get_local_range() + << "}"; + return OS; +} + +template +void printTestLabel(bool IsSYCL2020, const RangeT &Range, bool ToCERR = false) { + std::ostream &OS = ToCERR ? std::cerr : std::cout; + std::string Mode = IsSYCL2020 ? "SYCL2020" : "ONEAPI "; + OS << (ToCERR ? "Error" : "Start") << ": Mode=" << Mode + << ", T=" << typeid(T).name() << ", BOp=" << typeid(BinaryOperation).name() + << ", Range=" << Range; +} + +template constexpr bool isPreciseResultFP() { + return (std::is_floating_point_v || std::is_same_v)&&( + std::is_same_v, BOp> || + std::is_same_v, BOp> || + std::is_same_v, BOp> || + std::is_same_v, BOp>); +} + +template +int checkResults(queue &Q, bool IsSYCL2020, BinaryOperation, + const RangeT &Range, const T &ComputedRes, const T &CorrectRes, + std::string AddInfo = "") { + std::string ErrorStr; + bool Passed; + + if constexpr (std::is_floating_point_v || std::is_same_v) { + // It is a pretty simple and naive FP diff check here, which though + // should work reasonably well for most of cases handled in reduction + // tests. + T MaxDiff = std::numeric_limits::epsilon() * std::fabs(CorrectRes); + if constexpr (std::is_same_v> || + std::is_same_v> || + std::is_same_v>) + MaxDiff *= Range.size(); + else + MaxDiff *= Range.get_global_range().size(); + + if (isPreciseResultFP()) + MaxDiff = 0; + T Diff = std::abs(CorrectRes - ComputedRes); + ErrorStr = ", Diff=" + std::to_string(Diff) + + ", MaxDiff=" + std::to_string(MaxDiff); + Passed = Diff <= MaxDiff; + } else { + Passed = ComputedRes == CorrectRes; + } + + std::cout << AddInfo << (Passed ? ". PASSED" : ". FAILED") << std::endl; + if (!Passed) { + printDeviceInfo(Q, true); + printTestLabel(IsSYCL2020, Range, true); + std::cerr << ", Computed value=" << ComputedRes + << ", Expected value=" << CorrectRes << ErrorStr + << (AddInfo.empty() ? "" : ", " + AddInfo) << std::endl; + } + return Passed ? 0 : 1; +} + +void printFinalStatus(int NumErrors) { + if (NumErrors == 0) + std::cout << "Test passed" << std::endl; else - std::cerr << "Device: " << Name << ", MaxWGSize: " << MaxWGSize - << ", LocalMemSize: " << LocalMemSize << std::endl; + std::cerr << NumErrors << " test-cases failed" << std::endl; +} + +template +auto createReduction(T *USMPtr, T Identity, BinaryOperation BOp) { + if constexpr (IsSYCL2020) + return sycl::reduction(USMPtr, Identity, BOp, getPropertyList()); + else + return ext::oneapi::reduction(USMPtr, Identity, BOp); +} + +template +auto createReduction(T *USMPtr, BinaryOperation BOp) { + if constexpr (IsSYCL2020) + return sycl::reduction(USMPtr, BOp, getPropertyList()); + else + return ext::oneapi::reduction(USMPtr, BOp); +} + +template +auto createReduction(BufferT ReduBuf, handler &CGH, T Identity, + BinaryOperation BOp) { + if constexpr (IsSYCL2020) { + property_list PropList = getPropertyList(); + return sycl::reduction(ReduBuf, CGH, Identity, BOp, PropList); + } else { + accessor Out(ReduBuf, + CGH); + return ext::oneapi::reduction(Out, Identity, BOp); + } +} + +aspect getUSMAspect(usm::alloc Alloc) { + if (Alloc == sycl::usm::alloc::host) + return aspect::usm_host_allocations; + + if (Alloc == sycl::usm::alloc::device) + return aspect::usm_device_allocations; + + assert(Alloc == usm::alloc::shared && "Unknown USM allocation type"); + return aspect::usm_shared_allocations; }