Skip to content

Commit bb73d92

Browse files
authored
[SYCL] Implement basic reduction for parallel_for() accepting nd_range (#1585)
This patch implements basic support for parallel_for(nd_range, reduction, kernel). It handles all reduction types and operations, including user's custom ones. The more efficient variants are on the way. What is NOT supported by this patch: - parallel_for(range, ...) // i.e. simple range without work-group sizes - parallel_for(nd_range, reduction1, reduction1, ...) // i.e. more than 1 reductions in paralell_for - USM - vector reductions (dims > 1 and #elements > 1) - HOST. The implementation used in this patch uses barrier(), which is not supported on HOST yet. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent abed4e3 commit bb73d92

16 files changed

+1092
-29
lines changed

sycl/include/CL/sycl/builtins.hpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,7 @@ namespace sycl {
2424
#else
2525
namespace __sycl_std = __host_std;
2626
#endif
27-
} // namespace sycl
28-
} // __SYCL_INLINE_NAMESPACE(cl)
2927

30-
__SYCL_INLINE_NAMESPACE(cl) {
31-
namespace sycl {
3228
/* ----------------- 4.13.3 Math functions. ---------------------------------*/
3329
// genfloat acos (genfloat x)
3430
template <typename T>
@@ -731,7 +727,8 @@ detail::enable_if_t<detail::is_geninteger<T>::value, T> clz(T x) __NOEXC {
731727
namespace intel {
732728
// geninteger ctz (geninteger x)
733729
template <typename T>
734-
detail::enable_if_t<detail::is_geninteger<T>::value, T> ctz(T x) __NOEXC {
730+
sycl::detail::enable_if_t<sycl::detail::is_geninteger<T>::value, T>
731+
ctz(T x) __NOEXC {
735732
return __sycl_std::__invoke_ctz<T>(x);
736733
}
737734
} // namespace intel

sycl/include/CL/sycl/detail/cg.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class interop_handler {
4646

4747
public:
4848
using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
49-
using ReqToMem = std::pair<detail::Requirement*, pi_mem>;
49+
using ReqToMem = std::pair<detail::Requirement *, pi_mem>;
5050

5151
interop_handler(std::vector<ReqToMem> MemObjs, QueueImplPtr Queue)
5252
: MQueue(std::move(Queue)), MMemObjs(std::move(MemObjs)) {}

sycl/include/CL/sycl/handler.hpp

Lines changed: 393 additions & 2 deletions
Large diffs are not rendered by default.

sycl/include/CL/sycl/intel/function_pointer.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ device_func_ptr_holder_t get_device_func_ptr(FuncType F, const char *FuncName,
8181
PI_INVALID_OPERATION);
8282
}
8383

84-
return detail::getDeviceFunctionPointerImpl(D, P, FuncName);
84+
return sycl::detail::getDeviceFunctionPointerImpl(D, P, FuncName);
8585
}
8686
} // namespace intel
8787
} // namespace sycl

sycl/include/CL/sycl/intel/reduction.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,13 +342,39 @@ class reduction_impl {
342342
"Only scalar/1-element reductions are supported now.");
343343
}
344344

345+
accessor<T, buffer_dim, access::mode::discard_read_write,
346+
access::target::local>
347+
getReadWriteLocalAcc(size_t Size, handler &CGH) {
348+
return accessor<T, buffer_dim, access::mode::discard_read_write,
349+
access::target::local>(Size, CGH);
350+
}
351+
352+
accessor<T, buffer_dim, access::mode::read>
353+
getReadAccToPreviousPartialReds(handler &CGH) const {
354+
CGH.addReduction(MOutBufPtr);
355+
return accessor<T, buffer_dim, access::mode::read>(*MOutBufPtr, CGH);
356+
}
357+
358+
accessor_type getWriteAccForPartialReds(size_t Size, size_t RunNumber,
359+
handler &CGH) {
360+
if (Size == 1) {
361+
if (RunNumber > 0)
362+
CGH.associateWithHandler(this->MAcc);
363+
return this->MAcc;
364+
}
365+
// Create a new output buffer and return an accessor to it.
366+
MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
367+
CGH.addReduction(MOutBufPtr);
368+
return accessor_type(*MOutBufPtr, CGH);
369+
}
345370
/// User's accessor to where the reduction must be written.
346371
accessor_type MAcc;
347372

348373
private:
349374
/// Identity of the BinaryOperation.
350375
/// The result of BinaryOperation(X, MIdentity) is equal to X for any X.
351376
const T MIdentity;
377+
shared_ptr_class<buffer<T, buffer_dim>> MOutBufPtr;
352378
};
353379

354380
} // namespace detail

sycl/source/detail/queue_impl.hpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,11 @@ class queue_impl {
351351
/// \return a native handle.
352352
pi_native_handle getNative() const;
353353

354+
/// Stores an event that should be associated with the queue
355+
///
356+
/// \param Event is the event to be stored
357+
void addEvent(event Event);
358+
354359
private:
355360
/// Performs command group submission to the queue.
356361
///
@@ -362,8 +367,9 @@ class queue_impl {
362367
shared_ptr_class<queue_impl> Self,
363368
const detail::code_location &Loc) {
364369
handler Handler(std::move(Self), MHostQueue);
370+
Handler.saveCodeLoc(Loc);
365371
CGF(Handler);
366-
event Event = Handler.finalize(Loc);
372+
event Event = Handler.finalize();
367373
addEvent(Event);
368374
return Event;
369375
}
@@ -377,11 +383,6 @@ class queue_impl {
377383
void instrumentationEpilog(void *TelementryEvent, string_class &Name,
378384
int32_t StreamID, uint64_t IId);
379385

380-
/// Stores an event that should be associated with the queue
381-
///
382-
/// \param Event is the event to be stored
383-
void addEvent(event Event);
384-
385386
/// Stores a USM operation event that should be associated with the queue
386387
///
387388
/// \param Event is the event to be stored

sycl/source/handler.cpp

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,24 @@
1313
#include <CL/sycl/handler.hpp>
1414
#include <CL/sycl/info/info_desc.hpp>
1515
#include <detail/kernel_impl.hpp>
16+
#include <detail/queue_impl.hpp>
1617
#include <detail/scheduler/scheduler.hpp>
1718

1819
__SYCL_INLINE_NAMESPACE(cl) {
1920
namespace sycl {
20-
event handler::finalize(const cl::sycl::detail::code_location &Payload) {
21-
sycl::event EventRet;
21+
22+
void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
23+
cl::sycl::event Event) {
24+
Queue->addEvent(std::move(Event));
25+
}
26+
27+
event handler::finalize() {
28+
// This block of code is needed only for reduction implementation.
29+
// It is harmless (does nothing) for everything else.
30+
if (MIsFinalized)
31+
return MLastEvent;
32+
MIsFinalized = true;
33+
2234
unique_ptr_class<detail::CG> CommandGroup;
2335
switch (MCGType) {
2436
case detail::CG::KERNEL:
@@ -29,52 +41,52 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
2941
std::move(MSharedPtrStorage), std::move(MRequirements),
3042
std::move(MEvents), std::move(MArgs), std::move(MKernelName),
3143
std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType,
32-
Payload));
44+
MCodeLoc));
3345
break;
3446
}
3547
case detail::CG::INTEROP_TASK_CODEPLAY:
3648
CommandGroup.reset(new detail::CGInteropTask(
3749
std::move(MInteropTask), std::move(MArgsStorage),
3850
std::move(MAccStorage), std::move(MSharedPtrStorage),
39-
std::move(MRequirements), std::move(MEvents), MCGType, Payload));
51+
std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
4052
break;
4153
case detail::CG::COPY_ACC_TO_PTR:
4254
case detail::CG::COPY_PTR_TO_ACC:
4355
case detail::CG::COPY_ACC_TO_ACC:
4456
CommandGroup.reset(new detail::CGCopy(
4557
MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage),
4658
std::move(MAccStorage), std::move(MSharedPtrStorage),
47-
std::move(MRequirements), std::move(MEvents), Payload));
59+
std::move(MRequirements), std::move(MEvents), MCodeLoc));
4860
break;
4961
case detail::CG::FILL:
5062
CommandGroup.reset(new detail::CGFill(
5163
std::move(MPattern), MDstPtr, std::move(MArgsStorage),
5264
std::move(MAccStorage), std::move(MSharedPtrStorage),
53-
std::move(MRequirements), std::move(MEvents), Payload));
65+
std::move(MRequirements), std::move(MEvents), MCodeLoc));
5466
break;
5567
case detail::CG::UPDATE_HOST:
5668
CommandGroup.reset(new detail::CGUpdateHost(
5769
MDstPtr, std::move(MArgsStorage), std::move(MAccStorage),
5870
std::move(MSharedPtrStorage), std::move(MRequirements),
59-
std::move(MEvents), Payload));
71+
std::move(MEvents), MCodeLoc));
6072
break;
6173
case detail::CG::COPY_USM:
6274
CommandGroup.reset(new detail::CGCopyUSM(
6375
MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage),
6476
std::move(MAccStorage), std::move(MSharedPtrStorage),
65-
std::move(MRequirements), std::move(MEvents), Payload));
77+
std::move(MRequirements), std::move(MEvents), MCodeLoc));
6678
break;
6779
case detail::CG::FILL_USM:
6880
CommandGroup.reset(new detail::CGFillUSM(
6981
std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage),
7082
std::move(MAccStorage), std::move(MSharedPtrStorage),
71-
std::move(MRequirements), std::move(MEvents), Payload));
83+
std::move(MRequirements), std::move(MEvents), MCodeLoc));
7284
break;
7385
case detail::CG::PREFETCH_USM:
7486
CommandGroup.reset(new detail::CGPrefetchUSM(
7587
MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage),
7688
std::move(MSharedPtrStorage), std::move(MRequirements),
77-
std::move(MEvents), Payload));
89+
std::move(MEvents), MCodeLoc));
7890
break;
7991
case detail::CG::NONE:
8092
throw runtime_error("Command group submitted without a kernel or a "
@@ -88,8 +100,8 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
88100
detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
89101
std::move(CommandGroup), std::move(MQueue));
90102

91-
EventRet = detail::createSyclObjFromImpl<event>(Event);
92-
return EventRet;
103+
MLastEvent = detail::createSyclObjFromImpl<event>(Event);
104+
return MLastEvent;
93105
}
94106

95107
void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3231,7 +3231,8 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
32313231
_ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
32323232
_ZN2cl4sycl7handler18extractArgsAndReqsEv
32333233
_ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
3234-
_ZN2cl4sycl7handler8finalizeERKNS0_6detail13code_locationE
3234+
_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE
3235+
_ZN2cl4sycl7handler8finalizeEv
32353236
_ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
32363237
_ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
32373238
_ZN2cl4sycl7program22build_with_kernel_nameENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_l

sycl/test/abi/symbol_size.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,11 @@ int main() {
4343
check_size<device_selector, 8>();
4444
check_size<event, 16>();
4545
check_size<gpu_selector, 8>();
46-
check_size<handler, 472>();
46+
#ifdef _MSC_VER
47+
check_size<handler, 520>();
48+
#else
49+
check_size<handler, 528>();
50+
#endif
4751
check_size<image<1>, 16>();
4852
check_size<kernel, 16>();
4953
check_size<platform, 16>();
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
3+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
6+
7+
// This test performs basic checks of parallel_for(nd_range, reduction, func)
8+
// with reduction and conditional increment of the reduction variable.
9+
10+
#include <CL/sycl.hpp>
11+
#include <cassert>
12+
13+
using namespace cl::sycl;
14+
15+
template <typename T, class BinaryOperation>
16+
void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
17+
BinaryOperation BOp, size_t N) {
18+
ExpectedOut = Identity;
19+
auto In = InBuf.template get_access<access::mode::write>();
20+
for (int I = 0; I < N; ++I) {
21+
if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
22+
In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
23+
else
24+
In[I] = I + 1 + 1.1;
25+
26+
if (I < 2)
27+
ExpectedOut = BOp(ExpectedOut, 99);
28+
else if (I % 3)
29+
ExpectedOut = BOp(ExpectedOut, In[I]);
30+
else
31+
; // do nothing.
32+
}
33+
};
34+
35+
template <typename T, int Dim, class BinaryOperation>
36+
class SomeClass;
37+
38+
template <typename T>
39+
struct Vec {
40+
Vec() : X(0), Y(0) {}
41+
Vec(T X, T Y) : X(X), Y(Y) {}
42+
Vec(T V) : X(V), Y(V) {}
43+
bool operator==(const Vec &P) const {
44+
return P.X == X && P.Y == Y;
45+
}
46+
bool operator!=(const Vec &P) const {
47+
return !(*this == P);
48+
}
49+
T X;
50+
T Y;
51+
};
52+
template <typename T>
53+
bool operator==(const Vec<T> &A, const Vec<T> &B) {
54+
return A.X == B.X && A.Y == B.Y;
55+
}
56+
template <typename T>
57+
std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) {
58+
return OS << "(" << P.X << ", " << P.Y << ")";
59+
}
60+
61+
template <class T>
62+
struct VecPlus {
63+
using P = Vec<T>;
64+
P operator()(const P &A, const P &B) const {
65+
return P(A.X + B.X, A.Y + B.Y);
66+
}
67+
};
68+
69+
template <typename T, int Dim, class BinaryOperation>
70+
void test(T Identity, size_t WGSize, size_t NWItems) {
71+
buffer<T, 1> InBuf(NWItems);
72+
buffer<T, 1> OutBuf(1);
73+
74+
// Initialize.
75+
BinaryOperation BOp;
76+
T CorrectOut;
77+
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
78+
79+
// Compute.
80+
queue Q;
81+
Q.submit([&](handler &CGH) {
82+
auto In = InBuf.template get_access<access::mode::read>(CGH);
83+
accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
84+
Out(OutBuf, CGH);
85+
auto Redu = intel::reduction(Out, Identity, BOp);
86+
87+
range<1> GlobalRange(NWItems);
88+
range<1> LocalRange(WGSize);
89+
nd_range<1> NDRange(GlobalRange, LocalRange);
90+
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
91+
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
92+
size_t I = NDIt.get_global_linear_id();
93+
if (I < 2)
94+
Sum.combine(T(99));
95+
else if (I % 3)
96+
Sum.combine(In[I]);
97+
else
98+
; // do nothing.
99+
});
100+
});
101+
102+
// Check correctness.
103+
auto Out = OutBuf.template get_access<access::mode::read>();
104+
T ComputedOut = *(Out.get_pointer());
105+
if (ComputedOut != CorrectOut) {
106+
std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
107+
std::cout << "Computed value: " << ComputedOut
108+
<< ", Expected value: " << CorrectOut << "\n";
109+
assert(0 && "Wrong value.");
110+
}
111+
}
112+
113+
int main() {
114+
test<int, 0, intel::plus<int>>(0, 2, 2);
115+
test<int, 1, intel::plus<int>>(0, 7, 7);
116+
test<int, 0, intel::plus<int>>(0, 2, 64);
117+
test<short, 1, intel::plus<short>>(0, 16, 256);
118+
119+
std::cout << "Test passed\n";
120+
return 0;
121+
}

0 commit comments

Comments
 (0)