@@ -37,117 +37,34 @@ class handler;
37
37
namespace detail {
38
38
class HostTask ;
39
39
40
- // The structure represents kernel argument.
41
- class ArgDesc {
42
- public:
43
- ArgDesc (sycl::detail::kernel_param_kind_t Type, void *Ptr , int Size ,
44
- int Index)
45
- : MType(Type), MPtr(Ptr ), MSize(Size ), MIndex(Index) {}
46
-
47
- sycl::detail::kernel_param_kind_t MType;
48
- void *MPtr;
49
- int MSize;
50
- int MIndex;
51
- };
52
-
53
- // The structure represents NDRange - global, local sizes, global offset,
54
- // number of dimensions, and the cluster dimensions if applicable.
55
- class NDRDescT {
56
- // The method initializes all sizes for dimensions greater than the passed one
57
- // to the default values, so they will not affect execution.
58
- void setNDRangeLeftover (int Dims_) {
59
- for (int I = Dims_; I < 3 ; ++I) {
60
- GlobalSize[I] = 1 ;
61
- LocalSize[I] = LocalSize[0 ] ? 1 : 0 ;
62
- GlobalOffset[I] = 0 ;
63
- NumWorkGroups[I] = 0 ;
64
- }
65
- }
66
-
67
- public:
68
- NDRDescT ()
69
- : GlobalSize{0 , 0 , 0 }, LocalSize{0 , 0 , 0 }, NumWorkGroups{0 , 0 , 0 },
70
- Dims{0 } {}
71
-
72
- template <int Dims_> void set (sycl::range<Dims_> NumWorkItems) {
73
- for (int I = 0 ; I < Dims_; ++I) {
74
- GlobalSize[I] = NumWorkItems[I];
75
- LocalSize[I] = 0 ;
76
- GlobalOffset[I] = 0 ;
77
- NumWorkGroups[I] = 0 ;
78
- }
79
- setNDRangeLeftover (Dims_);
80
- Dims = Dims_;
81
- }
82
-
83
- // Initializes this ND range descriptor with given range of work items and
84
- // offset.
85
- template <int Dims_>
86
- void set (sycl::range<Dims_> NumWorkItems, sycl::id<Dims_> Offset) {
87
- for (int I = 0 ; I < Dims_; ++I) {
88
- GlobalSize[I] = NumWorkItems[I];
89
- LocalSize[I] = 0 ;
90
- GlobalOffset[I] = Offset[I];
91
- NumWorkGroups[I] = 0 ;
92
- }
93
- setNDRangeLeftover (Dims_);
94
- Dims = Dims_;
95
- }
96
-
97
- template <int Dims_> void set (sycl::nd_range<Dims_> ExecutionRange) {
98
- for (int I = 0 ; I < Dims_; ++I) {
99
- GlobalSize[I] = ExecutionRange.get_global_range ()[I];
100
- LocalSize[I] = ExecutionRange.get_local_range ()[I];
101
- GlobalOffset[I] = ExecutionRange.get_offset ()[I];
102
- NumWorkGroups[I] = 0 ;
103
- }
104
- setNDRangeLeftover (Dims_);
105
- Dims = Dims_;
106
- }
107
-
108
- void set (int Dims_, sycl::nd_range<3 > ExecutionRange) {
109
- for (int I = 0 ; I < Dims_; ++I) {
110
- GlobalSize[I] = ExecutionRange.get_global_range ()[I];
111
- LocalSize[I] = ExecutionRange.get_local_range ()[I];
112
- GlobalOffset[I] = ExecutionRange.get_offset ()[I];
113
- NumWorkGroups[I] = 0 ;
114
- }
115
- setNDRangeLeftover (Dims_);
116
- Dims = Dims_;
117
- }
118
-
119
- template <int Dims_> void setNumWorkGroups (sycl::range<Dims_> N) {
120
- for (int I = 0 ; I < Dims_; ++I) {
121
- GlobalSize[I] = 0 ;
122
- // '0' is a mark to adjust before kernel launch when there is enough info:
123
- LocalSize[I] = 0 ;
124
- GlobalOffset[I] = 0 ;
125
- NumWorkGroups[I] = N[I];
126
- }
127
- setNDRangeLeftover (Dims_);
128
- Dims = Dims_;
129
- }
130
-
131
- template <int Dims_> void setClusterDimensions (sycl::range<Dims_> N) {
132
- if (Dims_ != Dims) {
133
- throw std::runtime_error (
134
- " Dimensionality of cluster, global and local ranges must be same" );
135
- }
136
-
137
- for (int I = 0 ; I < Dims_; ++I) {
138
- ClusterDimensions[I] = N[I];
139
- }
140
- }
141
-
142
- sycl::range<3 > GlobalSize;
143
- sycl::range<3 > LocalSize;
144
- sycl::id<3 > GlobalOffset;
145
- // / Number of workgroups, used to record the number of workgroups from the
146
- // / simplest form of parallel_for_work_group. If set, all other fields must be
147
- // / zero
148
- sycl::range<3 > NumWorkGroups;
149
- sycl::range<3 > ClusterDimensions{1 , 1 , 1 };
150
- size_t Dims;
40
+ // / Type of the command group.
41
+ // / NOTE: Changing the values of any of these enumerators is an API-break.
42
+ enum class CGType : unsigned int {
43
+ None = 0 ,
44
+ Kernel = 1 ,
45
+ CopyAccToPtr = 2 ,
46
+ CopyPtrToAcc = 3 ,
47
+ CopyAccToAcc = 4 ,
48
+ Barrier = 5 ,
49
+ BarrierWaitlist = 6 ,
50
+ Fill = 7 ,
51
+ UpdateHost = 8 ,
52
+ CopyUSM = 10 ,
53
+ FillUSM = 11 ,
54
+ PrefetchUSM = 12 ,
55
+ CodeplayHostTask = 14 ,
56
+ AdviseUSM = 15 ,
57
+ Copy2DUSM = 16 ,
58
+ Fill2DUSM = 17 ,
59
+ Memset2DUSM = 18 ,
60
+ CopyToDeviceGlobal = 19 ,
61
+ CopyFromDeviceGlobal = 20 ,
62
+ ReadWriteHostPipe = 21 ,
63
+ ExecCommandBuffer = 22 ,
64
+ CopyImage = 23 ,
65
+ SemaphoreWait = 24 ,
66
+ SemaphoreSignal = 25 ,
67
+ ProfilingTag = 26 ,
151
68
};
152
69
153
70
template <typename , typename T> struct check_fn_signature {
@@ -236,8 +153,6 @@ runKernelWithArg(KernelType KernelName, ArgType Arg) {
236
153
// The pure virtual class aimed to store lambda/functors of any type.
237
154
class HostKernelBase {
238
155
public:
239
- // The method executes lambda stored using NDRange passed.
240
- virtual void call (const NDRDescT &NDRDesc, HostProfilingInfo *HPI) = 0;
241
156
// Return pointer to the lambda object.
242
157
// Used to extract captured variables.
243
158
virtual char *getPtr () = 0;
@@ -255,177 +170,9 @@ class HostKernel : public HostKernelBase {
255
170
256
171
public:
257
172
HostKernel (KernelType Kernel) : MKernel(Kernel) {}
258
- void call (const NDRDescT &NDRDesc, HostProfilingInfo *HPI) override {
259
- // adjust ND range for serial host:
260
- NDRDescT AdjustedRange = NDRDesc;
261
-
262
- if (NDRDesc.GlobalSize [0 ] == 0 && NDRDesc.NumWorkGroups [0 ] != 0 ) {
263
- // This is a special case - NDRange information is not complete, only the
264
- // desired number of work groups is set by the user. Choose work group
265
- // size (LocalSize), calculate the missing NDRange characteristics
266
- // needed to invoke the kernel and adjust the NDRange descriptor
267
- // accordingly. For some devices the work group size selection requires
268
- // access to the device's properties, hence such late "adjustment".
269
- range<3 > WGsize{1 , 1 , 1 }; // no better alternative for serial host?
270
- AdjustedRange.set (NDRDesc.Dims ,
271
- nd_range<3 >(NDRDesc.NumWorkGroups * WGsize, WGsize));
272
- }
273
- // If local size for host is not set explicitly, let's adjust it to 1,
274
- // so an exception for zero local size is not thrown.
275
- if (AdjustedRange.LocalSize [0 ] == 0 )
276
- for (size_t I = 0 ; I < AdjustedRange.Dims ; ++I)
277
- AdjustedRange.LocalSize [I] = 1 ;
278
- if (HPI)
279
- HPI->start ();
280
- runOnHost (AdjustedRange);
281
- if (HPI)
282
- HPI->end ();
283
- }
284
173
285
174
char *getPtr () override { return reinterpret_cast <char *>(&MKernel); }
286
175
287
- template <class ArgT = KernelArgType>
288
- typename std::enable_if_t <std::is_same_v<ArgT, void >>
289
- runOnHost (const NDRDescT &) {
290
- runKernelWithoutArg (MKernel);
291
- }
292
-
293
- template <class ArgT = KernelArgType>
294
- typename std::enable_if_t <std::is_same_v<ArgT, sycl::id<Dims>>>
295
- runOnHost (const NDRDescT &NDRDesc) {
296
- sycl::range<Dims> Range (InitializedVal<Dims, range>::template get<0 >());
297
- sycl::id<Dims> Offset;
298
- sycl::range<Dims> Stride (
299
- InitializedVal<Dims, range>::template get<1 >()); // initialized to 1
300
- sycl::range<Dims> UpperBound (
301
- InitializedVal<Dims, range>::template get<0 >());
302
- for (int I = 0 ; I < Dims; ++I) {
303
- Range[I] = NDRDesc.GlobalSize [I];
304
- Offset[I] = NDRDesc.GlobalOffset [I];
305
- UpperBound[I] = Range[I] + Offset[I];
306
- }
307
-
308
- detail::NDLoop<Dims>::iterate(
309
- /* LowerBound=*/ Offset, Stride, UpperBound,
310
- [&](const sycl::id<Dims> &ID) {
311
- sycl::item<Dims, /* Offset=*/ true > Item =
312
- IDBuilder::createItem<Dims, true >(Range, ID, Offset);
313
-
314
- runKernelWithArg<const sycl::id<Dims> &>(MKernel, ID);
315
- });
316
- }
317
-
318
- template <class ArgT = KernelArgType>
319
- typename std::enable_if_t <std::is_same_v<ArgT, item<Dims, /* Offset=*/ false >>>
320
- runOnHost (const NDRDescT &NDRDesc) {
321
- sycl::id<Dims> ID;
322
- sycl::range<Dims> Range (InitializedVal<Dims, range>::template get<0 >());
323
- for (int I = 0 ; I < Dims; ++I)
324
- Range[I] = NDRDesc.GlobalSize [I];
325
-
326
- detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> ID) {
327
- sycl::item<Dims, /* Offset=*/ false > Item =
328
- IDBuilder::createItem<Dims, false >(Range, ID);
329
- sycl::item<Dims, /* Offset=*/ true > ItemWithOffset = Item;
330
-
331
- runKernelWithArg<sycl::item<Dims, /* Offset=*/ false >>(MKernel, Item);
332
- });
333
- }
334
-
335
- template <class ArgT = KernelArgType>
336
- typename std::enable_if_t <std::is_same_v<ArgT, item<Dims, /* Offset=*/ true >>>
337
- runOnHost (const NDRDescT &NDRDesc) {
338
- sycl::range<Dims> Range (InitializedVal<Dims, range>::template get<0 >());
339
- sycl::id<Dims> Offset;
340
- sycl::range<Dims> Stride (
341
- InitializedVal<Dims, range>::template get<1 >()); // initialized to 1
342
- sycl::range<Dims> UpperBound (
343
- InitializedVal<Dims, range>::template get<0 >());
344
- for (int I = 0 ; I < Dims; ++I) {
345
- Range[I] = NDRDesc.GlobalSize [I];
346
- Offset[I] = NDRDesc.GlobalOffset [I];
347
- UpperBound[I] = Range[I] + Offset[I];
348
- }
349
-
350
- detail::NDLoop<Dims>::iterate(
351
- /* LowerBound=*/ Offset, Stride, UpperBound,
352
- [&](const sycl::id<Dims> &ID) {
353
- sycl::item<Dims, /* Offset=*/ true > Item =
354
- IDBuilder::createItem<Dims, true >(Range, ID, Offset);
355
-
356
- runKernelWithArg<sycl::item<Dims, /* Offset=*/ true >>(MKernel, Item);
357
- });
358
- }
359
-
360
- template <class ArgT = KernelArgType>
361
- typename std::enable_if_t <std::is_same_v<ArgT, nd_item<Dims>>>
362
- runOnHost (const NDRDescT &NDRDesc) {
363
- sycl::range<Dims> GroupSize (InitializedVal<Dims, range>::template get<0 >());
364
- for (int I = 0 ; I < Dims; ++I) {
365
- if (NDRDesc.LocalSize [I] == 0 ||
366
- NDRDesc.GlobalSize [I] % NDRDesc.LocalSize [I] != 0 )
367
- throw sycl::exception (make_error_code (errc::nd_range),
368
- " Invalid local size for global size" );
369
- GroupSize[I] = NDRDesc.GlobalSize [I] / NDRDesc.LocalSize [I];
370
- }
371
-
372
- sycl::range<Dims> LocalSize (InitializedVal<Dims, range>::template get<0 >());
373
- sycl::range<Dims> GlobalSize (
374
- InitializedVal<Dims, range>::template get<0 >());
375
- sycl::id<Dims> GlobalOffset;
376
- for (int I = 0 ; I < Dims; ++I) {
377
- GlobalOffset[I] = NDRDesc.GlobalOffset [I];
378
- LocalSize[I] = NDRDesc.LocalSize [I];
379
- GlobalSize[I] = NDRDesc.GlobalSize [I];
380
- }
381
-
382
- detail::NDLoop<Dims>::iterate(GroupSize, [&](const id<Dims> &GroupID) {
383
- sycl::group<Dims> Group = IDBuilder::createGroup<Dims>(
384
- GlobalSize, LocalSize, GroupSize, GroupID);
385
-
386
- detail::NDLoop<Dims>::iterate(LocalSize, [&](const id<Dims> &LocalID) {
387
- id<Dims> GlobalID =
388
- GroupID * id<Dims>{LocalSize} + LocalID + GlobalOffset;
389
- const sycl::item<Dims, /* Offset=*/ true > GlobalItem =
390
- IDBuilder::createItem<Dims, true >(GlobalSize, GlobalID,
391
- GlobalOffset);
392
- const sycl::item<Dims, /* Offset=*/ false > LocalItem =
393
- IDBuilder::createItem<Dims, false >(LocalSize, LocalID);
394
- const sycl::nd_item<Dims> NDItem =
395
- IDBuilder::createNDItem<Dims>(GlobalItem, LocalItem, Group);
396
-
397
- runKernelWithArg<const sycl::nd_item<Dims>>(MKernel, NDItem);
398
- });
399
- });
400
- }
401
-
402
- template <typename ArgT = KernelArgType>
403
- std::enable_if_t <std::is_same_v<ArgT, sycl::group<Dims>>>
404
- runOnHost (const NDRDescT &NDRDesc) {
405
- sycl::range<Dims> NGroups (InitializedVal<Dims, range>::template get<0 >());
406
-
407
- for (int I = 0 ; I < Dims; ++I) {
408
- if (NDRDesc.LocalSize [I] == 0 ||
409
- NDRDesc.GlobalSize [I] % NDRDesc.LocalSize [I] != 0 )
410
- throw sycl::exception (make_error_code (errc::nd_range),
411
- " Invalid local size for global size" );
412
- NGroups[I] = NDRDesc.GlobalSize [I] / NDRDesc.LocalSize [I];
413
- }
414
-
415
- sycl::range<Dims> LocalSize (InitializedVal<Dims, range>::template get<0 >());
416
- sycl::range<Dims> GlobalSize (
417
- InitializedVal<Dims, range>::template get<0 >());
418
- for (int I = 0 ; I < Dims; ++I) {
419
- LocalSize[I] = NDRDesc.LocalSize [I];
420
- GlobalSize[I] = NDRDesc.GlobalSize [I];
421
- }
422
- detail::NDLoop<Dims>::iterate(NGroups, [&](const id<Dims> &GroupID) {
423
- sycl::group<Dims> Group =
424
- IDBuilder::createGroup<Dims>(GlobalSize, LocalSize, NGroups, GroupID);
425
- runKernelWithArg<sycl::group<Dims>>(MKernel, Group);
426
- });
427
- }
428
-
429
176
~HostKernel () = default ;
430
177
};
431
178
0 commit comments