34
34
#include " pybind11/pybind11.h"
35
35
36
36
#include " utils/offset_utils.hpp"
37
+ #include " utils/sycl_utils.hpp"
37
38
#include " utils/type_dispatch.hpp"
38
39
#include " utils/type_utils.hpp"
39
40
@@ -227,9 +228,8 @@ struct ContigBooleanReduction
227
228
228
229
void operator ()(sycl::nd_item<1 > it) const
229
230
{
230
- const size_t red_gws_ = it.get_global_range (0 ) / iter_gws_;
231
- const size_t reduction_id = it.get_global_id (0 ) / red_gws_;
232
- const size_t reduction_batch_id = get_reduction_batch_id (it);
231
+ const size_t reduction_id = it.get_group (0 ) % iter_gws_;
232
+ const size_t reduction_batch_id = it.get_group (0 ) / iter_gws_;
233
233
const size_t wg_size = it.get_local_range (0 );
234
234
235
235
const size_t base = reduction_id * reduction_max_gid_;
@@ -241,14 +241,6 @@ struct ContigBooleanReduction
241
241
// in group_op_
242
242
group_op_ (it, out_, reduction_id, inp_ + start, inp_ + end);
243
243
}
244
-
245
- private:
246
- size_t get_reduction_batch_id (sycl::nd_item<1 > const &it) const
247
- {
248
- const size_t n_reduction_groups = it.get_group_range (0 ) / iter_gws_;
249
- const size_t reduction_batch_id = it.get_group (0 ) % n_reduction_groups;
250
- return reduction_batch_id;
251
- }
252
244
};
253
245
254
246
typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)(
@@ -268,17 +260,19 @@ class boolean_reduction_contig_krn;
268
260
template <typename T1, typename T2, typename T3, typename T4, typename T5>
269
261
class boolean_reduction_seq_contig_krn ;
270
262
263
+ using dpctl::tensor::sycl_utils::choose_workgroup_size;
264
+
271
265
template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
272
266
sycl::event
273
- boolean_reduction_contig_impl (sycl::queue exec_q,
274
- size_t iter_nelems,
275
- size_t reduction_nelems,
276
- const char *arg_cp,
277
- char *res_cp,
278
- py::ssize_t iter_arg_offset,
279
- py::ssize_t iter_res_offset,
280
- py::ssize_t red_arg_offset,
281
- const std::vector<sycl::event> &depends)
267
+ boolean_reduction_axis1_contig_impl (sycl::queue exec_q,
268
+ size_t iter_nelems,
269
+ size_t reduction_nelems,
270
+ const char *arg_cp,
271
+ char *res_cp,
272
+ py::ssize_t iter_arg_offset,
273
+ py::ssize_t iter_res_offset,
274
+ py::ssize_t red_arg_offset,
275
+ const std::vector<sycl::event> &depends)
282
276
{
283
277
const argTy *arg_tp = reinterpret_cast <const argTy *>(arg_cp) +
284
278
iter_arg_offset + red_arg_offset;
@@ -288,8 +282,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
288
282
289
283
const sycl::device &d = exec_q.get_device ();
290
284
const auto &sg_sizes = d.get_info <sycl::info::device::sub_group_sizes>();
291
- size_t wg =
292
- 4 * (*std::max_element (std::begin (sg_sizes), std::end (sg_sizes)));
285
+ size_t wg = choose_workgroup_size<4 >(reduction_nelems, sg_sizes);
293
286
294
287
sycl::event red_ev;
295
288
if (reduction_nelems < wg) {
@@ -322,18 +315,8 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
322
315
});
323
316
}
324
317
else {
325
- sycl::event init_ev = exec_q.submit ([&](sycl::handler &cgh) {
326
- using IndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
327
-
328
- IndexerT res_indexer{};
329
-
330
- cgh.depends_on (depends);
331
-
332
- cgh.parallel_for (sycl::range<1 >(iter_nelems), [=](sycl::id<1 > id) {
333
- auto res_offset = res_indexer (id[0 ]);
334
- res_tp[res_offset] = identity_val;
335
- });
336
- });
318
+ sycl::event init_ev = exec_q.fill <resTy>(res_tp, resTy (identity_val),
319
+ iter_nelems, depends);
337
320
red_ev = exec_q.submit ([&](sycl::handler &cgh) {
338
321
cgh.depends_on (init_ev);
339
322
@@ -363,7 +346,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
363
346
return red_ev;
364
347
}
365
348
366
- template <typename fnT, typename srcTy> struct AllContigFactory
349
+ template <typename fnT, typename srcTy> struct AllAxis1ContigFactory
367
350
{
368
351
fnT get () const
369
352
{
@@ -372,12 +355,12 @@ template <typename fnT, typename srcTy> struct AllContigFactory
372
355
using GroupOpT =
373
356
all_reduce_wg_contig<srcTy, resTy, boolean_predicate<srcTy>>;
374
357
375
- return dpctl::tensor::kernels::boolean_reduction_contig_impl <
358
+ return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl <
376
359
srcTy, resTy, RedOpT, GroupOpT>;
377
360
}
378
361
};
379
362
380
- template <typename fnT, typename srcTy> struct AnyContigFactory
363
+ template <typename fnT, typename srcTy> struct AnyAxis1ContigFactory
381
364
{
382
365
fnT get () const
383
366
{
@@ -386,7 +369,7 @@ template <typename fnT, typename srcTy> struct AnyContigFactory
386
369
using GroupOpT =
387
370
any_reduce_wg_contig<srcTy, resTy, boolean_predicate<srcTy>>;
388
371
389
- return dpctl::tensor::kernels::boolean_reduction_contig_impl <
372
+ return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl <
390
373
srcTy, resTy, RedOpT, GroupOpT>;
391
374
}
392
375
};
@@ -433,9 +416,9 @@ struct StridedBooleanReduction
433
416
434
417
void operator ()(sycl::nd_item<1 > it) const
435
418
{
436
- const size_t red_gws_ = it.get_global_range (0 ) / iter_gws_;
437
- const size_t reduction_id = it.get_global_id (0 ) / red_gws_ ;
438
- const size_t reduction_batch_id = get_reduction_batch_id (it);
419
+ const size_t reduction_id = it.get_group (0 ) % iter_gws_;
420
+ const size_t reduction_batch_id = it.get_group (0 ) / iter_gws_ ;
421
+
439
422
const size_t reduction_lid = it.get_local_id (0 );
440
423
const size_t wg_size = it.get_local_range (0 );
441
424
@@ -468,13 +451,112 @@ struct StridedBooleanReduction
468
451
// in group_op_
469
452
group_op_ (it, out_, out_iter_offset, local_red_val);
470
453
}
454
+ };
455
+
456
+ template <typename T1,
457
+ typename T2,
458
+ typename T3,
459
+ typename T4,
460
+ typename T5,
461
+ typename T6>
462
+ class boolean_reduction_axis0_contig_krn ;
463
+
464
+ template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
465
+ sycl::event
466
+ boolean_reduction_axis0_contig_impl (sycl::queue exec_q,
467
+ size_t iter_nelems,
468
+ size_t reduction_nelems,
469
+ const char *arg_cp,
470
+ char *res_cp,
471
+ py::ssize_t iter_arg_offset,
472
+ py::ssize_t iter_res_offset,
473
+ py::ssize_t red_arg_offset,
474
+ const std::vector<sycl::event> &depends)
475
+ {
476
+ const argTy *arg_tp = reinterpret_cast <const argTy *>(arg_cp) +
477
+ iter_arg_offset + red_arg_offset;
478
+ resTy *res_tp = reinterpret_cast <resTy *>(res_cp) + iter_res_offset;
479
+
480
+ constexpr resTy identity_val = sycl::known_identity<RedOpT, resTy>::value;
481
+
482
+ const sycl::device &d = exec_q.get_device ();
483
+ const auto &sg_sizes = d.get_info <sycl::info::device::sub_group_sizes>();
484
+ size_t wg = choose_workgroup_size<4 >(reduction_nelems, sg_sizes);
471
485
472
- private:
473
- size_t get_reduction_batch_id (sycl::nd_item<1 > const &it) const
474
486
{
475
- const size_t n_reduction_groups = it.get_group_range (0 ) / iter_gws_;
476
- const size_t reduction_batch_id = it.get_group (0 ) % n_reduction_groups;
477
- return reduction_batch_id;
487
+ sycl::event init_ev = exec_q.fill <resTy>(res_tp, resTy (identity_val),
488
+ iter_nelems, depends);
489
+ sycl::event red_ev = exec_q.submit ([&](sycl::handler &cgh) {
490
+ cgh.depends_on (init_ev);
491
+
492
+ constexpr std::uint8_t dim = 1 ;
493
+
494
+ using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
495
+ using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
496
+ using InputOutputIterIndexerT =
497
+ dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
498
+ NoOpIndexerT, NoOpIndexerT>;
499
+ using ReductionIndexerT = ColsIndexerT;
500
+
501
+ NoOpIndexerT columns_indexer{};
502
+ NoOpIndexerT result_indexer{};
503
+ InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
504
+ result_indexer};
505
+ ReductionIndexerT reduction_indexer{
506
+ 0 , static_cast <py::ssize_t >(reduction_nelems),
507
+ static_cast <py::ssize_t >(iter_nelems)};
508
+
509
+ constexpr size_t preferred_reductions_per_wi = 4 ;
510
+ size_t reductions_per_wi =
511
+ (reduction_nelems < preferred_reductions_per_wi * wg)
512
+ ? ((reduction_nelems + wg - 1 ) / wg)
513
+ : preferred_reductions_per_wi;
514
+
515
+ size_t reduction_groups =
516
+ (reduction_nelems + reductions_per_wi * wg - 1 ) /
517
+ (reductions_per_wi * wg);
518
+
519
+ auto gws = sycl::range<dim>{iter_nelems * reduction_groups * wg};
520
+ auto lws = sycl::range<dim>{wg};
521
+
522
+ cgh.parallel_for <class boolean_reduction_axis0_contig_krn <
523
+ argTy, resTy, RedOpT, GroupOpT, InputOutputIterIndexerT,
524
+ ReductionIndexerT>>(
525
+ sycl::nd_range<dim>(gws, lws),
526
+ StridedBooleanReduction<argTy, resTy, RedOpT, GroupOpT,
527
+ InputOutputIterIndexerT,
528
+ ReductionIndexerT>(
529
+ arg_tp, res_tp, RedOpT (), GroupOpT (), identity_val,
530
+ in_out_iter_indexer, reduction_indexer, reduction_nelems,
531
+ iter_nelems, reductions_per_wi));
532
+ });
533
+ return red_ev;
534
+ }
535
+ }
536
+
537
+ template <typename fnT, typename srcTy> struct AllAxis0ContigFactory
538
+ {
539
+ fnT get () const
540
+ {
541
+ using resTy = std::int32_t ;
542
+ using RedOpT = sycl::logical_and<resTy>;
543
+ using GroupOpT = all_reduce_wg_strided<resTy>;
544
+
545
+ return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl<
546
+ srcTy, resTy, RedOpT, GroupOpT>;
547
+ }
548
+ };
549
+
550
+ template <typename fnT, typename srcTy> struct AnyAxis0ContigFactory
551
+ {
552
+ fnT get () const
553
+ {
554
+ using resTy = std::int32_t ;
555
+ using RedOpT = sycl::logical_or<resTy>;
556
+ using GroupOpT = any_reduce_wg_strided<resTy>;
557
+
558
+ return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl<
559
+ srcTy, resTy, RedOpT, GroupOpT>;
478
560
}
479
561
};
480
562
@@ -527,8 +609,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
527
609
528
610
const sycl::device &d = exec_q.get_device ();
529
611
const auto &sg_sizes = d.get_info <sycl::info::device::sub_group_sizes>();
530
- size_t wg =
531
- 4 * (*std::max_element (std::begin (sg_sizes), std::end (sg_sizes)));
612
+ size_t wg = choose_workgroup_size<4 >(reduction_nelems, sg_sizes);
532
613
533
614
sycl::event red_ev;
534
615
if (reduction_nelems < wg) {
@@ -558,7 +639,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
558
639
});
559
640
}
560
641
else {
561
- sycl::event res_init_ev = exec_q.submit ([&](sycl::handler &cgh) {
642
+ sycl::event init_ev = exec_q.submit ([&](sycl::handler &cgh) {
562
643
using IndexerT =
563
644
dpctl::tensor::offset_utils::UnpackedStridedIndexer;
564
645
@@ -576,7 +657,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
576
657
});
577
658
});
578
659
red_ev = exec_q.submit ([&](sycl::handler &cgh) {
579
- cgh.depends_on (res_init_ev );
660
+ cgh.depends_on (init_ev );
580
661
581
662
constexpr std::uint8_t dim = 1 ;
582
663
0 commit comments