@@ -1094,7 +1094,7 @@ sycl::event reduction_over_group_temps_strided_impl(
1094
1094
// max_max_wg prevents running out of resources on CPU
1095
1095
constexpr size_t max_max_wg = 2048 ;
1096
1096
size_t max_wg = std::min (
1097
- max_max_wg, d.get_info <sycl::info::device::max_work_group_size>());
1097
+ max_max_wg, d.get_info <sycl::info::device::max_work_group_size>() / 2 );
1098
1098
1099
1099
size_t reductions_per_wi (preferrered_reductions_per_wi);
1100
1100
if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -1444,7 +1444,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl(
1444
1444
// max_max_wg prevents running out of resources on CPU
1445
1445
constexpr size_t max_max_wg = 2048 ;
1446
1446
size_t max_wg = std::min (
1447
- max_max_wg, d.get_info <sycl::info::device::max_work_group_size>());
1447
+ max_max_wg, d.get_info <sycl::info::device::max_work_group_size>() / 2 );
1448
1448
1449
1449
size_t reductions_per_wi (preferrered_reductions_per_wi);
1450
1450
if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -1788,7 +1788,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
1788
1788
// max_max_wg prevents running out of resources on CPU
1789
1789
constexpr size_t max_max_wg = 2048 ;
1790
1790
size_t max_wg = std::min (
1791
- max_max_wg, d.get_info <sycl::info::device::max_work_group_size>());
1791
+ max_max_wg, d.get_info <sycl::info::device::max_work_group_size>() / 2 );
1792
1792
1793
1793
size_t reductions_per_wi (preferrered_reductions_per_wi);
1794
1794
if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -3883,8 +3883,9 @@ sycl::event search_over_group_temps_strided_impl(
3883
3883
3884
3884
constexpr size_t preferrered_reductions_per_wi = 4 ;
3885
3885
// capping max_wg at 2048 prevents running out of resources on CPU
3886
- size_t max_wg = std::min (
3887
- size_t (2048 ), d.get_info <sycl::info::device::max_work_group_size>());
3886
+ size_t max_wg =
3887
+ std::min (size_t (2048 ),
3888
+ d.get_info <sycl::info::device::max_work_group_size>() / 2 );
3888
3889
3889
3890
size_t reductions_per_wi (preferrered_reductions_per_wi);
3890
3891
if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -4279,8 +4280,9 @@ sycl::event search_axis1_over_group_temps_contig_impl(
4279
4280
4280
4281
constexpr size_t preferrered_reductions_per_wi = 8 ;
4281
4282
// capping max_wg at 2048 prevents running out of resources on CPU
4282
- size_t max_wg = std::min (
4283
- size_t (2048 ), d.get_info <sycl::info::device::max_work_group_size>());
4283
+ size_t max_wg =
4284
+ std::min (size_t (2048 ),
4285
+ d.get_info <sycl::info::device::max_work_group_size>() / 2 );
4284
4286
4285
4287
size_t reductions_per_wi (preferrered_reductions_per_wi);
4286
4288
if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -4657,8 +4659,9 @@ sycl::event search_axis0_over_group_temps_contig_impl(
4657
4659
4658
4660
constexpr size_t preferrered_reductions_per_wi = 8 ;
4659
4661
// capping max_wg at 2048 prevents running out of resources on CPU
4660
- size_t max_wg = std::min (
4661
- size_t (2048 ), d.get_info <sycl::info::device::max_work_group_size>());
4662
+ size_t max_wg =
4663
+ std::min (size_t (2048 ),
4664
+ d.get_info <sycl::info::device::max_work_group_size>() / 2 );
4662
4665
4663
4666
size_t reductions_per_wi (preferrered_reductions_per_wi);
4664
4667
if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
0 commit comments