@@ -218,6 +218,8 @@ DPCTLSyclEventRef dpnp_any_c(DPCTLSyclQueueRef q_ref,
218
218
const size_t size,
219
219
const DPCTLEventVectorRef dep_event_vec_ref)
220
220
{
221
+ static_assert (std::is_same_v<_ResultType, bool >, " Boolean result type is required" );
222
+
221
223
// avoid warning unused variable
222
224
(void )dep_event_vec_ref;
223
225
@@ -229,38 +231,50 @@ DPCTLSyclEventRef dpnp_any_c(DPCTLSyclQueueRef q_ref,
229
231
}
230
232
231
233
sycl::queue q = *(reinterpret_cast <sycl::queue*>(q_ref));
232
- sycl::event event;
233
234
234
- DPNPC_ptr_adapter<_DataType> input1_ptr (q_ref, array1_in, size);
235
- DPNPC_ptr_adapter<_ResultType> result1_ptr (q_ref, result1, 1 , true , true );
236
- const _DataType* array_in = input1_ptr.get_ptr ();
237
- _ResultType* result = result1_ptr.get_ptr ();
235
+ const _DataType* array_in = static_cast <const _DataType*>(array1_in);
236
+ bool * result = static_cast <bool *>(result1);
238
237
239
- result[ 0 ] = false ;
238
+ auto fill_event = q. fill (result, false , 1 ) ;
240
239
241
240
if (!size)
242
241
{
243
- return event_ref;
242
+ event_ref = reinterpret_cast <DPCTLSyclEventRef>(&fill_event);
243
+ return DPCTLEvent_Copy (event_ref);
244
244
}
245
245
246
- sycl::range<1 > gws (size);
247
- auto kernel_parallel_for_func = [=](sycl::id<1 > global_id) {
248
- size_t i = global_id[0 ];
246
+ constexpr size_t lws = 64 ;
247
+ constexpr size_t vec_sz = 8 ;
248
+
249
+ auto gws_range = sycl::range<1 >(((size + lws * vec_sz - 1 ) / (lws * vec_sz)) * lws);
250
+ auto lws_range = sycl::range<1 >(lws);
251
+ sycl::nd_range<1 > gws (gws_range, lws_range);
252
+
253
+ auto kernel_parallel_for_func = [=](sycl::nd_item<1 > nd_it) {
254
+ auto sg = nd_it.get_sub_group ();
255
+ const auto max_sg_size = sg.get_max_local_range ()[0 ];
256
+ const size_t start =
257
+ vec_sz * (nd_it.get_group (0 ) * nd_it.get_local_range (0 ) + sg.get_group_id ()[0 ] * max_sg_size);
258
+ const size_t end = sycl::min (start + vec_sz * max_sg_size, size);
249
259
250
- if (array_in[i])
260
+ // each work-item reduces over "vec_sz" elements in the input array
261
+ bool local_reduction = sycl::joint_any_of (
262
+ sg, &array_in[start], &array_in[end], [&](_DataType elem) { return elem != static_cast <_DataType>(0 ); });
263
+
264
+ if (sg.leader () && (local_reduction == true ))
251
265
{
252
266
result[0 ] = true ;
253
267
}
254
268
};
255
269
256
270
auto kernel_func = [&](sycl::handler& cgh) {
271
+ cgh.depends_on (fill_event);
257
272
cgh.parallel_for <class dpnp_any_c_kernel <_DataType, _ResultType>>(gws, kernel_parallel_for_func);
258
273
};
259
274
260
- event = q.submit (kernel_func);
275
+ auto event = q.submit (kernel_func);
261
276
262
277
event_ref = reinterpret_cast <DPCTLSyclEventRef>(&event);
263
-
264
278
return DPCTLEvent_Copy (event_ref);
265
279
}
266
280
@@ -275,6 +289,7 @@ void dpnp_any_c(const void* array1_in, void* result1, const size_t size)
275
289
size,
276
290
dep_event_vec_ref);
277
291
DPCTLEvent_WaitAndThrow (event_ref);
292
+ DPCTLEvent_Delete (event_ref);
278
293
}
279
294
280
295
template <typename _DataType, typename _ResultType>
@@ -829,6 +844,8 @@ void func_map_init_logic(func_map_t& fmap)
829
844
fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_LNG][eft_LNG] = {eft_LNG, (void *)dpnp_any_ext_c<int64_t , bool >};
830
845
fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_FLT][eft_FLT] = {eft_FLT, (void *)dpnp_any_ext_c<float , bool >};
831
846
fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_DBL][eft_DBL] = {eft_DBL, (void *)dpnp_any_ext_c<double , bool >};
847
+ fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_C64][eft_C64] = {eft_C64, (void *)dpnp_any_ext_c<std::complex<float >, bool >};
848
+ fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_C128][eft_C128] = {eft_C128, (void *)dpnp_any_ext_c<std::complex<double >, bool >};
832
849
833
850
func_map_logic_1arg_1type_helper<eft_BLN, eft_INT, eft_LNG, eft_FLT, eft_DBL>(fmap);
834
851
func_map_logic_2arg_2type_helper<eft_BLN, eft_INT, eft_LNG, eft_FLT, eft_DBL>(fmap);
0 commit comments