@@ -233,6 +233,8 @@ DPCTLSyclEventRef dpnp_any_c(DPCTLSyclQueueRef q_ref,
233
233
const size_t size,
234
234
const DPCTLEventVectorRef dep_event_vec_ref)
235
235
{
236
+ static_assert (std::is_same_v<_ResultType, bool >, " Boolean result type is required" );
237
+
236
238
// avoid warning unused variable
237
239
(void )dep_event_vec_ref;
238
240
@@ -244,38 +246,50 @@ DPCTLSyclEventRef dpnp_any_c(DPCTLSyclQueueRef q_ref,
244
246
}
245
247
246
248
sycl::queue q = *(reinterpret_cast <sycl::queue*>(q_ref));
247
- sycl::event event;
248
249
249
- DPNPC_ptr_adapter<_DataType> input1_ptr (q_ref, array1_in, size);
250
- DPNPC_ptr_adapter<_ResultType> result1_ptr (q_ref, result1, 1 , true , true );
251
- const _DataType* array_in = input1_ptr.get_ptr ();
252
- _ResultType* result = result1_ptr.get_ptr ();
250
+ const _DataType* array_in = static_cast <const _DataType*>(array1_in);
251
+ bool * result = static_cast <bool *>(result1);
253
252
254
- result[ 0 ] = false ;
253
+ auto fill_event = q. fill (result, false , 1 ) ;
255
254
256
255
if (!size)
257
256
{
258
- return event_ref;
257
+ event_ref = reinterpret_cast <DPCTLSyclEventRef>(&fill_event);
258
+ return DPCTLEvent_Copy (event_ref);
259
259
}
260
260
261
- sycl::range<1 > gws (size);
262
- auto kernel_parallel_for_func = [=](sycl::id<1 > global_id) {
263
- size_t i = global_id[0 ];
261
+ constexpr size_t lws = 64 ;
262
+ constexpr size_t vec_sz = 8 ;
264
263
265
- if (array_in[i])
264
+ auto gws_range = sycl::range<1 >(((size + lws * vec_sz - 1 ) / (lws * vec_sz)) * lws);
265
+ auto lws_range = sycl::range<1 >(lws);
266
+ sycl::nd_range<1 > gws (gws_range, lws_range);
267
+
268
+ auto kernel_parallel_for_func = [=](sycl::nd_item<1 > nd_it) {
269
+ auto gr = nd_it.get_group ();
270
+ const auto max_gr_size = gr.get_max_local_range ()[0 ];
271
+ const size_t start =
272
+ vec_sz * (nd_it.get_group (0 ) * nd_it.get_local_range (0 ) + gr.get_group_id ()[0 ] * max_gr_size);
273
+ const size_t end = sycl::min (start + vec_sz * max_gr_size, size);
274
+
275
+ // each work-item reduces over "vec_sz" elements in the input array
276
+ bool local_reduction = sycl::joint_any_of (
277
+ gr, &array_in[start], &array_in[end], [&](_DataType elem) { return elem != static_cast <_DataType>(0 ); });
278
+
279
+ if (gr.leader () && (local_reduction == true ))
266
280
{
267
281
result[0 ] = true ;
268
282
}
269
283
};
270
284
271
285
auto kernel_func = [&](sycl::handler& cgh) {
286
+ cgh.depends_on (fill_event);
272
287
cgh.parallel_for <class dpnp_any_c_kernel <_DataType, _ResultType>>(gws, kernel_parallel_for_func);
273
288
};
274
289
275
- event = q.submit (kernel_func);
290
+ auto event = q.submit (kernel_func);
276
291
277
292
event_ref = reinterpret_cast <DPCTLSyclEventRef>(&event);
278
-
279
293
return DPCTLEvent_Copy (event_ref);
280
294
}
281
295
@@ -290,6 +304,7 @@ void dpnp_any_c(const void* array1_in, void* result1, const size_t size)
290
304
size,
291
305
dep_event_vec_ref);
292
306
DPCTLEvent_WaitAndThrow (event_ref);
307
+ DPCTLEvent_Delete (event_ref);
293
308
}
294
309
295
310
template <typename _DataType, typename _ResultType>
@@ -846,6 +861,8 @@ void func_map_init_logic(func_map_t& fmap)
846
861
fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_LNG][eft_LNG] = {eft_LNG, (void *)dpnp_any_ext_c<int64_t , bool >};
847
862
fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_FLT][eft_FLT] = {eft_FLT, (void *)dpnp_any_ext_c<float , bool >};
848
863
fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_DBL][eft_DBL] = {eft_DBL, (void *)dpnp_any_ext_c<double , bool >};
864
+ fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_C64][eft_C64] = {eft_C64, (void *)dpnp_any_ext_c<std::complex<float >, bool >};
865
+ fmap[DPNPFuncName::DPNP_FN_ANY_EXT][eft_C128][eft_C128] = {eft_C128, (void *)dpnp_any_ext_c<std::complex<double >, bool >};
849
866
850
867
func_map_logic_1arg_1type_helper<eft_BLN, eft_INT, eft_LNG, eft_FLT, eft_DBL>(fmap);
851
868
func_map_logic_2arg_2type_helper<eft_BLN, eft_INT, eft_LNG, eft_FLT, eft_DBL>(fmap);
0 commit comments