@@ -41,6 +41,8 @@ DPCTLSyclEventRef dpnp_all_c(DPCTLSyclQueueRef q_ref,
41
41
const size_t size,
42
42
const DPCTLEventVectorRef dep_event_vec_ref)
43
43
{
44
+ static_assert (std::is_same_v<_ResultType, bool >, " Boolean result type is required" );
45
+
44
46
// avoid warning unused variable
45
47
(void )dep_event_vec_ref;
46
48
@@ -52,38 +54,50 @@ DPCTLSyclEventRef dpnp_all_c(DPCTLSyclQueueRef q_ref,
52
54
}
53
55
54
56
sycl::queue q = *(reinterpret_cast <sycl::queue*>(q_ref));
55
- sycl::event event;
56
57
57
- DPNPC_ptr_adapter<_DataType> input1_ptr (q_ref, array1_in, size);
58
- DPNPC_ptr_adapter<_ResultType> result1_ptr (q_ref, result1, 1 , true , true );
59
- const _DataType* array_in = input1_ptr.get_ptr ();
60
- _ResultType* result = result1_ptr.get_ptr ();
58
+ const _DataType* array_in = static_cast <const _DataType*>(array1_in);
59
+ bool * result = static_cast <bool *>(result1);
61
60
62
- result[ 0 ] = true ;
61
+ auto fill_event = q. fill (result, true , 1 ) ;
63
62
64
63
if (!size)
65
64
{
66
- return event_ref;
65
+ event_ref = reinterpret_cast <DPCTLSyclEventRef>(&fill_event);
66
+ return DPCTLEvent_Copy (event_ref);
67
67
}
68
68
69
- sycl::range<1 > gws (size);
70
- auto kernel_parallel_for_func = [=](sycl::id<1 > global_id) {
71
- size_t i = global_id[0 ];
69
+ constexpr size_t lws = 64 ;
70
+ constexpr size_t vec_sz = 8 ;
71
+
72
+ auto gws_range = sycl::range<1 >(((size + lws * vec_sz - 1 ) / (lws * vec_sz)) * lws);
73
+ auto lws_range = sycl::range<1 >(lws);
74
+ sycl::nd_range<1 > gws (gws_range, lws_range);
72
75
73
- if (!array_in[i])
76
+ auto kernel_parallel_for_func = [=](sycl::nd_item<1 > nd_it) {
77
+ auto gr = nd_it.get_group ();
78
+ const auto max_gr_size = gr.get_max_local_range ()[0 ];
79
+ const size_t start =
80
+ vec_sz * (nd_it.get_group (0 ) * nd_it.get_local_range (0 ) + gr.get_group_id ()[0 ] * max_gr_size);
81
+ const size_t end = sycl::min (start + vec_sz * max_gr_size, size);
82
+
83
+ // each work-item reduces over "vec_sz" elements in the input array
84
+ bool local_reduction = sycl::joint_none_of (
85
+ gr, &array_in[start], &array_in[end], [&](_DataType elem) { return elem == static_cast <_DataType>(0 ); });
86
+
87
+ if (gr.leader () && (local_reduction == false ))
74
88
{
75
89
result[0 ] = false ;
76
90
}
77
91
};
78
92
79
93
auto kernel_func = [&](sycl::handler& cgh) {
94
+ cgh.depends_on (fill_event);
80
95
cgh.parallel_for <class dpnp_all_c_kernel <_DataType, _ResultType>>(gws, kernel_parallel_for_func);
81
96
};
82
97
83
- event = q.submit (kernel_func);
98
+ auto event = q.submit (kernel_func);
84
99
85
100
event_ref = reinterpret_cast <DPCTLSyclEventRef>(&event);
86
-
87
101
return DPCTLEvent_Copy (event_ref);
88
102
}
89
103
@@ -98,6 +112,7 @@ void dpnp_all_c(const void* array1_in, void* result1, const size_t size)
98
112
size,
99
113
dep_event_vec_ref);
100
114
DPCTLEvent_WaitAndThrow (event_ref);
115
+ DPCTLEvent_Delete (event_ref);
101
116
}
102
117
103
118
template <typename _DataType, typename _ResultType>
@@ -751,6 +766,8 @@ void func_map_init_logic(func_map_t& fmap)
751
766
fmap[DPNPFuncName::DPNP_FN_ALL_EXT][eft_LNG][eft_LNG] = {eft_LNG, (void *)dpnp_all_ext_c<int64_t , bool >};
752
767
fmap[DPNPFuncName::DPNP_FN_ALL_EXT][eft_FLT][eft_FLT] = {eft_FLT, (void *)dpnp_all_ext_c<float , bool >};
753
768
fmap[DPNPFuncName::DPNP_FN_ALL_EXT][eft_DBL][eft_DBL] = {eft_DBL, (void *)dpnp_all_ext_c<double , bool >};
769
+ fmap[DPNPFuncName::DPNP_FN_ALL_EXT][eft_C64][eft_C64] = {eft_C64, (void *)dpnp_all_ext_c<std::complex<float >, bool >};
770
+ fmap[DPNPFuncName::DPNP_FN_ALL_EXT][eft_C128][eft_C128] = {eft_C128, (void *)dpnp_all_ext_c<std::complex<double >, bool >};
754
771
755
772
fmap[DPNPFuncName::DPNP_FN_ALLCLOSE][eft_INT][eft_INT] = {eft_BLN,
756
773
(void *)dpnp_allclose_default_c<int32_t , int32_t , bool >};
0 commit comments