diff --git a/.github/workflows/generate-coverage.yaml b/.github/workflows/generate-coverage.yaml index 5975837d55..7e0c8b8667 100644 --- a/.github/workflows/generate-coverage.yaml +++ b/.github/workflows/generate-coverage.yaml @@ -79,7 +79,7 @@ jobs: - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml] + pip install numpy"<1.26.0" cython setuptools pytest pytest-cov scikit-build cmake coverage[toml] - name: Build dpctl with coverage shell: bash -l {0} diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml index d12747f3a9..e7a27f633c 100644 --- a/.github/workflows/os-llvm-sycl-build.yml +++ b/.github/workflows/os-llvm-sycl-build.yml @@ -108,7 +108,7 @@ jobs: - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest scikit-build cmake ninja + pip install numpy"<1.26.0" cython setuptools pytest scikit-build cmake ninja - name: Checkout repo uses: actions/checkout@v3 diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp index 8130fde96a..9b54e505f6 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp @@ -34,6 +34,7 @@ #include "pybind11/pybind11.h" #include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" @@ -227,9 +228,8 @@ struct ContigBooleanReduction void operator()(sycl::nd_item<1> it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t reduction_id = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t reduction_id = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; const size_t wg_size = 
it.get_local_range(0); const size_t base = reduction_id * reduction_max_gid_; @@ -241,14 +241,6 @@ struct ContigBooleanReduction // in group_op_ group_op_(it, out_, reduction_id, inp_ + start, inp_ + end); } - -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const - { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; - } }; typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)( @@ -268,17 +260,19 @@ class boolean_reduction_contig_krn; template class boolean_reduction_seq_contig_krn; +using dpctl::tensor::sycl_utils::choose_workgroup_size; + template sycl::event -boolean_reduction_contig_impl(sycl::queue exec_q, - size_t iter_nelems, - size_t reduction_nelems, - const char *arg_cp, - char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t red_arg_offset, - const std::vector &depends) +boolean_reduction_axis1_contig_impl(sycl::queue exec_q, + size_t iter_nelems, + size_t reduction_nelems, + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t red_arg_offset, + const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + iter_arg_offset + red_arg_offset; @@ -288,8 +282,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = - 4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); sycl::event red_ev; if (reduction_nelems < wg) { @@ -322,18 +315,8 @@ boolean_reduction_contig_impl(sycl::queue exec_q, }); } else { - sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { - using IndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - - IndexerT res_indexer{}; - - cgh.depends_on(depends); - - cgh.parallel_for(sycl::range<1>(iter_nelems), 
[=](sycl::id<1> id) { - auto res_offset = res_indexer(id[0]); - res_tp[res_offset] = identity_val; - }); - }); + sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), + iter_nelems, depends); red_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); @@ -363,7 +346,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q, return red_ev; } -template struct AllContigFactory +template struct AllAxis1ContigFactory { fnT get() const { @@ -372,12 +355,12 @@ template struct AllContigFactory using GroupOpT = all_reduce_wg_contig>; - return dpctl::tensor::kernels::boolean_reduction_contig_impl< + return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl< srcTy, resTy, RedOpT, GroupOpT>; } }; -template struct AnyContigFactory +template struct AnyAxis1ContigFactory { fnT get() const { @@ -386,7 +369,7 @@ template struct AnyContigFactory using GroupOpT = any_reduce_wg_contig>; - return dpctl::tensor::kernels::boolean_reduction_contig_impl< + return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl< srcTy, resTy, RedOpT, GroupOpT>; } }; @@ -433,9 +416,9 @@ struct StridedBooleanReduction void operator()(sycl::nd_item<1> it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t reduction_id = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t reduction_id = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t reduction_lid = it.get_local_id(0); const size_t wg_size = it.get_local_range(0); @@ -468,13 +451,112 @@ struct StridedBooleanReduction // in group_op_ group_op_(it, out_, out_iter_offset, local_red_val); } +}; + +template +class boolean_reduction_axis0_contig_krn; + +template +sycl::event +boolean_reduction_axis0_contig_impl(sycl::queue exec_q, + size_t iter_nelems, + size_t reduction_nelems, + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, 
+ py::ssize_t red_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + red_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr resTy identity_val = sycl::known_identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; + sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), + iter_nelems, depends); + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(init_ev); + + constexpr std::uint8_t dim = 1; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + constexpr size_t preferred_reductions_per_wi = 4; + size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
((reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto gws = sycl::range{iter_nelems * reduction_groups * wg}; + auto lws = sycl::range{wg}; + + cgh.parallel_for>( + sycl::nd_range(gws, lws), + StridedBooleanReduction( + arg_tp, res_tp, RedOpT(), GroupOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + return red_ev; + } +} + +template struct AllAxis0ContigFactory +{ + fnT get() const + { + using resTy = std::int32_t; + using RedOpT = sycl::logical_and; + using GroupOpT = all_reduce_wg_strided; + + return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl< + srcTy, resTy, RedOpT, GroupOpT>; + } +}; + +template struct AnyAxis0ContigFactory +{ + fnT get() const + { + using resTy = std::int32_t; + using RedOpT = sycl::logical_or; + using GroupOpT = any_reduce_wg_strided; + + return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl< + srcTy, resTy, RedOpT, GroupOpT>; } }; @@ -527,8 +609,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = - 4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); sycl::event red_ev; if (reduction_nelems < wg) { @@ -558,7 +639,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, }); } else { - sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { using IndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; @@ -576,7 +657,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, }); }); red_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); + cgh.depends_on(init_ev); constexpr std::uint8_t dim = 1; diff --git 
a/dpctl/tensor/libtensor/source/boolean_reductions.cpp b/dpctl/tensor/libtensor/source/boolean_reductions.cpp index 5def6c5158..db07d05c73 100644 --- a/dpctl/tensor/libtensor/source/boolean_reductions.cpp +++ b/dpctl/tensor/libtensor/source/boolean_reductions.cpp @@ -58,7 +58,9 @@ using dpctl::tensor::kernels::boolean_reduction_strided_impl_fn_ptr; static boolean_reduction_strided_impl_fn_ptr all_reduction_strided_dispatch_vector[td_ns::num_types]; static boolean_reduction_contig_impl_fn_ptr - all_reduction_contig_dispatch_vector[td_ns::num_types]; + all_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static boolean_reduction_contig_impl_fn_ptr + all_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; void populate_all_dispatch_vectors(void) { @@ -74,11 +76,19 @@ void populate_all_dispatch_vectors(void) using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::AllContigFactory; + using dpctl::tensor::kernels::AllAxis1ContigFactory; DispatchVectorBuilder + AllAxis1ContigFactory, td_ns::num_types> all_dvb2; - all_dvb2.populate_dispatch_vector(all_reduction_contig_dispatch_vector); + all_dvb2.populate_dispatch_vector( + all_reduction_axis1_contig_dispatch_vector); + + using dpctl::tensor::kernels::AllAxis0ContigFactory; + DispatchVectorBuilder + all_dvb3; + all_dvb3.populate_dispatch_vector( + all_reduction_axis0_contig_dispatch_vector); }; } // namespace impl @@ -91,7 +101,9 @@ static boolean_reduction_strided_impl_fn_ptr any_reduction_strided_dispatch_vector[td_ns::num_types]; using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; static boolean_reduction_contig_impl_fn_ptr - any_reduction_contig_dispatch_vector[td_ns::num_types]; + any_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static boolean_reduction_contig_impl_fn_ptr + any_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; void populate_any_dispatch_vectors(void) { @@ -107,11 +119,19 @@ void 
populate_any_dispatch_vectors(void) using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::AnyContigFactory; + using dpctl::tensor::kernels::AnyAxis1ContigFactory; DispatchVectorBuilder + AnyAxis1ContigFactory, td_ns::num_types> any_dvb2; - any_dvb2.populate_dispatch_vector(any_reduction_contig_dispatch_vector); + any_dvb2.populate_dispatch_vector( + any_reduction_axis1_contig_dispatch_vector); + + using dpctl::tensor::kernels::AnyAxis0ContigFactory; + DispatchVectorBuilder + any_dvb3; + any_dvb3.populate_dispatch_vector( + any_reduction_axis0_contig_dispatch_vector); }; } // namespace impl @@ -124,16 +144,18 @@ void init_boolean_reduction_functions(py::module_ m) // ALL { impl::populate_all_dispatch_vectors(); - using impl::all_reduction_contig_dispatch_vector; + using impl::all_reduction_axis0_contig_dispatch_vector; + using impl::all_reduction_axis1_contig_dispatch_vector; using impl::all_reduction_strided_dispatch_vector; auto all_pyapi = [&](arrayT src, int trailing_dims_to_reduce, arrayT dst, sycl::queue exec_q, const event_vecT &depends = {}) { - return py_boolean_reduction(src, trailing_dims_to_reduce, dst, - exec_q, depends, - all_reduction_contig_dispatch_vector, - all_reduction_strided_dispatch_vector); + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + all_reduction_axis1_contig_dispatch_vector, + all_reduction_axis0_contig_dispatch_vector, + all_reduction_strided_dispatch_vector); }; m.def("_all", all_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -143,16 +165,18 @@ void init_boolean_reduction_functions(py::module_ m) // ANY { impl::populate_any_dispatch_vectors(); - using impl::any_reduction_contig_dispatch_vector; + using impl::any_reduction_axis0_contig_dispatch_vector; + using impl::any_reduction_axis1_contig_dispatch_vector; using impl::any_reduction_strided_dispatch_vector; auto any_pyapi = [&](arrayT src, int 
trailing_dims_to_reduce, arrayT dst, sycl::queue exec_q, const event_vecT &depends = {}) { - return py_boolean_reduction(src, trailing_dims_to_reduce, dst, - exec_q, depends, - any_reduction_contig_dispatch_vector, - any_reduction_strided_dispatch_vector); + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + any_reduction_axis1_contig_dispatch_vector, + any_reduction_axis0_contig_dispatch_vector, + any_reduction_strided_dispatch_vector); }; m.def("_any", any_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), diff --git a/dpctl/tensor/libtensor/source/boolean_reductions.hpp b/dpctl/tensor/libtensor/source/boolean_reductions.hpp index 3d5970d783..591439a7c9 100644 --- a/dpctl/tensor/libtensor/source/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/source/boolean_reductions.hpp @@ -54,7 +54,8 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, sycl::queue exec_q, const std::vector &depends, - const contig_dispatchT &contig_dispatch_vector, + const contig_dispatchT &axis1_contig_dispatch_vector, + const contig_dispatchT &axis0_contig_dispatch_vector, const strided_dispatchT &strided_dispatch_vector) { int src_nd = src.get_ndim(); @@ -131,16 +132,32 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, bool is_src_c_contig = src.is_c_contiguous(); bool is_src_f_contig = src.is_f_contiguous(); - bool is_dst_c_contig = dst.is_c_contiguous(); if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nd == 0)) { - auto fn = contig_dispatch_vector[src_typeid]; + (is_src_f_contig && dst_nelems == 1)) + { + auto fn = axis1_contig_dispatch_vector[src_typeid]; constexpr py::ssize_t zero_offset = 0; - auto red_ev = fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, - zero_offset, zero_offset, zero_offset, depends); + sycl::event red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); + + sycl::event
keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + constexpr py::ssize_t zero_offset = 0; + + sycl::event red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); sycl::event keep_args_event = dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); @@ -196,24 +213,48 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); } - if ((simplified_red_nd == 1) && (simplified_red_src_strides[0] == 1) && - (iter_nd == 1) && - ((simplified_iter_shape[0] == 1) || - ((simplified_iter_dst_strides[0] == 1) && - (simplified_iter_src_strides[0] == - static_cast(red_nelems))))) - { - auto fn = contig_dispatch_vector[src_typeid]; + if (simplified_red_nd == 1 && iter_nd == 1) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; size_t iter_nelems = dst_nelems; - sycl::event red_ev = - fn(exec_q, iter_nelems, red_nelems, src.get_data(), dst.get_data(), - iter_src_offset, iter_dst_offset, red_src_offset, depends); + if (simplified_red_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iter_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iter_dst_strides[0] == 1) && + (static_cast(simplified_iter_src_strides[0]) == + red_nelems); + } + else if (static_cast(simplified_red_src_strides[0]) == + iter_nelems) { + mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) && + (simplified_iter_src_strides[0] == 1); + } + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_contig_dispatch_vector[src_typeid]; - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + sycl::event red_ev = + 
fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); - return std::make_pair(keep_args_event, red_ev); + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } } auto fn = strided_dispatch_vector[src_typeid];