IntelPython
diff --git a/‎.github/workflows/generate-coverage.yaml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/generate-coverage.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/os-llvm-sycl-build.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/os-llvm-sycl-build.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
Lines changed: 131 additions & 50 deletions b/‎dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
Lines changed: 131 additions & 50 deletions
@@ -79,7 +79,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml]
+          pip install numpy"<1.26.0" cython setuptools pytest pytest-cov scikit-build cmake coverage[toml]
 
       - name: Build dpctl with coverage
         shell: bash -l {0}
 
@@ -108,7 +108,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest scikit-build cmake ninja
+          pip install numpy"<1.26.0" cython setuptools pytest scikit-build cmake ninja
 
       - name: Checkout repo
         uses: actions/checkout@v3
 
@@ -34,6 +34,7 @@
 #include "pybind11/pybind11.h"
 
 #include "utils/offset_utils.hpp"
+#include "utils/sycl_utils.hpp"
 #include "utils/type_dispatch.hpp"
 #include "utils/type_utils.hpp"
 
@@ -227,9 +228,8 @@ struct ContigBooleanReduction
 
     void operator()(sycl::nd_item<1> it) const
     {
-        const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
-        const size_t reduction_id = it.get_global_id(0) / red_gws_;
-        const size_t reduction_batch_id = get_reduction_batch_id(it);
+        const size_t reduction_id = it.get_group(0) % iter_gws_;
+        const size_t reduction_batch_id = it.get_group(0) / iter_gws_;
         const size_t wg_size = it.get_local_range(0);
 
         const size_t base = reduction_id * reduction_max_gid_;
@@ -241,14 +241,6 @@ struct ContigBooleanReduction
         // in group_op_
         group_op_(it, out_, reduction_id, inp_ + start, inp_ + end);
     }
-
-private:
-    size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const
-    {
-        const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_;
-        const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups;
-        return reduction_batch_id;
-    }
 };
 
 typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)(
@@ -268,17 +260,19 @@ class boolean_reduction_contig_krn;
 template <typename T1, typename T2, typename T3, typename T4, typename T5>
 class boolean_reduction_seq_contig_krn;
 
+using dpctl::tensor::sycl_utils::choose_workgroup_size;
+
 template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
 sycl::event
-boolean_reduction_contig_impl(sycl::queue exec_q,
-                              size_t iter_nelems,
-                              size_t reduction_nelems,
-                              const char *arg_cp,
-                              char *res_cp,
-                              py::ssize_t iter_arg_offset,
-                              py::ssize_t iter_res_offset,
-                              py::ssize_t red_arg_offset,
-                              const std::vector<sycl::event> &depends)
+boolean_reduction_axis1_contig_impl(sycl::queue exec_q,
+                                    size_t iter_nelems,
+                                    size_t reduction_nelems,
+                                    const char *arg_cp,
+                                    char *res_cp,
+                                    py::ssize_t iter_arg_offset,
+                                    py::ssize_t iter_res_offset,
+                                    py::ssize_t red_arg_offset,
+                                    const std::vector<sycl::event> &depends)
 {
     const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
                           iter_arg_offset + red_arg_offset;
@@ -288,8 +282,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
 
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    size_t wg =
-        4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
+    size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
 
     sycl::event red_ev;
     if (reduction_nelems < wg) {
@@ -322,18 +315,8 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
         });
     }
     else {
-        sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) {
-            using IndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            IndexerT res_indexer{};
-
-            cgh.depends_on(depends);
-
-            cgh.parallel_for(sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
-                auto res_offset = res_indexer(id[0]);
-                res_tp[res_offset] = identity_val;
-            });
-        });
+        sycl::event init_ev = exec_q.fill<resTy>(res_tp, resTy(identity_val),
+                                                 iter_nelems, depends);
         red_ev = exec_q.submit([&](sycl::handler &cgh) {
             cgh.depends_on(init_ev);
 
@@ -363,7 +346,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
     return red_ev;
 }
 
-template <typename fnT, typename srcTy> struct AllContigFactory
+template <typename fnT, typename srcTy> struct AllAxis1ContigFactory
 {
     fnT get() const
     {
@@ -372,12 +355,12 @@ template <typename fnT, typename srcTy> struct AllContigFactory
         using GroupOpT =
             all_reduce_wg_contig<srcTy, resTy, boolean_predicate<srcTy>>;
 
-        return dpctl::tensor::kernels::boolean_reduction_contig_impl<
+        return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl<
             srcTy, resTy, RedOpT, GroupOpT>;
     }
 };
 
-template <typename fnT, typename srcTy> struct AnyContigFactory
+template <typename fnT, typename srcTy> struct AnyAxis1ContigFactory
 {
     fnT get() const
     {
@@ -386,7 +369,7 @@ template <typename fnT, typename srcTy> struct AnyContigFactory
         using GroupOpT =
             any_reduce_wg_contig<srcTy, resTy, boolean_predicate<srcTy>>;
 
-        return dpctl::tensor::kernels::boolean_reduction_contig_impl<
+        return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl<
             srcTy, resTy, RedOpT, GroupOpT>;
     }
 };
@@ -433,9 +416,9 @@ struct StridedBooleanReduction
 
     void operator()(sycl::nd_item<1> it) const
     {
-        const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
-        const size_t reduction_id = it.get_global_id(0) / red_gws_;
-        const size_t reduction_batch_id = get_reduction_batch_id(it);
+        const size_t reduction_id = it.get_group(0) % iter_gws_;
+        const size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+
         const size_t reduction_lid = it.get_local_id(0);
         const size_t wg_size = it.get_local_range(0);
 
@@ -468,13 +451,112 @@ struct StridedBooleanReduction
         // in group_op_
         group_op_(it, out_, out_iter_offset, local_red_val);
     }
+};
+
+template <typename T1,
+          typename T2,
+          typename T3,
+          typename T4,
+          typename T5,
+          typename T6>
+class boolean_reduction_axis0_contig_krn;
+
+template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
+sycl::event
+boolean_reduction_axis0_contig_impl(sycl::queue exec_q,
+                                    size_t iter_nelems,
+                                    size_t reduction_nelems,
+                                    const char *arg_cp,
+                                    char *res_cp,
+                                    py::ssize_t iter_arg_offset,
+                                    py::ssize_t iter_res_offset,
+                                    py::ssize_t red_arg_offset,
+                                    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + red_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
+
+    constexpr resTy identity_val = sycl::known_identity<RedOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
 
-private:
-    size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const
     {
-        const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_;
-        const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups;
-        return reduction_batch_id;
+        sycl::event init_ev = exec_q.fill<resTy>(res_tp, resTy(identity_val),
+                                                 iter_nelems, depends);
+        sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(init_ev);
+
+            constexpr std::uint8_t dim = 1;
+
+            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
+            using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
+                    NoOpIndexerT, NoOpIndexerT>;
+            using ReductionIndexerT = ColsIndexerT;
+
+            NoOpIndexerT columns_indexer{};
+            NoOpIndexerT result_indexer{};
+            InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
+                                                        result_indexer};
+            ReductionIndexerT reduction_indexer{
+                0, static_cast<py::ssize_t>(reduction_nelems),
+                static_cast<py::ssize_t>(iter_nelems)};
+
+            constexpr size_t preferred_reductions_per_wi = 4;
+            size_t reductions_per_wi =
+                (reduction_nelems < preferred_reductions_per_wi * wg)
+                    ? ((reduction_nelems + wg - 1) / wg)
+                    : preferred_reductions_per_wi;
+
+            size_t reduction_groups =
+                (reduction_nelems + reductions_per_wi * wg - 1) /
+                (reductions_per_wi * wg);
+
+            auto gws = sycl::range<dim>{iter_nelems * reduction_groups * wg};
+            auto lws = sycl::range<dim>{wg};
+
+            cgh.parallel_for<class boolean_reduction_axis0_contig_krn<
+                argTy, resTy, RedOpT, GroupOpT, InputOutputIterIndexerT,
+                ReductionIndexerT>>(
+                sycl::nd_range<dim>(gws, lws),
+                StridedBooleanReduction<argTy, resTy, RedOpT, GroupOpT,
+                                        InputOutputIterIndexerT,
+                                        ReductionIndexerT>(
+                    arg_tp, res_tp, RedOpT(), GroupOpT(), identity_val,
+                    in_out_iter_indexer, reduction_indexer, reduction_nelems,
+                    iter_nelems, reductions_per_wi));
+        });
+        return red_ev;
+    }
+}
+
+template <typename fnT, typename srcTy> struct AllAxis0ContigFactory
+{
+    fnT get() const
+    {
+        using resTy = std::int32_t;
+        using RedOpT = sycl::logical_and<resTy>;
+        using GroupOpT = all_reduce_wg_strided<resTy>;
+
+        return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl<
+            srcTy, resTy, RedOpT, GroupOpT>;
+    }
+};
+
+template <typename fnT, typename srcTy> struct AnyAxis0ContigFactory
+{
+    fnT get() const
+    {
+        using resTy = std::int32_t;
+        using RedOpT = sycl::logical_or<resTy>;
+        using GroupOpT = any_reduce_wg_strided<resTy>;
+
+        return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl<
+            srcTy, resTy, RedOpT, GroupOpT>;
     }
 };
 
@@ -527,8 +609,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
 
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    size_t wg =
-        4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
+    size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
 
     sycl::event red_ev;
     if (reduction_nelems < wg) {
@@ -558,7 +639,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
         });
     }
     else {
-        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+        sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) {
             using IndexerT =
                 dpctl::tensor::offset_utils::UnpackedStridedIndexer;
 
@@ -576,7 +657,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
             });
         });
         red_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(res_init_ev);
+            cgh.depends_on(init_ev);
 
             constexpr std::uint8_t dim = 1;