diff --git a/.github/workflows/generate-coverage.yaml b/.github/workflows/generate-coverage.yaml index 5975837d55..7e0c8b8667 100644 --- a/.github/workflows/generate-coverage.yaml +++ b/.github/workflows/generate-coverage.yaml @@ -79,7 +79,7 @@ jobs: - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml] + pip install numpy"<1.26.0" cython setuptools pytest pytest-cov scikit-build cmake coverage[toml] - name: Build dpctl with coverage shell: bash -l {0} diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml index d12747f3a9..e7a27f633c 100644 --- a/.github/workflows/os-llvm-sycl-build.yml +++ b/.github/workflows/os-llvm-sycl-build.yml @@ -108,7 +108,7 @@ jobs: - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest scikit-build cmake ninja + pip install numpy"<1.26.0" cython setuptools pytest scikit-build cmake ninja - name: Checkout repo uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index f85446fc9a..fc5bb4db36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,15 +4,43 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [dev] +## [0.15.0] ### Added + +* Added `dpctl.tensor.floor`, `dpctl.tensor.ceil`, `dpctl.tensor.trunc` elementwise functions. +* Added `dpctl.tensor.hypot`, `dpctl.tensor.logaddexp` elementwise functions. +* Added trigonometric (`dpctl.tensor.sin`, `dpctl.tensor.cos`, `dpctl.tensor.tan`) and hyperbolic (`dpctl.tensor.sinh`, `dpctl.tensor.cosh`, `dpctl.tensor.tanh`) elementwise functions and their inverses (`dpctl.tensor.asin`, `dpctl.tensor.asinh`, `dpctl.tensor.acos`, `dpctl.tensor.acosh`, `dpctl.tensor.atan`, `dpctl.tensor.atanh`). +* Added `dpctl.tensor.round` function. +* Added `dpctl.tensor.sign` and `dpctl.tensor.remainder` elementwise functions. +* Added bitwise elementwise functions `dpctl.tensor.bitwise_and`, `dpctl.tensor.bitwise_xor`, `dpctl.tensor.bitwise_or`, `dpctl.tensor.bitwise_invert` +* Added bitwise shift functions `dpctl.tensor.bitwise_left_shift` and `dpctl.tensor.bitwise_right_shift`. +* Added `dpctl.tensor.atan2` and `dpctl.tensor.signbit` elementwise functions. +* Added `dpctl.tensor.minumum` and `dpctl.tensor.maximum` binary elementwise functions. +* Supported equality checking and hashing for `dpctl.SyclPlatform`. +* Implemented `types` property for all unary and binary elementwise functions [#1361](https://github.com/IntelPython/dpctl/pull/1361) +* Added `dpctl.tensor.repeat` and `dpctl.tensor.tile` functions. +* Added `dpctl.tensor.matrix_transpose ` function. + ### Changed -* Removed `dpctl.tensor.numpy_usm_shared` obsolete class and associated tests which were being skipped +* Enabled support for Python arithmetic, in-place arithmetic, reflexive arithmetic, comparison, and bitwise operators for `dpctl.tensor.usm_ndarray` type [#1324](https://github.com/IntelPython/dpctl/pull/1324). +* Removed `dpctl.tensor.numpy_usm_shared` obsolete class and associated tests which were being skipped [#1310](https://github.com/IntelPython/dpctl/pull/1310) +* Transitioned `dpctl` codebase to Cython 3. +* Improved performance of boolean reduction functions `dpctl.tensor.all` and `dpctl.tensor.any`. 
+* Improved performance of summation function `dpctl.tensor.sum`. +* Improved in-place arithmetic operations for addition, subtraction and multiplication. +* Updated codebase per SYCL-2020 intel/llvm compiler deprecation warnings. +* Improved performance of advanced boolean indexing for arrays whose size fits in 32-bit signed integer type. +* Removed deprecated `DPCTLDevice_GetMaxWorkItemSizes` function from the SyclInterface library. +* Improved performance of `dpctl.tensor.reshape` in the case when a copy is being made. +* Improved performance of `dpctl.tensor.roll` function. ### Fixed +* Fixed issues identified by Coverity security scans. +* Fixed issues [#1279](https://github.com/IntelPython/dpctl/issues/1279), [#1350](https://github.com/IntelPython/dpctl/issues/1350), [#1344](https://github.com/IntelPython/dpctl/issues/1344), [#1327](https://github.com/IntelPython/dpctl/issues/1327), [#1241](https://github.com/IntelPython/dpctl/issues/1241), [#1250](https://github.com/IntelPython/dpctl/issues/1250), [#1293](https://github.com/IntelPython/dpctl/issues/1293). + ## [0.14.5] - 07/17/2023 ### Added diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 49f25aef6a..456eebdbaa 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -33,6 +33,7 @@ endif() set(python_module_name _tensor_impl) pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -49,6 +50,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ) set(_clang_prefix "") if (WIN32) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index ad51689f3f..f0930004ec 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -72,11 +72,13 @@ iinfo, moveaxis, permute_dims, + repeat, result_type, roll, squeeze, stack, swapaxes, + tile, unstack, ) from dpctl.tensor._print import ( @@ -305,4 +307,6 @@ "tanh", "trunc", "allclose", + "repeat", + "tile", ] diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index cb54556ed2..7201cd96fb 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -15,10 +15,11 @@ # limitations under the License. 
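The changelog above lists the new elementwise functions and the Python operator support added to `dpctl.tensor.usm_ndarray` in 0.15.0. A minimal, hedged usage sketch of a few of them follows; it assumes a default SYCL device is available and uses only functions named in the changelog plus the long-standing `asarray`/`asnumpy` helpers.

```python
# Hedged sketch of a few 0.15.0 additions named in the changelog:
# hypot, logaddexp, the bitwise functions, and in-place arithmetic on
# usm_ndarray. Numeric output depends on the selected SYCL device.
import dpctl.tensor as dpt

a = dpt.asarray([3.0, 5.0, 8.0], dtype="f4")
b = dpt.asarray([4.0, 12.0, 15.0], dtype="f4")

c = dpt.hypot(a, b)        # element-wise sqrt(a*a + b*b) -> [5, 13, 17]
d = dpt.logaddexp(a, b)    # log(exp(a) + exp(b)) computed stably

m = dpt.asarray([0b1100, 0b1010], dtype="i4")
n = dpt.asarray([0b1010, 0b0110], dtype="i4")
k = dpt.bitwise_and(m, n)  # also reachable through the & operator

a += b                     # in-place arithmetic operator on usm_ndarray
print(dpt.asnumpy(c), dpt.asnumpy(d), dpt.asnumpy(k), dpt.asnumpy(a))
```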
+import itertools import operator -from itertools import chain, repeat import numpy as np +from numpy import AxisError from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple import dpctl @@ -133,7 +134,9 @@ def _broadcast_shape_impl(shapes): diff = biggest - nds[i] if diff > 0: ty = type(shapes[i]) - shapes[i] = ty(chain(repeat(1, diff), shapes[i])) + shapes[i] = ty( + itertools.chain(itertools.repeat(1, diff), shapes[i]) + ) common_shape = [] for axis in range(biggest): lengths = [s[axis] for s in shapes] @@ -918,6 +921,302 @@ def swapaxes(X, axis1, axis2): return dpt.permute_dims(X, tuple(ind)) +def repeat(x, repeats, axis=None): + """repeat(x, repeats, axis=None) + + Repeat elements of an array. + + Args: + x (usm_ndarray): input array + + repeat (Union[int, Tuple[int, ...]]): + The number of repetitions for each element. + `repeats` is broadcasted to fit the shape of the given axis. + + axis (Optional[int]): + The axis along which to repeat values. The `axis` is required + if input array has more than one dimension. + + Returns: + usm_narray: + Array with repeated elements. + The returned array must have the same data type as `x`, + is created on the same device as `x` and has the same USM + allocation type as `x`. + + Raises: + AxisError: if `axis` value is invalid. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + f"`axis` cannot be `None` for array of dimension {x_ndim}" + ) + axis = 0 + + x_shape = x.shape + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + axis_size = x_shape[axis] + else: + if axis != 0: + AxisError("`axis` must be `0` for input of dimension `0`") + axis_size = x.size + + scalar = False + if isinstance(repeats, int): + if repeats < 0: + raise ValueError("`repeats` must be a positive integer") + usm_type = x.usm_type + exec_q = x.sycl_queue + scalar = True + elif isinstance(repeats, dpt.usm_ndarray): + if repeats.ndim > 1: + raise ValueError( + "`repeats` array must be 0- or 1-dimensional, got" + "{repeats.ndim}" + ) + exec_q = dpctl.utils.get_execution_queue( + (x.sycl_queue, repeats.sycl_queue) + ) + if exec_q is None: + raise dputils.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + usm_type = dpctl.utils.get_coerced_usm_type( + ( + x.usm_type, + repeats.usm_type, + ) + ) + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): + raise TypeError( + f"`repeats` data type `{repeats.dtype}` cannot be cast to " + "`int64` according to the casting rule ''safe.''" + ) + if repeats.size == 1: + scalar = True + # bring the single element to the host + repeats = int(repeats) + if repeats < 0: + raise ValueError("`repeats` elements must be positive") + else: + if repeats.size != axis_size: + raise ValueError( + "`repeats` array must be broadcastable to the size of " + "the repeated axis" + ) + if not dpt.all(repeats >= 0): + raise ValueError("`repeats` elements must be positive") + + elif isinstance(repeats, tuple): + usm_type = x.usm_type + exec_q = x.sycl_queue + + len_reps = len(repeats) + if len_reps != axis_size: + raise ValueError( + "`repeats` tuple must have the same length as the repeated " + "axis" + ) + elif len_reps == 1: + repeats = repeats[0] + if repeats < 0: + raise ValueError("`repeats` elements must be positive") + scalar = True + else: + repeats = dpt.asarray( + repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q + ) + if not dpt.all(repeats >= 0): + raise ValueError("`repeats` elements must be positive") + else: + raise TypeError( + "Expected int, tuple, or `usm_ndarray` for second argument," + f"got {type(repeats)}" + ) + + if axis_size == 0: + return dpt.empty(x_shape, dtype=x.dtype, sycl_queue=exec_q) + + if scalar: + res_axis_size = repeats * axis_size + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + ) + if res_axis_size > 0: + ht_rep_ev, _ = ti._repeat_by_scalar( + src=x, + dst=res, + reps=repeats, + axis=axis, + sycl_queue=exec_q, + ) + ht_rep_ev.wait() + else: + if repeats.dtype != dpt.int64: + rep_buf = dpt.empty( + repeats.shape, + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=repeats, dst=rep_buf, sycl_queue=exec_q + ) + cumsum = dpt.empty( + (axis_size,), + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + # _cumsum_1d synchronizes so `depends` ends here safely + res_axis_size = ti._cumsum_1d( + rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] + ) + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + ) + if res_axis_size > 0: + ht_rep_ev, _ = ti._repeat_by_sequence( + src=x, + dst=res, + reps=rep_buf, + cumsum=cumsum, + axis=axis, + sycl_queue=exec_q, + ) + ht_rep_ev.wait() + ht_copy_ev.wait() + else: + cumsum = dpt.empty( + (axis_size,), + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + # _cumsum_1d synchronizes so `depends` ends here safely + res_axis_size = ti._cumsum_1d(repeats, cumsum, sycl_queue=exec_q) + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + ) + if res_axis_size > 0: + ht_rep_ev, _ = ti._repeat_by_sequence( + src=x, + dst=res, + reps=repeats, + cumsum=cumsum, + axis=axis, + sycl_queue=exec_q, + ) + ht_rep_ev.wait() + return res + + +def tile(x, repetitions): + """tile(x, repetitions) + + Repeat an input array `x` along each axis a number of times given by + `repetitions`. 
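The `repeat` implementation above accepts an integer, a tuple, or a `usm_ndarray` of per-element counts. A short, hedged usage sketch follows; shapes and values follow directly from the code above, and a default SYCL device is assumed.

```python
# Hedged usage sketch for dpt.repeat as implemented above. `axis` is
# mandatory once the input has more than one dimension.
import dpctl.tensor as dpt

x = dpt.asarray([[1, 2], [3, 4]], dtype="i8")

r0 = dpt.repeat(x, 2, axis=0)             # shape (4, 2): every row twice
r1 = dpt.repeat(x, (1, 3), axis=1)        # shape (2, 4): 2nd column three times

counts = dpt.asarray([2, 0], dtype="i8")  # per-element counts along axis 0
r2 = dpt.repeat(x, counts, axis=0)        # shape (2, 2): second row dropped

print(dpt.asnumpy(r0))
print(dpt.asnumpy(r1))
print(dpt.asnumpy(r2))
```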
+ + For `N` = len(`repetitions`) and `M` = len(`x.shape`): + - if `M < N`, `x` will have `N - M` new axes prepended to its shape + - if `M > N`, `repetitions` will have `M - N` new axes 1 prepended to it + + Args: + x (usm_ndarray): input array + + repetitions (Union[int, Tuple[int, ...]]): + The number of repetitions for each dimension. + + Returns: + usm_narray: + Array with tiled elements. + The returned array must have the same data type as `x`, + is created on the same device as `x` and has the same USM + allocation type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + if not isinstance(repetitions, tuple): + if isinstance(repetitions, int): + repetitions = (repetitions,) + else: + raise TypeError( + f"Expected tuple or integer type, got {type(repetitions)}." + ) + + # case of scalar + if x.size == 1: + if not repetitions: + # handle empty tuple + repetitions = (1,) + return dpt.full( + repetitions, + x, + dtype=x.dtype, + usm_type=x.usm_type, + sycl_queue=x.sycl_queue, + ) + rep_dims = len(repetitions) + x_dims = x.ndim + if rep_dims < x_dims: + repetitions = (x_dims - rep_dims) * (1,) + repetitions + elif x_dims < rep_dims: + x = dpt.reshape(x, (rep_dims - x_dims) * (1,) + x.shape) + res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions)) + # case of empty input + if x.size == 0: + return dpt.empty( + res_shape, x.dtype, usm_type=x.usm_type, sycl_queue=x.sycl_queue + ) + in_sh = x.shape + if res_shape == in_sh: + return dpt.copy(x) + expanded_sh = [] + broadcast_sh = [] + out_sz = 1 + for i in range(len(res_shape)): + out_sz *= res_shape[i] + reps, sh = repetitions[i], in_sh[i] + if reps == 1: + # dimension will be unchanged + broadcast_sh.append(sh) + expanded_sh.append(sh) + elif sh == 1: + # dimension will be broadcast + broadcast_sh.append(reps) + expanded_sh.append(sh) + else: + broadcast_sh.extend([reps, sh]) + expanded_sh.extend([1, sh]) + exec_q = x.sycl_queue + res = dpt.empty((out_sz,), x.dtype, usm_type=x.usm_type, sycl_queue=exec_q) + # no need to copy data for empty output + if out_sz > 0: + x = dpt.broadcast_to( + # this reshape should never copy + dpt.reshape(x, expanded_sh), + broadcast_sh, + ) + # copy broadcast input into flat array + hev, _ = ti._copy_usm_ndarray_for_reshape( + src=x, dst=res, sycl_queue=exec_q + ) + hev.wait() + return dpt.reshape(res, res_shape) + + def _supported_dtype(dtypes): for dtype in dtypes: if dtype.char not in "?bBhHiIlLqQefdFD": diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp new file mode 100644 index 0000000000..0072690e9b --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -0,0 +1,387 @@ +//=== accumulators.hpp - Implementation of accumulator kernels --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
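The `tile` implementation above mirrors the familiar NumPy semantics: `repetitions` and `x.shape` are aligned by prepending ones to the shorter of the two. A brief, hedged usage sketch (a default SYCL device is assumed):

```python
# Hedged usage sketch for dpt.tile as implemented above.
import dpctl.tensor as dpt

x = dpt.asarray([1, 2, 3], dtype="i8")

t0 = dpt.tile(x, 2)        # shape (6,): [1, 2, 3, 1, 2, 3]
t1 = dpt.tile(x, (2, 2))   # shape (2, 6): x is treated as shape (1, 3)

y = dpt.asarray([[1, 2], [3, 4]], dtype="i8")
t2 = dpt.tile(y, 2)        # shape (2, 4): repetitions is treated as (1, 2)

print(dpt.asnumpy(t0))
print(dpt.asnumpy(t1))
print(dpt.asnumpy(t2))
```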
+// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for accumulators (cumulative sum, prod, etc.). +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace accumulators +{ + +namespace py = pybind11; + +using namespace dpctl::tensor::offset_utils; + +template T ceiling_quotient(T n, T m) +{ + return (n + m - 1) / m; +} +template T1 ceiling_quotient(T1 n, T2 m) +{ + return ceiling_quotient(n, static_cast(m)); +} + +template +class inclusive_scan_rec_local_scan_krn; + +template +class inclusive_scan_rec_chunk_update_krn; + +template struct NonZeroIndicator +{ + NonZeroIndicator() {} + + outputT operator()(const inputT &val) const + { + constexpr outputT out_one(1); + constexpr outputT out_zero(0); + constexpr inputT val_zero(0); + + return (val == val_zero) ? out_zero : out_one; + } +}; + +template struct NoOpTransformer +{ + NoOpTransformer() {} + + T operator()(const T &val) const + { + return val; + } +}; + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_rec(sycl::queue exec_q, + size_t n_elems, + size_t wg_size, + const inputT *input, + outputT *output, + size_t s0, + size_t s1, + IndexerT indexer, + TransformerT transformer, + std::vector const &depends = {}) +{ + size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * wg_size); + + slmT slm_iscan_tmp(lws, cgh); + + cgh.parallel_for>( + sycl::nd_range<1>(gws, lws), [=](sycl::nd_item<1> it) + { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); + + std::array local_isum; + + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); + + local_isum[m_wi] = + (i + m_wi < n_elems) + ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) + : out_zero; + } + +// local_isum is now result of +// inclusive scan of locally stored mask indicators +#pragma unroll + for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + + size_t wg_iscan_val = + sycl::inclusive_scan_over_group(it.get_group(), + local_isum.back(), + sycl::plus(), + size_t(0)); + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + size_t addand = (lid == 0) ? 
0 : slm_iscan_tmp[lid]; + it.barrier(sycl::access::fence_space::local_space); + +#pragma unroll + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } + }); + }); + + sycl::event out_event = inc_scan_phase1_ev; + if (n_groups > 1) { + outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + + auto chunk_size = wg_size * n_wi; + + NoOpIndexer _no_op_indexer{}; + NoOpTransformer _no_op_transformer{}; + auto e2 = inclusive_scan_rec( + exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, + chunk_size, _no_op_indexer, _no_op_transformer, + {inc_scan_phase1_ev}); + + // output[ chunk_size * (i + 1) + j] += temp[i] + auto e3 = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e2); + cgh.parallel_for>( + {n_elems}, [=](auto wiid) + { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + output[gid] += (i > 0) ? temp[i - 1] : 0; + }); + }); + + sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e3); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); + }); + + out_event = e4; + } + + return out_event; +} + +typedef size_t (*accumulate_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + std::vector const &); + +template +size_t accumulate_contig_impl(sycl::queue q, + size_t n_elems, + const char *mask, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + NoOpIndexer flat_indexer{}; + transformerT non_zero_indicator{}; + + sycl::event comp_ev = + inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsContigFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + accumulate_contig_impl>; + return fn; + } +}; + +template struct MaskPositionsContigFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + accumulate_contig_impl>; + return fn; + } +}; + +template struct Cumsum1DContigFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + accumulate_contig_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +typedef size_t (*accumulate_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + int, + const py::ssize_t *, + char *, + std::vector const &); + +template +size_t accumulate_strided_impl(sycl::queue q, + size_t n_elems, + const char *mask, + int nd, + const py::ssize_t *shape_strides, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + StridedIndexer strided_indexer{nd, 0, shape_strides}; + transformerT non_zero_indicator{}; + + sycl::event comp_ev = + inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, 
cumsum_data_ptr, 0, 1, + strided_indexer, non_zero_indicator, depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsStridedFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + accumulate_strided_impl>; + return fn; + } +}; + +template struct MaskPositionsStridedFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + accumulate_strided_impl>; + return fn; + } +}; + +template struct Cumsum1DStridedFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + accumulate_strided_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +} // namespace accumulators +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 43c546860b..595fc68496 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -24,7 +24,6 @@ #pragma once #include -#include #include #include #include @@ -47,172 +46,6 @@ namespace py = pybind11; using namespace dpctl::tensor::offset_utils; -template T ceiling_quotient(T n, T m) -{ - return (n + m - 1) / m; -} -template T1 ceiling_quotient(T1 n, T2 m) -{ - return ceiling_quotient(n, static_cast(m)); -} - -template -class inclusive_scan_rec_local_scan_krn; - -template -class inclusive_scan_rec_chunk_update_krn; - -template struct NonZeroIndicator -{ - NonZeroIndicator() {} - - outputT operator()(const inputT &val) const - { - constexpr outputT out_one(1); - constexpr outputT out_zero(0); - constexpr inputT val_zero(0); - - return (val == val_zero) ? out_zero : out_one; - } -}; - -template struct NoOpTransformer -{ - NoOpTransformer() {} - - T operator()(const T &val) const - { - return val; - } -}; - -/* - * for integer type maskT, - * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) - * for 0 <= j < n_elems - */ -template -sycl::event inclusive_scan_rec(sycl::queue exec_q, - size_t n_elems, - size_t wg_size, - const inputT *input, - outputT *output, - size_t s0, - size_t s1, - IndexerT indexer, - TransformerT transformer, - std::vector const &depends = {}) -{ - size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); - - sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using slmT = sycl::local_accessor; - - auto lws = sycl::range<1>(wg_size); - auto gws = sycl::range<1>(n_groups * wg_size); - - slmT slm_iscan_tmp(lws, cgh); - - cgh.parallel_for>( - sycl::nd_range<1>(gws, lws), [=](sycl::nd_item<1> it) - { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); - - std::array local_isum; - - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); - - local_isum[m_wi] = - (i + m_wi < n_elems) - ? 
transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } - -// local_isum is now result of -// inclusive scan of locally stored mask indicators -#pragma unroll - for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; - } - - size_t wg_iscan_val = - sycl::inclusive_scan_over_group(it.get_group(), - local_isum.back(), - sycl::plus(), - size_t(0)); - - slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; - it.barrier(sycl::access::fence_space::local_space); - size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; - it.barrier(sycl::access::fence_space::local_space); - -#pragma unroll - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; - } - - for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { - output[i + m_wi] = local_isum[m_wi]; - } - }); - }); - - sycl::event out_event = inc_scan_phase1_ev; - if (n_groups > 1) { - outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); - - auto chunk_size = wg_size * n_wi; - - NoOpIndexer _no_op_indexer{}; - NoOpTransformer _no_op_transformer{}; - auto e2 = inclusive_scan_rec( - exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, - chunk_size, _no_op_indexer, _no_op_transformer, - {inc_scan_phase1_ev}); - - // output[ chunk_size * (i + 1) + j] += temp[i] - auto e3 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e2); - cgh.parallel_for>( - {n_elems}, [=](auto wiid) - { - auto gid = wiid[0]; - auto i = (gid / chunk_size); - output[gid] += (i > 0) ? temp[i - 1] : 0; - }); - }); - - sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e3); - auto ctx = exec_q.get_context(); - cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); - }); - - out_event = e4; - } - - return out_event; -} - template const &); - -template -size_t mask_positions_contig_impl(sycl::queue q, - size_t n_elems, - const char *mask, - char *cumsum, - std::vector const &depends = {}) -{ - constexpr int n_wi = 8; - const maskT *mask_data_ptr = reinterpret_cast(mask); - cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; - - NoOpIndexer flat_indexer{}; - NonZeroIndicator non_zero_indicator{}; - - sycl::event comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, depends); - - cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); - - cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); - - if (last_elem_host_usm == nullptr) { - throw std::bad_alloc(); - } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); - copy_e.wait(); - size_t return_val = static_cast(*last_elem_host_usm); - sycl::free(last_elem_host_usm, q); - - return return_val; -} - -template struct MaskPositionsContigFactoryForInt32 -{ - fnT get() - { - fnT fn = mask_positions_contig_impl; - return fn; - } -}; - -template struct MaskPositionsContigFactoryForInt64 -{ - fnT get() - { - fnT fn = mask_positions_contig_impl; - return fn; - } -}; - -typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( - sycl::queue, - size_t, - const char *, - int, - const py::ssize_t *, - char *, - std::vector const &); - -template -size_t mask_positions_strided_impl(sycl::queue q, - size_t n_elems, - const char *mask, - int nd, - const py::ssize_t *shape_strides, - char *cumsum, - std::vector const &depends = {}) -{ - constexpr int n_wi = 8; - const maskT *mask_data_ptr = reinterpret_cast(mask); - cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; - - StridedIndexer 
strided_indexer{nd, 0, shape_strides}; - NonZeroIndicator non_zero_indicator{}; - - sycl::event comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, depends); - - cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); - - cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); - - if (last_elem_host_usm == nullptr) { - throw std::bad_alloc(); - } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); - copy_e.wait(); - size_t return_val = static_cast(*last_elem_host_usm); - sycl::free(last_elem_host_usm, q); - - return return_val; -} - -template struct MaskPositionsStridedFactoryForInt32 -{ - fnT get() - { - fnT fn = mask_positions_strided_impl; - return fn; - } -}; - -template struct MaskPositionsStridedFactoryForInt64 -{ - fnT get() - { - fnT fn = mask_positions_strided_impl; - return fn; - } -}; - // ======= Masked extraction ================================ template it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t reduction_id = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t reduction_id = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; const size_t wg_size = it.get_local_range(0); const size_t base = reduction_id * reduction_max_gid_; @@ -241,14 +241,6 @@ struct ContigBooleanReduction // in group_op_ group_op_(it, out_, reduction_id, inp_ + start, inp_ + end); } - -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const - { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; - } }; typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)( @@ -268,17 +260,19 @@ class boolean_reduction_contig_krn; template class boolean_reduction_seq_contig_krn; +using dpctl::tensor::sycl_utils::choose_workgroup_size; + template sycl::event -boolean_reduction_contig_impl(sycl::queue exec_q, - size_t iter_nelems, - size_t reduction_nelems, - const char *arg_cp, - char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t red_arg_offset, - const std::vector &depends) +boolean_reduction_axis1_contig_impl(sycl::queue exec_q, + size_t iter_nelems, + size_t reduction_nelems, + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t red_arg_offset, + const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + iter_arg_offset + red_arg_offset; @@ -288,8 +282,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = - 4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); sycl::event red_ev; if (reduction_nelems < wg) { @@ -322,18 +315,8 @@ boolean_reduction_contig_impl(sycl::queue exec_q, }); } else { - sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { - using IndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - - IndexerT res_indexer{}; - - cgh.depends_on(depends); - - cgh.parallel_for(sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { - auto res_offset = res_indexer(id[0]); - res_tp[res_offset] = identity_val; - }); - }); + sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), + iter_nelems, depends); red_ev = 
exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); @@ -363,7 +346,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q, return red_ev; } -template struct AllContigFactory +template struct AllAxis1ContigFactory { fnT get() const { @@ -372,12 +355,12 @@ template struct AllContigFactory using GroupOpT = all_reduce_wg_contig>; - return dpctl::tensor::kernels::boolean_reduction_contig_impl< + return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl< srcTy, resTy, RedOpT, GroupOpT>; } }; -template struct AnyContigFactory +template struct AnyAxis1ContigFactory { fnT get() const { @@ -386,7 +369,7 @@ template struct AnyContigFactory using GroupOpT = any_reduce_wg_contig>; - return dpctl::tensor::kernels::boolean_reduction_contig_impl< + return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl< srcTy, resTy, RedOpT, GroupOpT>; } }; @@ -433,9 +416,9 @@ struct StridedBooleanReduction void operator()(sycl::nd_item<1> it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t reduction_id = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t reduction_id = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t reduction_lid = it.get_local_id(0); const size_t wg_size = it.get_local_range(0); @@ -468,13 +451,112 @@ struct StridedBooleanReduction // in group_op_ group_op_(it, out_, out_iter_offset, local_red_val); } +}; + +template +class boolean_reduction_axis0_contig_krn; + +template +sycl::event +boolean_reduction_axis0_contig_impl(sycl::queue exec_q, + size_t iter_nelems, + size_t reduction_nelems, + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t red_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + red_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr resTy identity_val = sycl::known_identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; + sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), + iter_nelems, depends); + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(init_ev); + + constexpr std::uint8_t dim = 1; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + constexpr size_t preferred_reductions_per_wi = 4; + size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
((reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto gws = sycl::range{iter_nelems * reduction_groups * wg}; + auto lws = sycl::range{wg}; + + cgh.parallel_for>( + sycl::nd_range(gws, lws), + StridedBooleanReduction( + arg_tp, res_tp, RedOpT(), GroupOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + return red_ev; + } +} + +template struct AllAxis0ContigFactory +{ + fnT get() const + { + using resTy = std::int32_t; + using RedOpT = sycl::logical_and; + using GroupOpT = all_reduce_wg_strided; + + return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl< + srcTy, resTy, RedOpT, GroupOpT>; + } +}; + +template struct AnyAxis0ContigFactory +{ + fnT get() const + { + using resTy = std::int32_t; + using RedOpT = sycl::logical_or; + using GroupOpT = any_reduce_wg_strided; + + return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl< + srcTy, resTy, RedOpT, GroupOpT>; } }; @@ -527,8 +609,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = - 4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); sycl::event red_ev; if (reduction_nelems < wg) { @@ -558,7 +639,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, }); } else { - sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { using IndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; @@ -576,7 +657,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, }); }); red_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); + cgh.depends_on(init_ev); constexpr std::uint8_t dim = 1; diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp new file mode 100644 index 0000000000..4129f358df --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -0,0 +1,438 @@ +//=== repeat.hpp - Implementation of repeat kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor repeating operations. 
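The changes above split the contiguous boolean reductions into axis-1 (rows contiguous) and axis-0 (fixed stride of `iter_nelems` down each column) variants, with the result buffer first filled with the reduction identity. Below is a hedged NumPy reference of what these kernels compute; the helper names are hypothetical, and the SYCL work-group decomposition (`choose_workgroup_size`, `reductions_per_wi`) is deliberately not modelled.

```python
# NumPy model of the semantics of the axis1/axis0 contiguous boolean
# reduction kernels: logical AND (all) / OR (any) over a 2D row-major
# block, results stored as int32.
import numpy as np

def boolean_reduce_axis1_contig(a2d, op=np.logical_and):
    # each row is an independent reduction over contiguous memory
    return op.reduce(a2d != 0, axis=1).astype(np.int32)

def boolean_reduce_axis0_contig(a2d, op=np.logical_or):
    # the result is identity-filled first, then each column is reduced with
    # a stride equal to the number of independent reductions (iter_nelems)
    return op.reduce(a2d != 0, axis=0).astype(np.int32)

a = np.array([[1, 1, 0],
              [1, 1, 1]])
print(boolean_reduce_axis1_contig(a))  # "all" over rows    -> [0 1]
print(boolean_reduce_axis0_contig(a))  # "any" over columns -> [1 1 1]
```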
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace repeat +{ + +namespace py = pybind11; +using namespace dpctl::tensor::offset_utils; + +template +class repeat_by_sequence_kernel; + +template +class RepeatSequenceFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + const repT *reps = nullptr; + const repT *cumsum = nullptr; + size_t src_axis_nelems = 1; + OrthogIndexer orthog_strider; + AxisIndexer src_axis_strider; + AxisIndexer dst_axis_strider; + RepIndexer reps_strider; + +public: + RepeatSequenceFunctor(const T *src_, + T *dst_, + const repT *reps_, + const repT *cumsum_, + size_t src_axis_nelems_, + OrthogIndexer orthog_strider_, + AxisIndexer src_axis_strider_, + AxisIndexer dst_axis_strider_, + RepIndexer reps_strider_) + : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), + src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), + src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_), reps_strider(reps_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + size_t id = idx[0]; + auto i_orthog = id / src_axis_nelems; + auto i_along = id - (i_orthog * src_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto val = src[src_offset + src_axis_strider(i_along)]; + auto last = cumsum[i_along]; + auto first = last - reps[reps_strider(i_along)]; + for (auto i = first; i < last; ++i) { + dst[dst_offset + dst_axis_strider(i)] = val; + } + } +}; + +typedef sycl::event (*repeat_by_sequence_fn_ptr_t)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + const char *, + const char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event +repeat_by_sequence_impl(sycl::queue q, + size_t orthog_nelems, + size_t src_axis_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int orthog_nd, + const py::ssize_t *orthog_src_dst_shape_and_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset, + py::ssize_t src_axis_shape, + py::ssize_t src_axis_stride, + py::ssize_t dst_axis_shape, + py::ssize_t dst_axis_stride, + py::ssize_t reps_shape, + py::ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, + orthog_src_dst_shape_and_strides}; + // indexers along repeated axis + Strided1DIndexer src_axis_indexer{0, src_axis_shape, src_axis_stride}; + Strided1DIndexer dst_axis_indexer{0, dst_axis_shape, dst_axis_stride}; + // indexer along reps array + Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; + + const size_t gws = orthog_nelems * src_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, + 
orthog_indexer, src_axis_indexer, dst_axis_indexer, + reps_indexer)); + }); + + return repeat_ev; +} + +template struct RepeatSequenceFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + const char *, + const char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event repeat_by_sequence_1d_impl(sycl::queue q, + size_t src_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + py::ssize_t src_shape, + py::ssize_t src_stride, + py::ssize_t dst_shape, + py::ssize_t dst_stride, + py::ssize_t reps_shape, + py::ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + Strided1DIndexer src_indexer{0, src_shape, src_stride}; + Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; + // indexer along reps array + Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; + + const size_t gws = src_nelems; + + cgh.parallel_for< + repeat_by_sequence_kernel>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, + src_indexer, dst_indexer, reps_indexer)); + }); + + return repeat_ev; +} + +template struct RepeatSequence1DFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_1d_impl; + return fn; + } +}; + +template +class repeat_by_scalar_kernel; + +template +class RepeatScalarFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + const py::ssize_t reps = 1; + size_t dst_axis_nelems = 0; + OrthogIndexer orthog_strider; + AxisIndexer src_axis_strider; + AxisIndexer dst_axis_strider; + +public: + RepeatScalarFunctor(const T *src_, + T *dst_, + const py::ssize_t reps_, + size_t dst_axis_nelems_, + OrthogIndexer orthog_strider_, + AxisIndexer src_axis_strider_, + AxisIndexer dst_axis_strider_) + : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), + orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + size_t id = idx[0]; + auto i_orthog = id / dst_axis_nelems; + auto i_along = id - (i_orthog * dst_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto dst_axis_offset = dst_axis_strider(i_along); + auto src_axis_offset = src_axis_strider(i_along / reps); + dst[dst_offset + dst_axis_offset] = src[src_offset + src_axis_offset]; + } +}; + +typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + const py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_impl(sycl::queue q, + size_t orthog_nelems, + size_t dst_axis_nelems, + const char *src_cp, + char *dst_cp, + const py::ssize_t reps, + int orthog_nd, + const py::ssize_t 
*orthog_shape_and_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset, + py::ssize_t src_axis_shape, + py::ssize_t src_axis_stride, + py::ssize_t dst_axis_shape, + py::ssize_t dst_axis_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, orthog_shape_and_strides}; + // indexers along repeated axis + Strided1DIndexer src_axis_indexer{0, src_axis_shape, src_axis_stride}; + Strided1DIndexer dst_axis_indexer{0, dst_axis_shape, dst_axis_stride}; + + const size_t gws = orthog_nelems * dst_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor( + src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, + src_axis_indexer, dst_axis_indexer)); + }); + + return repeat_ev; +} + +template struct RepeatScalarFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + const py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_1d_impl(sycl::queue q, + size_t dst_nelems, + const char *src_cp, + char *dst_cp, + const py::ssize_t reps, + py::ssize_t src_shape, + py::ssize_t src_stride, + py::ssize_t dst_shape, + py::ssize_t dst_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + Strided1DIndexer src_indexer(0, src_shape, src_stride); + Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; + + const size_t gws = dst_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor( + src_tp, dst_tp, reps, dst_nelems, orthog_indexer, src_indexer, + dst_indexer)); + }); + + return repeat_ev; +} + +template struct RepeatScalar1DFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_1d_impl; + return fn; + } +}; + +} // namespace repeat +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp new file mode 100644 index 0000000000..5ce863ad3f --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -0,0 +1,352 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
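The `RepeatSequenceFunctor` and `RepeatScalarFunctor` kernels above map each work item to one source element (sequence case) or one destination element (scalar case) along the repeated axis. A hedged pure-Python model of that index arithmetic for the 1D, unit-stride case follows; the helper names are hypothetical.

```python
# Model of the index math in the repeat kernels: the inclusive cumulative
# sum of `reps` gives the end position of each element's run of copies.
import itertools

def repeat_by_sequence_1d(src, reps):
    cumsum = list(itertools.accumulate(reps))  # what _cumsum_1d provides
    dst = [None] * (cumsum[-1] if cumsum else 0)
    for i_along, val in enumerate(src):
        last = cumsum[i_along]
        first = last - reps[i_along]
        for j in range(first, last):           # write reps[i_along] copies
            dst[j] = val
    return dst

def repeat_by_scalar_1d(src, reps):
    # each destination index j simply reads src[j // reps]
    return [src[j // reps] for j in range(len(src) * reps)]

print(repeat_by_sequence_1d([10, 20, 30], [2, 0, 3]))  # [10, 10, 30, 30, 30]
print(repeat_by_scalar_1d([10, 20, 30], 2))            # [10, 10, 20, 20, 30, 30]
```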
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +// Computation of positions of masked elements + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::accumulators::accumulate_contig_impl_fn_ptr_t; +static accumulate_contig_impl_fn_ptr_t + mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; +static accumulate_contig_impl_fn_ptr_t + mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; +static accumulate_strided_impl_fn_ptr_t + mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector); + + return; +} + +size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int32_t/int64_t only + constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32); + constexpr int 
int64_typeid = static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int32 or int64 data-type."); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + if (mask.is_c_contiguous()) { + auto fn = (use_i32) + ? mask_positions_contig_i32_dispatch_vector[mask_typeid] + : mask_positions_contig_i64_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + const py::ssize_t *shape = mask.get_shape_raw(); + auto const &strides_vector = mask.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); + + // Strided implementation + auto strided_fn = + (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid] + : mask_positions_strided_i64_dispatch_vector[mask_typeid]; + std::vector host_task_events; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + if (shape_strides == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Unexpected error"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, + shape_strides, cumsum_data, dependent_events); + + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + + return total_set; +} + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumsum_1d_strided_dispatch_vector[td_ns::num_types]; +using dpctl::tensor::kernels::accumulators::accumulate_contig_impl_fn_ptr_t; +static accumulate_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_vector[td_ns::num_types]; + +void populate_cumsum_1d_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector); + + using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector); + + return; +} + +size_t py_cumsum_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + // cumsum.shape == (src.size,) + auto src_size = src.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != src_size) { + throw py::value_error("Inconsistent 
dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_size == 0) { + return 0; + } + + int src_typenum = src.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // src can be any type + const char *src_data = src.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // this cumsum must be int64_t only + constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + if (src.is_c_contiguous()) { + auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + return fn(exec_q, src_size, src_data, cumsum_data, depends); + } + + const py::ssize_t *shape = src.get_shape_raw(); + auto const &strides_vector = src.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int src_nd = src.get_ndim(); + int nd = src_nd; + + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); + + // Strided implementation + auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + std::vector host_task_events; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + if (shape_strides == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Unexpected error"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + size_t total = strided_fn(exec_q, src_size, src_data, nd, shape_strides, + cumsum_data, dependent_events); + + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + + return total; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators.hpp b/dpctl/tensor/libtensor/source/accumulators.hpp new file mode 100644 index 0000000000..f4e5ce9d84 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators.hpp @@ -0,0 +1,56 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_cumsum_1d_dispatch_vectors(void); + +extern size_t py_cumsum_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends = {}); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index d37967daef..43ab92b86d 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -46,214 +46,10 @@ namespace tensor namespace py_internal { -/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < - * axis_end) and dir2 (along given set of axes) - */ -template -void _split_iteration_space(const shT &shape_vec, - const shT &strides_vec, - int axis_start, - int axis_end, - shT &dir1_shape_vec, - shT &dir2_shape_vec, - shT &dir1_strides_vec, - shT &dir2_strides_vec) -{ - int nd = static_cast(shape_vec.size()); - int dir2_sz = axis_end - axis_start; - int dir1_sz = nd - dir2_sz; - - assert(dir1_sz > 0); - assert(dir2_sz > 0); - - dir1_shape_vec.resize(dir1_sz); - dir2_shape_vec.resize(dir2_sz); - - std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, - dir1_shape_vec.begin()); - std::copy(shape_vec.begin() + axis_end, shape_vec.end(), - dir1_shape_vec.begin() + axis_start); - - std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, - dir2_shape_vec.begin()); - - dir1_strides_vec.resize(dir1_sz); - dir2_strides_vec.resize(dir2_sz); - - std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, - dir1_strides_vec.begin()); - std::copy(strides_vec.begin() + axis_end, strides_vec.end(), - dir1_strides_vec.begin() + axis_start); - - std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, - dir2_strides_vec.begin()); - - return; -} - -// Computation of positions of masked elements +// Masked extraction namespace td_ns = dpctl::tensor::type_dispatch; -using dpctl::tensor::kernels::indexing::mask_positions_contig_impl_fn_ptr_t; -static mask_positions_contig_impl_fn_ptr_t - mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; -static mask_positions_contig_impl_fn_ptr_t - mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; - -using dpctl::tensor::kernels::indexing::mask_positions_strided_impl_fn_ptr_t; -static mask_positions_strided_impl_fn_ptr_t - 
mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; -static mask_positions_strided_impl_fn_ptr_t - mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; - -void populate_mask_positions_dispatch_vectors(void) -{ - using dpctl::tensor::kernels::indexing::MaskPositionsContigFactoryForInt64; - td_ns::DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector); - - using dpctl::tensor::kernels::indexing::MaskPositionsContigFactoryForInt32; - td_ns::DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector); - - using dpctl::tensor::kernels::indexing::MaskPositionsStridedFactoryForInt64; - td_ns::DispatchVectorBuilder - dvb3; - dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector); - - using dpctl::tensor::kernels::indexing::MaskPositionsStridedFactoryForInt32; - td_ns::DispatchVectorBuilder - dvb4; - dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector); - - return; -} - -size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, - dpctl::tensor::usm_ndarray cumsum, - sycl::queue exec_q, - std::vector const &depends) -{ - // cumsum is 1D - if (cumsum.get_ndim() != 1) { - throw py::value_error("Result array must be one-dimensional."); - } - - if (!cumsum.is_c_contiguous()) { - throw py::value_error("Expecting `cumsum` array must be C-contiguous."); - } - - // cumsum.shape == (mask.size,) - auto mask_size = mask.get_size(); - auto cumsum_size = cumsum.get_shape(0); - if (cumsum_size != mask_size) { - throw py::value_error("Inconsistent dimensions"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { - // FIXME: use ExecutionPlacementError - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - if (mask_size == 0) { - return 0; - } - - int mask_typenum = mask.get_typenum(); - int cumsum_typenum = cumsum.get_typenum(); - - // mask can be any type - const char *mask_data = mask.get_data(); - char *cumsum_data = cumsum.get_data(); - - auto const &array_types = td_ns::usm_ndarray_types(); - - int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); - int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); - - // cumsum must be int32_t/int64_t only - constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32); - constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); - if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { - throw py::value_error( - "Cumulative sum array must have int32 or int64 data-type."); - } - - const bool use_i32 = (cumsum_typeid == int32_typeid); - - if (mask.is_c_contiguous()) { - auto fn = (use_i32) - ? mask_positions_contig_i32_dispatch_vector[mask_typeid] - : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); - } - - const py::ssize_t *shape = mask.get_shape_raw(); - auto const &strides_vector = mask.get_strides_vector(); - - using shT = std::vector; - shT compact_shape; - shT compact_strides; - - int mask_nd = mask.get_ndim(); - int nd = mask_nd; - - dpctl::tensor::py_internal::compact_iteration_space( - nd, shape, strides_vector, compact_shape, compact_strides); - - // Strided implementation - auto strided_fn = - (use_i32) ? 
mask_positions_strided_i32_dispatch_vector[mask_typeid] - : mask_positions_strided_i64_dispatch_vector[mask_typeid]; - std::vector host_task_events; - - using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( - exec_q, host_task_events, compact_shape, compact_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - sycl::event::wait(host_task_events); - throw std::runtime_error("Unexpected error"); - } - sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); - - if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { - copy_shape_ev.wait(); - sycl::event::wait(host_task_events); - sycl::free(shape_strides, exec_q); - throw std::runtime_error("Unexpected error"); - } - - std::vector dependent_events; - dependent_events.reserve(depends.size() + 1); - dependent_events.insert(dependent_events.end(), copy_shape_ev); - dependent_events.insert(dependent_events.end(), depends.begin(), - depends.end()); - - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, - shape_strides, cumsum_data, dependent_events); - - sycl::event::wait(host_task_events); - sycl::free(shape_strides, exec_q); - - return total_set; -} - -// Masked extraction - using dpctl::tensor::kernels::indexing:: masked_extract_all_slices_strided_impl_fn_ptr_t; @@ -493,19 +289,21 @@ py_extract(dpctl::tensor::usm_ndarray src, shT masked_src_shape; shT ortho_src_strides; shT masked_src_strides; - _split_iteration_space(src_shape_vec, src_strides_vec, axis_start, - axis_end, ortho_src_shape, - masked_src_shape, // 4 vectors modified - ortho_src_strides, masked_src_strides); + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis_start, axis_end, + ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); shT ortho_dst_shape; shT masked_dst_shape; shT ortho_dst_strides; shT masked_dst_strides; - _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, - axis_start + 1, ortho_dst_shape, - masked_dst_shape, // 4 vectors modified - ortho_dst_strides, masked_dst_strides); + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis_start, axis_start + 1, + ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); assert(ortho_src_shape.size() == static_cast(ortho_nd)); assert(ortho_dst_shape.size() == static_cast(ortho_nd)); @@ -821,19 +619,21 @@ py_place(dpctl::tensor::usm_ndarray dst, shT masked_dst_shape; shT ortho_dst_strides; shT masked_dst_strides; - _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, - axis_end, ortho_dst_shape, - masked_dst_shape, // 4 vectors modified - ortho_dst_strides, masked_dst_strides); + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis_start, axis_end, + ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); shT ortho_rhs_shape; shT masked_rhs_shape; shT ortho_rhs_strides; shT masked_rhs_strides; - _split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start, - axis_start + 1, ortho_rhs_shape, - masked_rhs_shape, // 4 vectors modified - ortho_rhs_strides, masked_rhs_strides); + dpctl::tensor::py_internal::split_iteration_space( + rhs_shape_vec, rhs_strides_vec, axis_start, axis_start + 1, + ortho_rhs_shape, + masked_rhs_shape, // 4 vectors modified + ortho_rhs_strides, masked_rhs_strides); 
assert(ortho_dst_shape.size() == static_cast(ortho_nd)); assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp index 647d78bbbb..5ce868894a 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -38,13 +38,6 @@ namespace tensor namespace py_internal { -extern void populate_mask_positions_dispatch_vectors(void); - -extern size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, - dpctl::tensor::usm_ndarray cumsum, - sycl::queue exec_q, - std::vector const &depends = {}); - extern std::pair py_extract(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray cumsum, diff --git a/dpctl/tensor/libtensor/source/boolean_reductions.cpp b/dpctl/tensor/libtensor/source/boolean_reductions.cpp index 5def6c5158..db07d05c73 100644 --- a/dpctl/tensor/libtensor/source/boolean_reductions.cpp +++ b/dpctl/tensor/libtensor/source/boolean_reductions.cpp @@ -58,7 +58,9 @@ using dpctl::tensor::kernels::boolean_reduction_strided_impl_fn_ptr; static boolean_reduction_strided_impl_fn_ptr all_reduction_strided_dispatch_vector[td_ns::num_types]; static boolean_reduction_contig_impl_fn_ptr - all_reduction_contig_dispatch_vector[td_ns::num_types]; + all_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static boolean_reduction_contig_impl_fn_ptr + all_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; void populate_all_dispatch_vectors(void) { @@ -74,11 +76,19 @@ void populate_all_dispatch_vectors(void) using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::AllContigFactory; + using dpctl::tensor::kernels::AllAxis1ContigFactory; DispatchVectorBuilder + AllAxis1ContigFactory, td_ns::num_types> all_dvb2; - all_dvb2.populate_dispatch_vector(all_reduction_contig_dispatch_vector); + all_dvb2.populate_dispatch_vector( + all_reduction_axis1_contig_dispatch_vector); + + using dpctl::tensor::kernels::AllAxis0ContigFactory; + DispatchVectorBuilder + all_dvb3; + all_dvb3.populate_dispatch_vector( + all_reduction_axis0_contig_dispatch_vector); }; } // namespace impl @@ -91,7 +101,9 @@ static boolean_reduction_strided_impl_fn_ptr any_reduction_strided_dispatch_vector[td_ns::num_types]; using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; static boolean_reduction_contig_impl_fn_ptr - any_reduction_contig_dispatch_vector[td_ns::num_types]; + any_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static boolean_reduction_contig_impl_fn_ptr + any_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; void populate_any_dispatch_vectors(void) { @@ -107,11 +119,19 @@ void populate_any_dispatch_vectors(void) using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::AnyContigFactory; + using dpctl::tensor::kernels::AnyAxis1ContigFactory; DispatchVectorBuilder + AnyAxis1ContigFactory, td_ns::num_types> any_dvb2; - any_dvb2.populate_dispatch_vector(any_reduction_contig_dispatch_vector); + any_dvb2.populate_dispatch_vector( + any_reduction_axis1_contig_dispatch_vector); + + using dpctl::tensor::kernels::AnyAxis0ContigFactory; + DispatchVectorBuilder + any_dvb3; + any_dvb3.populate_dispatch_vector( + any_reduction_axis0_contig_dispatch_vector); }; } // namespace impl @@ -124,16 +144,18 @@ void init_boolean_reduction_functions(py::module_ m) // ALL { impl::populate_all_dispatch_vectors(); - 
using impl::all_reduction_contig_dispatch_vector; + using impl::all_reduction_axis0_contig_dispatch_vector; + using impl::all_reduction_axis1_contig_dispatch_vector; using impl::all_reduction_strided_dispatch_vector; auto all_pyapi = [&](arrayT src, int trailing_dims_to_reduce, arrayT dst, sycl::queue exec_q, const event_vecT &depends = {}) { - return py_boolean_reduction(src, trailing_dims_to_reduce, dst, - exec_q, depends, - all_reduction_contig_dispatch_vector, - all_reduction_strided_dispatch_vector); + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + all_reduction_axis1_contig_dispatch_vector, + all_reduction_axis0_contig_dispatch_vector, + all_reduction_strided_dispatch_vector); }; m.def("_all", all_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -143,16 +165,18 @@ void init_boolean_reduction_functions(py::module_ m) // ANY { impl::populate_any_dispatch_vectors(); - using impl::any_reduction_contig_dispatch_vector; + using impl::any_reduction_axis0_contig_dispatch_vector; + using impl::any_reduction_axis1_contig_dispatch_vector; using impl::any_reduction_strided_dispatch_vector; auto any_pyapi = [&](arrayT src, int trailing_dims_to_reduce, arrayT dst, sycl::queue exec_q, const event_vecT &depends = {}) { - return py_boolean_reduction(src, trailing_dims_to_reduce, dst, - exec_q, depends, - any_reduction_contig_dispatch_vector, - any_reduction_strided_dispatch_vector); + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + any_reduction_axis1_contig_dispatch_vector, + any_reduction_axis0_contig_dispatch_vector, + any_reduction_strided_dispatch_vector); }; m.def("_any", any_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), diff --git a/dpctl/tensor/libtensor/source/boolean_reductions.hpp b/dpctl/tensor/libtensor/source/boolean_reductions.hpp index 3d5970d783..591439a7c9 100644 --- a/dpctl/tensor/libtensor/source/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/source/boolean_reductions.hpp @@ -54,7 +54,8 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, sycl::queue exec_q, const std::vector &depends, - const contig_dispatchT &contig_dispatch_vector, + const contig_dispatchT &axis1_contig_dispatch_vector, + const contig_dispatchT &axis0_contig_dispatch_vector, const strided_dispatchT &strided_dispatch_vector) { int src_nd = src.get_ndim(); @@ -131,16 +132,32 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, bool is_src_c_contig = src.is_c_contiguous(); bool is_src_f_contig = src.is_f_contiguous(); - bool is_dst_c_contig = dst.is_c_contiguous(); if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nd == 0)) { - auto fn = contig_dispatch_vector[src_typeid]; + (is_src_f_contig && dst_nelems == 0)) + { + auto fn = axis1_contig_dispatch_vector[src_typeid]; constexpr py::ssize_t zero_offset = 0; - auto red_ev = fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, - zero_offset, zero_offset, zero_offset, depends); + sycl::event red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + constexpr py::ssize_t zero_offset = 0; + + sycl::event 
red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); sycl::event keep_args_event = dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); @@ -196,24 +213,48 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); } - if ((simplified_red_nd == 1) && (simplified_red_src_strides[0] == 1) && - (iter_nd == 1) && - ((simplified_iter_shape[0] == 1) || - ((simplified_iter_dst_strides[0] == 1) && - (simplified_iter_src_strides[0] == - static_cast(red_nelems))))) - { - auto fn = contig_dispatch_vector[src_typeid]; + if (simplified_red_nd == 1 && iter_nd == 1) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; size_t iter_nelems = dst_nelems; - sycl::event red_ev = - fn(exec_q, iter_nelems, red_nelems, src.get_data(), dst.get_data(), - iter_src_offset, iter_dst_offset, red_src_offset, depends); + if (simplified_red_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iter_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iter_dst_strides[0] == 1) && + (static_cast(simplified_iter_src_strides[0]) == + red_nelems); + } + else if (static_cast(simplified_red_src_strides[0]) == + iter_nelems) { + mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) && + (simplified_iter_src_strides[0] == 1); + } + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_contig_dispatch_vector[src_typeid]; - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); - return std::make_pair(keep_args_event, red_ev); + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } } auto fn = strided_dispatch_vector[src_typeid]; diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp new file mode 100644 index 0000000000..4d6ef27a81 --- /dev/null +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -0,0 +1,559 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/repeat.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_fn_ptr_t; +static repeat_by_sequence_fn_ptr_t + repeat_by_sequence_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_1d_fn_ptr_t; +static repeat_by_sequence_1d_fn_ptr_t + repeat_by_sequence_1d_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_fn_ptr_t; +static repeat_by_scalar_fn_ptr_t + repeat_by_scalar_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_1d_fn_ptr_t; +static repeat_by_scalar_1d_fn_ptr_t + repeat_by_scalar_1d_dispatch_vector[td_ns::num_types]; + +void init_repeat_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::repeat::RepeatSequenceFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(repeat_by_sequence_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatSequence1DFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(repeat_by_sequence_1d_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalarFactory; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(repeat_by_scalar_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalar1DFactory; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(repeat_by_scalar_1d_dispatch_vector); +} + +std::pair +py_repeat_by_sequence(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray reps, + dpctl::tensor::usm_ndarray cumsum, + int axis, + sycl::queue exec_q, + std::vector const &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t reps_sz = reps.get_size(); + size_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + size_t orthog_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; 
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis + 1; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+
+    size_t src_axis_nelems(1);
+    if (src_nd > 0) {
+        src_axis_nelems = src_shape[axis];
+    }
+    size_t dst_axis_nelems(dst_shape[axis]);
+
+    // shape at repeated axis must be equal to the sum of reps
+    if (!same_orthog_dims || src_axis_nelems != reps_sz ||
+        src_axis_nelems != cumsum_sz)
+    {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    if (orthog_nelems == 0 || src_axis_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // ensure that dst is sufficiently ample
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < static_cast<size_t>(orthog_nelems * dst_axis_nelems)) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the "
+                "array elements.");
+        }
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src, reps, or cumsum
+    if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int reps_typenum = reps.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data type");
+    }
+
+    constexpr int int64_typeid = static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int64_typeid) {
+        throw py::value_error(
+            "Unexpected data type of `cumsum` array, expecting "
+            "'int64'");
+    }
+
+    if (reps_typeid != cumsum_typeid) {
+        throw py::value_error("`reps` array must have the same elemental "
+                              "data type as cumsum");
+    }
+
+    const char *src_data_p = src.get_data();
+    const char *reps_data_p = reps.get_data();
+    const char *cumsum_data_p = cumsum.get_data();
+    char *dst_data_p = dst.get_data();
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    auto reps_shape_vec = reps.get_shape_vector();
+    auto reps_strides_vec = reps.get_strides_vector();
+
+    sycl::event repeat_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis == 0 && src_nd < 2) {
+        // empty orthogonal directions
+
+        auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid];
+
+        assert(dst_shape_vec.size() == 1);
+        assert(dst_strides_vec.size() == 1);
+
+        py::ssize_t src_shape(0);
+        py::ssize_t src_stride(0);
+        if (src_nd > 0) {
+            src_shape = src_shape_vec[0];
+            src_stride = src_strides_vec[0];
+        }
+
+        repeat_ev =
+            fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p,
+               cumsum_data_p, src_shape, src_stride, dst_shape_vec[0],
+               dst_strides_vec[0], reps_shape_vec[0], reps_strides_vec[0],
+               depends);
+    }
+    else {
+        // non-empty orthogonal directions
+
+        auto fn = repeat_by_sequence_dispatch_vector[src_typeid];
+
+        int orthog_nd = src_nd - 1;
+
+        using shT = std::vector<py::ssize_t>;
+        shT orthog_src_shape;
+        shT orthog_src_strides;
+        shT axis_src_shape;
+        shT axis_src_stride;
+        dpctl::tensor::py_internal::split_iteration_space(
+            src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape,
+            axis_src_shape, orthog_src_strides, axis_src_stride);
+
+        shT orthog_dst_shape;
+        shT orthog_dst_strides;
+        shT axis_dst_shape;
+        shT axis_dst_stride;
+        dpctl::tensor::py_internal::split_iteration_space(
+            dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape,
+            axis_dst_shape, orthog_dst_strides, axis_dst_stride);
+
+        assert(orthog_src_shape.size() == static_cast<size_t>(orthog_nd));
+        assert(orthog_dst_shape.size() == static_cast<size_t>(orthog_nd));
+        assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(),
+                          orthog_dst_shape.begin()));
+
+        std::vector<py::ssize_t> simplified_orthog_shape;
+        std::vector<py::ssize_t> simplified_orthog_src_strides;
+        std::vector<py::ssize_t> simplified_orthog_dst_strides;
+
+        const py::ssize_t *_shape = orthog_src_shape.data();
+
+        py::ssize_t orthog_src_offset(0);
+        py::ssize_t orthog_dst_offset(0);
+        dpctl::tensor::py_internal::simplify_iteration_space(
+            orthog_nd, _shape, orthog_src_strides, orthog_dst_strides,
+            // output
+            simplified_orthog_shape, simplified_orthog_src_strides,
+            simplified_orthog_dst_strides, orthog_src_offset,
+            orthog_dst_offset);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        const auto &ptr_size_event_tuple1 =
+            device_allocate_and_pack<py::ssize_t>(
+                exec_q, host_task_events, simplified_orthog_shape,
+                simplified_orthog_src_strides, simplified_orthog_dst_strides);
+        py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1);
+        if (packed_shapes_strides == nullptr) {
+            throw std::runtime_error("Unable to allocate device memory");
+        }
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        repeat_ev = fn(exec_q, orthog_nelems, src_axis_nelems, src_data_p,
+                       dst_data_p, reps_data_p, cumsum_data_p,
+                       // data to build orthog indexer
+                       orthog_nd, packed_shapes_strides, orthog_src_offset,
+                       orthog_dst_offset,
+                       // data to build indexers along repeated axis in src
+                       axis_src_shape[0], axis_src_stride[0],
+                       // data to build indexer along repeated axis in dst
+                       axis_dst_shape[0], axis_dst_stride[0],
+                       // data to build indexer for reps array
+                       reps_shape_vec[0], reps_strides_vec[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(repeat_ev);
+                auto ctx = exec_q.get_context();
+                cgh.host_task([ctx, packed_shapes_strides] {
+                    sycl::free(packed_shapes_strides, ctx);
+                });
+            });
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+
+    host_task_events.push_back(repeat_ev);
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {src, reps, cumsum, dst}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+py_repeat_by_scalar(dpctl::tensor::usm_ndarray src,
+                    dpctl::tensor::usm_ndarray dst,
+                    const py::ssize_t reps,
+                    int axis,
+                    sycl::queue exec_q,
+                    std::vector<sycl::event> const &depends)
+{
+    int src_nd = src.get_ndim();
+    if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) ||
+        (axis > 0 && src_nd == 0)) {
+        throw py::value_error("Specified axis is invalid.");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) {
+        throw py::value_error("Number of dimensions of source and destination "
+                              "arrays is not consistent");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool same_orthog_dims(true);
+    size_t orthog_nelems(1); // number of orthogonal iterations
+
+    for (auto i = 0; i < axis; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis + 1; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+
+    size_t src_axis_nelems(1);
+    if (src_nd > 0) {
+        src_axis_nelems = src_shape[axis];
+    }
+    size_t dst_axis_nelems(dst_shape[axis]);
+
+    // shape at repeated axis must be equal to the shape of src at the axis *
+    // reps
+    if (!same_orthog_dims || (src_axis_nelems * reps) != dst_axis_nelems) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    if (orthog_nelems == 0 || src_axis_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // ensure that dst is sufficiently ample
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 <
+            static_cast<size_t>(orthog_nelems * (src_axis_nelems * reps))) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the "
+                "array elements.");
+        }
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src
+    if (overlap(dst, src)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data type");
+    }
+
+    const char *src_data_p = src.get_data();
+    char *dst_data_p = dst.get_data();
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    sycl::event repeat_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis == 0 && src_nd < 2) {
+        // empty orthogonal directions
+
+        auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid];
+
+        assert(dst_shape_vec.size() == 1);
+        assert(dst_strides_vec.size() == 1);
+
+        py::ssize_t src_shape(0);
+        py::ssize_t src_stride(0);
+        if (src_nd > 0) {
+            src_shape = src_shape_vec[0];
+            src_stride = src_strides_vec[0];
+        }
+        repeat_ev =
+            fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, src_shape,
+               src_stride, dst_shape_vec[0], dst_strides_vec[0], depends);
+    }
+    else {
+        // non-empty orthogonal directions
+
+        auto fn = repeat_by_scalar_dispatch_vector[src_typeid];
+
+        int orthog_nd = src_nd - 1;
+
+
using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape, + axis_src_shape, orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape, + axis_dst_shape, orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + std::vector simplified_orthog_shape; + std::vector simplified_orthog_src_strides; + std::vector simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); + if (packed_shapes_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, dst_axis_nelems, src_data_p, + dst_data_p, reps, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexer along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_shapes_strides] { + sycl::free(packed_shapes_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(repeat_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.hpp b/dpctl/tensor/libtensor/source/repeat.hpp new file mode 100644 index 0000000000..e7ec59f209 --- /dev/null +++ b/dpctl/tensor/libtensor/source/repeat.hpp @@ -0,0 +1,61 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you 
may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_repeat_dispatch_vectors(void); + +extern std::pair +py_repeat_by_sequence(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray reps, + dpctl::tensor::usm_ndarray cumsum, + int axis, + sycl::queue exec_q, + std::vector const &depends); + +extern std::pair +py_repeat_by_scalar(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + const py::ssize_t reps, + int axis, + sycl::queue exec_q, + std::vector const &depends); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index 31f5250f8a..e77bbbb5ce 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -408,6 +408,50 @@ void compact_iteration_space(int &nd, } } +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + py::ssize_t _ravel_multi_index_c(std::vector const &mi, std::vector const &shape) { diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 275129ad5c..74a80c98b0 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -97,6 +97,16 @@ void 
compact_iteration_space(int &, std::vector &, std::vector &); +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + py::ssize_t _ravel_multi_index_c(std::vector const &, std::vector const &); py::ssize_t _ravel_multi_index_f(std::vector const &, diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 691a56b11f..fec246325c 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,6 +33,7 @@ #include "dpctl4pybind11.hpp" +#include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" #include "boolean_reductions.hpp" #include "copy_and_cast_usm_to_usm.hpp" @@ -45,6 +46,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" +#include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "sum_reductions.hpp" #include "triul_ctor.hpp" @@ -96,6 +98,11 @@ using dpctl::tensor::py_internal::py_mask_positions; using dpctl::tensor::py_internal::py_nonzero; using dpctl::tensor::py_internal::py_place; +/* ================= Repeat ====================*/ +using dpctl::tensor::py_internal::py_cumsum_1d; +using dpctl::tensor::py_internal::py_repeat_by_scalar; +using dpctl::tensor::py_internal::py_repeat_by_sequence; + /* ================ Eye ================== */ using dpctl::tensor::py_internal::usm_ndarray_eye; @@ -132,10 +139,14 @@ void init_dispatch_vectors(void) init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); - populate_mask_positions_dispatch_vectors(); populate_masked_extract_dispatch_vectors(); populate_masked_place_dispatch_vectors(); + populate_mask_positions_dispatch_vectors(); + + populate_cumsum_1d_dispatch_vectors(); + init_repeat_dispatch_vectors(); + return; } @@ -353,6 +364,9 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("cumsum"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); @@ -387,6 +401,14 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_repeat_by_sequence", &py_repeat_by_sequence, "", py::arg("src"), + py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_repeat_by_scalar", &py_repeat_by_scalar, "", py::arg("src"), + py::arg("dst"), py::arg("reps"), py::arg("axis"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py index 6152a15aae..2126727d5b 100644 --- a/dpctl/tests/test_usm_ndarray_manipulation.py +++ b/dpctl/tests/test_usm_ndarray_manipulation.py @@ -17,11 +17,12 @@ import numpy as np import pytest -from helper import get_queue_or_skip from numpy.testing import assert_, assert_array_equal, assert_raises_regex import dpctl import dpctl.tensor as dpt +from dpctl.tests.helper import 
get_queue_or_skip +from dpctl.utils import ExecutionPlacementError def test_permute_dims_incorrect_type(): @@ -1116,3 +1117,314 @@ def test_finfo_object(): assert isinstance(fi.dtype, dpt.dtype) assert isinstance(str(fi), str) assert isinstance(repr(fi), str) + + +def test_repeat_scalar_sequence_agreement(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + expected_res = dpt.empty(10, dtype="i4") + expected_res[1::2], expected_res[::2] = x, x + + # scalar case + reps = 2 + res = dpt.repeat(x, reps) + assert dpt.all(res == expected_res) + + # tuple + reps = (2, 2, 2, 2, 2) + res = dpt.repeat(x, reps) + assert dpt.all(res == expected_res) + + +def test_repeat_as_broadcasting(): + get_queue_or_skip() + + reps = 5 + x = dpt.arange(reps, dtype="i4") + x1 = x[:, dpt.newaxis] + expected_res = dpt.broadcast_to(x1, (reps, reps)) + + res = dpt.repeat(x1, reps, axis=1) + assert dpt.all(res == expected_res) + + x2 = x[dpt.newaxis, :] + expected_res = dpt.broadcast_to(x2, (reps, reps)) + + res = dpt.repeat(x2, reps, axis=0) + assert dpt.all(res == expected_res) + + +def test_repeat_axes(): + get_queue_or_skip() + + reps = 2 + x = dpt.reshape(dpt.arange(5 * 10, dtype="i4"), (5, 10)) + expected_res = dpt.empty((x.shape[0] * 2, x.shape[1]), x.dtype) + expected_res[::2, :], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + + expected_res = dpt.empty((x.shape[0], x.shape[1] * 2), x.dtype) + expected_res[:, ::2], expected_res[:, 1::2] = x, x + res = dpt.repeat(x, reps, axis=1) + assert dpt.all(res == expected_res) + + +def test_repeat_size_0_outputs(): + get_queue_or_skip() + + x = dpt.ones((3, 0, 5), dtype="i4") + reps = 10 + res = dpt.repeat(x, reps, axis=0) + assert res.size == 0 + assert res.shape == (30, 0, 5) + + res = dpt.repeat(x, reps, axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + res = dpt.repeat(x, (2, 2, 2), axis=0) + assert res.size == 0 + assert res.shape == (6, 0, 5) + + x = dpt.ones((3, 2, 5)) + res = dpt.repeat(x, 0, axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + x = dpt.ones((3, 2, 5)) + res = dpt.repeat(x, (0, 0), axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + +def test_repeat_strides(): + get_queue_or_skip() + + reps = 2 + x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10)) + x1 = x[:, ::-2] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[:, ::2], expected_res[:, 1::2] = x1, x1 + res = dpt.repeat(x1, reps, axis=1) + assert dpt.all(res == expected_res) + res = dpt.repeat(x1, (reps,) * x1.shape[1], axis=1) + assert dpt.all(res == expected_res) + + x1 = x[::-2, :] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[::2, :], expected_res[1::2, :] = x1, x1 + res = dpt.repeat(x1, reps, axis=0) + assert dpt.all(res == expected_res) + res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0) + assert dpt.all(res == expected_res) + + +def test_repeat_casting(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + # i4 is cast to i8 + reps = dpt.ones(5, dtype="i4") + res = dpt.repeat(x, reps) + assert res.shape == x.shape + assert dpt.all(res == x) + + +def test_repeat_strided_repeats(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + reps = dpt.ones(10, dtype="i8") + reps[::2] = 0 + reps = reps[::-2] + res = dpt.repeat(x, reps) + assert res.shape == x.shape + assert dpt.all(res == x) + + +def test_repeat_arg_validation(): + get_queue_or_skip() + + x = dict() + with pytest.raises(TypeError): + dpt.repeat(x, 2) + + # axis must be 0 for 
scalar + x = dpt.empty(()) + with pytest.raises(ValueError): + dpt.repeat(x, 2, axis=1) + + # x.ndim cannot be > 1 for axis=None + x = dpt.empty((5, 10)) + with pytest.raises(ValueError): + dpt.repeat(x, 2, axis=None) + + # repeats must be positive + x = dpt.empty(5) + with pytest.raises(ValueError): + dpt.repeat(x, -2) + + # repeats must be integers + with pytest.raises(TypeError): + dpt.repeat(x, 2.0) + + # repeats tuple must be the same length as axis + with pytest.raises(ValueError): + dpt.repeat(x, (1, 2)) + + # repeats tuple elements must be positive + with pytest.raises(ValueError): + dpt.repeat(x, (-1,)) + + # repeats must be int or tuple + with pytest.raises(TypeError): + dpt.repeat(x, dict()) + + # repeats array must be 0d or 1d + with pytest.raises(ValueError): + dpt.repeat(x, dpt.ones((1, 1), dtype="i8")) + + # repeats must be castable to i8 + with pytest.raises(TypeError): + dpt.repeat(x, dpt.asarray(2.0, dtype="f4")) + + # compute follows data + q2 = dpctl.SyclQueue() + reps = dpt.asarray(1, dtype="i8", sycl_queue=q2) + with pytest.raises(ExecutionPlacementError): + dpt.repeat(x, reps) + + # repeats array must not contain negative elements + reps = dpt.asarray(-1, dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + reps = dpt.asarray([1, 1, 1, 1, -1], dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + + # repeats must broadcastable to axis size + reps = dpt.arange(10, dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + + +def test_tile_basic(): + get_queue_or_skip() + + reps = 2 + x = dpt.arange(5, dtype="i4") + res = dpt.tile(x, reps) + assert res.shape == (x.shape[0] * reps,) + assert dpt.all(res[: x.size] == res[x.size :]) + + reps = (2, 1) + expected_sh = (2, x.shape[0]) + expected_res = dpt.broadcast_to(x, expected_sh) + res = dpt.tile(x, reps) + assert res.shape == expected_sh + assert dpt.all(expected_res == res) + + +def test_tile_size_1(): + get_queue_or_skip() + + reps = 5 + # test for 0d array + x = dpt.asarray(2, dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == dpt.full(reps, 2, dtype="i4")) + + # test for 1d array with single element + x = dpt.asarray([2], dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == dpt.full(reps, 2, dtype="i4")) + + # test empty reps returns copy of input + reps = () + res = dpt.tile(x, reps) + assert x.shape == res.shape + assert x == res + + +def test_tile_prepends_axes(): + get_queue_or_skip() + + reps = (2,) + x = dpt.ones((5, 10), dtype="i4") + expected_res = dpt.ones((5, 20), dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == expected_res) + + reps = (3, 2, 2) + expected_res = dpt.ones((3, 10, 20), dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == expected_res) + + +def test_tile_empty_outputs(): + get_queue_or_skip() + + x = dpt.asarray((), dtype="i4") + reps = 10 + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (0,) + + x = dpt.ones((3, 0, 5), dtype="i4") + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (3, 0, 50) + + reps = (2, 1, 2) + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (6, 0, 10) + + x = dpt.ones((2, 3, 4), dtype="i4") + reps = (0, 1, 1) + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (0, 3, 4) + + +def test_tile_strides(): + get_queue_or_skip() + + reps = (1, 2) + x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10)) + x1 = x[:, ::-2] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[:, : x1.shape[1]], 
expected_res[:, x1.shape[1] :] = x1, x1 + res = dpt.tile(x1, reps) + assert dpt.all(res == expected_res) + + reps = (2, 1) + x1 = x[::-2, :] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[: x1.shape[0], :], expected_res[x1.shape[0] :, :] = x1, x1 + res = dpt.tile(x1, reps) + assert dpt.all(res == expected_res) + + +def test_tile_size_1_axes(): + get_queue_or_skip() + + reps = (1, 2, 1) + x = dpt.ones((2, 1, 3), dtype="i4") + res = dpt.tile(x, reps) + expected_res = dpt.broadcast_to(x, (2, 2, 3)) + assert dpt.all(res == expected_res) + + +def test_tile_arg_validation(): + get_queue_or_skip() + + with pytest.raises(TypeError): + dpt.tile(dict(), 2) + + # repetitions must be int or tuple + x = dpt.empty(()) + with pytest.raises(TypeError): + dpt.tile(x, dict())