From 83858f05b855b7186a85dff78a42c8f897999e1a Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Sat, 19 Aug 2023 18:29:06 -0700 Subject: [PATCH 01/44] Implements dpctl.tensor.matrix_transpose (#1356) * Implements matrix_transpose - Function wrapper for call to dpctl.tensor.usm_ndarray.mT attribute * Add arg validation tests for matrix_transpose * Added a test for matrix_transpose for coverage --- dpctl/tensor/__init__.py | 2 + dpctl/tensor/_linear_algebra_functions.py | 48 +++++++++++++++++++++++ dpctl/tests/test_usm_ndarray_linalg.py | 48 +++++++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 dpctl/tensor/_linear_algebra_functions.py create mode 100644 dpctl/tests/test_usm_ndarray_linalg.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 7438fb8a67..ad51689f3f 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -60,6 +60,7 @@ from dpctl.tensor._device import Device from dpctl.tensor._dlpack import from_dlpack from dpctl.tensor._indexing_functions import extract, nonzero, place, put, take +from dpctl.tensor._linear_algebra_functions import matrix_transpose from dpctl.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, @@ -199,6 +200,7 @@ "tril", "triu", "where", + "matrix_transpose", "all", "any", "dtype", diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py new file mode 100644 index 0000000000..fd2c58b08a --- /dev/null +++ b/dpctl/tensor/_linear_algebra_functions.py @@ -0,0 +1,48 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dpctl.tensor as dpt + + +def matrix_transpose(x): + """matrix_transpose(x) + + Transposes the innermost two dimensions of `x`, where `x` is a + 2-dimensional matrix or a stack of 2-dimensional matrices. + + To convert from a 1-dimensional array to a 2-dimensional column + vector, use x[:, dpt.newaxis]. + + Args: + x (usm_ndarray): + Input array with shape (..., m, n). + + Returns: + usm_ndarray: + Array with shape (..., n, m). + """ + + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + if x.ndim < 2: + raise ValueError( + "dpctl.tensor.matrix_transpose requires array to have" + "at least 2 dimensions" + ) + + return x.mT diff --git a/dpctl/tests/test_usm_ndarray_linalg.py b/dpctl/tests/test_usm_ndarray_linalg.py new file mode 100644 index 0000000000..4023eb8ad7 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_linalg.py @@ -0,0 +1,48 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip + + +def test_matrix_transpose(): + get_queue_or_skip() + + X = dpt.reshape(dpt.arange(2 * 3, dtype="i4"), (2, 3)) + res = dpt.matrix_transpose(X) + expected_res = X.mT + + assert expected_res.shape == res.shape + assert expected_res.flags["C"] == res.flags["C"] + assert expected_res.flags["F"] == res.flags["F"] + assert dpt.all(X.mT == res) + + +def test_matrix_transpose_arg_validation(): + get_queue_or_skip() + + X = dpt.empty(5, dtype="i4") + with pytest.raises(ValueError): + dpt.matrix_transpose(X) + + X = dict() + with pytest.raises(TypeError): + dpt.matrix_transpose(X) + + X = dpt.empty((5, 5), dtype="i4") + assert isinstance(dpt.matrix_transpose(X), dpt.usm_ndarray) From abc8c8044cb590ee51a256d77f33f8fe147b5f6c Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Sat, 19 Aug 2023 22:09:03 -0700 Subject: [PATCH 02/44] _real_view and _imag_view now set flags correctly (#1355) - these properties were setting the flags of the output to the flags of the input, which is incorrect, as the output is almost never contiguous - added tests for this behavior --- dpctl/tensor/_usmarray.pyx | 10 +++++----- dpctl/tests/test_usm_ndarray_ctor.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 83e5e84759..d4c0a69dfd 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -662,7 +662,7 @@ cdef class usm_ndarray: """ if self.nd_ < 2: raise ValueError( - "array.mT requires array to have at least 2-dimensons." + "array.mT requires array to have at least 2 dimensions." 
) return _m_transpose(self) @@ -1216,14 +1216,14 @@ cdef usm_ndarray _real_view(usm_ndarray ary): offset_elems = ary.get_offset() * 2 r = usm_ndarray.__new__( usm_ndarray, - _make_int_tuple(ary.nd_, ary.shape_), + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), dtype=_make_typestr(r_typenum_), strides=tuple(2 * si for si in ary.strides), buffer=ary.base_, offset=offset_elems, order=('C' if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else 'F') ) - r.flags_ = ary.flags_ + r.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) r.array_namespace_ = ary.array_namespace_ return r @@ -1248,14 +1248,14 @@ cdef usm_ndarray _imag_view(usm_ndarray ary): offset_elems = 2 * ary.get_offset() + 1 r = usm_ndarray.__new__( usm_ndarray, - _make_int_tuple(ary.nd_, ary.shape_), + _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(), dtype=_make_typestr(r_typenum_), strides=tuple(2 * si for si in ary.strides), buffer=ary.base_, offset=offset_elems, order=('C' if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else 'F') ) - r.flags_ = ary.flags_ + r.flags_ |= (ary.flags_ & USM_ARRAY_WRITABLE) r.array_namespace_ = ary.array_namespace_ return r diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index 77edc1f22e..8e71f3931d 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -1438,17 +1438,26 @@ def test_real_imag_views(): n, m = 2, 3 try: X = dpt.usm_ndarray((n, m), "c8") + X_scalar = dpt.usm_ndarray((), dtype="c8") except dpctl.SyclDeviceCreationError: pytest.skip("No SYCL devices available") Xnp_r = np.arange(n * m, dtype="f4").reshape((n, m)) Xnp_i = np.arange(n * m, 2 * n * m, dtype="f4").reshape((n, m)) Xnp = Xnp_r + 1j * Xnp_i X[:] = Xnp - assert np.array_equal(dpt.to_numpy(X.real), Xnp.real) + X_real = X.real + X_imag = X.imag + assert np.array_equal(dpt.to_numpy(X_real), Xnp.real) assert np.array_equal(dpt.to_numpy(X.imag), Xnp.imag) + assert not X_real.flags["C"] and not X_real.flags["F"] + assert not X_imag.flags["C"] and not X_imag.flags["F"] + assert X_real.strides == X_imag.strides assert np.array_equal(dpt.to_numpy(X[1:].real), Xnp[1:].real) assert np.array_equal(dpt.to_numpy(X[1:].imag), Xnp[1:].imag) + X_scalar[...] = complex(n * m, 2 * n * m) + assert X_scalar.real and X_scalar.imag + @pytest.mark.parametrize( "dtype", From bcefda7a99cf22d01e222f77eff9291890b7bdb2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 21 Aug 2023 05:27:48 -0500 Subject: [PATCH 03/44] Closes gh-1358 Removes deprecated DPCTLDevice_GetMaxWorkItemSizes. Added Null_DRef tests for DPCTLDevice_GetMaxWorkItemSizes1d, DPCTLDevice_GetMaxWorkItemSizes2d, DPCTLDevice_GetMaxWorkItemSizes3d --- .../include/dpctl_sycl_device_interface.h | 13 ---------- .../source/dpctl_sycl_device_interface.cpp | 6 ----- .../tests/test_sycl_device_interface.cpp | 26 ++++++++++++------- 3 files changed, 16 insertions(+), 29 deletions(-) diff --git a/libsyclinterface/include/dpctl_sycl_device_interface.h b/libsyclinterface/include/dpctl_sycl_device_interface.h index 50c4d1f1f2..3050d7449f 100644 --- a/libsyclinterface/include/dpctl_sycl_device_interface.h +++ b/libsyclinterface/include/dpctl_sycl_device_interface.h @@ -233,19 +233,6 @@ DPCTL_API __dpctl_keep size_t * DPCTLDevice_GetMaxWorkItemSizes3d(__dpctl_keep const DPCTLSyclDeviceRef DRef); -/*! - * @brief Wrapper for deprecated get_info(). - * - * @param DRef Opaque pointer to a ``sycl::device`` - * @return Returns the valid result if device exists else returns NULL. 
- * @ingroup DeviceInterface - */ -#if __cplusplus || (defined(__GNUC__) && __GNUC__ > 10) -[[deprecated("Use DPCTLDevice_WorkItemSizes3d instead")]] -#endif -DPCTL_API __dpctl_keep size_t * -DPCTLDevice_GetMaxWorkItemSizes(__dpctl_keep const DPCTLSyclDeviceRef DRef); - /*! * @brief Wrapper for get_info(). * diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp index bcdd46a9fd..73dfcc7e11 100644 --- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp @@ -266,12 +266,6 @@ DPCTLDevice_GetMaxWorkItemSizes3d(__dpctl_keep const DPCTLSyclDeviceRef DRef) return DPCTLDevice__GetMaxWorkItemSizes<3>(DRef); } -__dpctl_keep size_t * -DPCTLDevice_GetMaxWorkItemSizes(__dpctl_keep const DPCTLSyclDeviceRef DRef) -{ - return DPCTLDevice__GetMaxWorkItemSizes<3>(DRef); -} - size_t DPCTLDevice_GetMaxWorkGroupSize(__dpctl_keep const DPCTLSyclDeviceRef DRef) { diff --git a/libsyclinterface/tests/test_sycl_device_interface.cpp b/libsyclinterface/tests/test_sycl_device_interface.cpp index 7e92c8c9de..dd20c738df 100644 --- a/libsyclinterface/tests/test_sycl_device_interface.cpp +++ b/libsyclinterface/tests/test_sycl_device_interface.cpp @@ -175,14 +175,6 @@ TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkItemSizes3d) EXPECT_NO_FATAL_FAILURE(DPCTLSize_t_Array_Delete(sizes)); } -TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkItemSizes) -{ - size_t *sizes = nullptr; - EXPECT_NO_FATAL_FAILURE(sizes = DPCTLDevice_GetMaxWorkItemSizes(DRef)); - EXPECT_TRUE(sizes != nullptr); - EXPECT_NO_FATAL_FAILURE(DPCTLSize_t_Array_Delete(sizes)); -} - TEST_P(TestDPCTLSyclDeviceInterface, ChkGetMaxWorkGroupSize) { size_t n = 0; @@ -625,10 +617,24 @@ TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemDims) ASSERT_TRUE(md == 0); } -TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes) +TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes1d) +{ + size_t *sz = nullptr; + EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes1d(Null_DRef)); + ASSERT_TRUE(sz == nullptr); +} + +TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes2d) +{ + size_t *sz = nullptr; + EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes2d(Null_DRef)); + ASSERT_TRUE(sz == nullptr); +} + +TEST_F(TestDPCTLSyclDeviceNullArgs, ChkGetMaxWorkItemSizes3d) { size_t *sz = nullptr; - EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes(Null_DRef)); + EXPECT_NO_FATAL_FAILURE(sz = DPCTLDevice_GetMaxWorkItemSizes3d(Null_DRef)); ASSERT_TRUE(sz == nullptr); } From 4cc552f10f22a207073d0f4104a4f4e0882e4648 Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Mon, 21 Aug 2023 09:03:47 -0700 Subject: [PATCH 04/44] Implements ``types`` property for elementwise functions (#1361) * Implements ``types`` property for elementwise functions - Output corresponds with Numpy's: a list with an arrow marking the domain to range type map * Added tests for behavior of types property --- dpctl/tensor/_elementwise_common.py | 29 +++++++++++++++++++++++++++++ dpctl/tests/elementwise/test_abs.py | 9 +++++++++ dpctl/tests/elementwise/test_add.py | 9 +++++++++ 3 files changed, 47 insertions(+) diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 002b0ef5ec..08c09f0a7c 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -29,6 +29,7 @@ from ._copy_utils import _empty_like_orderK, 
_empty_like_pair_orderK from ._type_utils import ( _acceptance_fn_default, + _all_data_types, _find_buf_dtype, _find_buf_dtype2, _to_device_supported_dtype, @@ -44,6 +45,7 @@ def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs): self.__name__ = "UnaryElementwiseFunc" self.name_ = name self.result_type_resolver_fn_ = result_type_resolver_fn + self.types_ = None self.unary_fn_ = unary_dp_impl_fn self.__doc__ = docs @@ -53,6 +55,18 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + @property + def types(self): + types = self.types_ + if not types: + types = [] + for dt1 in _all_data_types(True, True): + dt2 = self.result_type_resolver_fn_(dt1) + if dt2: + types.append(f"{dt1.char}->{dt2.char}") + self.types_ = types + return types + def __call__(self, x, out=None, order="K"): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -363,6 +377,7 @@ def __init__( self.__name__ = "BinaryElementwiseFunc" self.name_ = name self.result_type_resolver_fn_ = result_type_resolver_fn + self.types_ = None self.binary_fn_ = binary_dp_impl_fn self.binary_inplace_fn_ = binary_inplace_fn self.__doc__ = docs @@ -377,6 +392,20 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + @property + def types(self): + types = self.types_ + if not types: + types = [] + _all_dtypes = _all_data_types(True, True) + for dt1 in _all_dtypes: + for dt2 in _all_dtypes: + dt3 = self.result_type_resolver_fn_(dt1, dt2) + if dt3: + types.append(f"{dt1.char}{dt2.char}->{dt3.char}") + self.types_ = types + return types + def __call__(self, o1, o2, out=None, order="K"): if order not in ["K", "C", "F", "A"]: order = "K" diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py index 9c800af812..2d2ec96fec 100644 --- a/dpctl/tests/elementwise/test_abs.py +++ b/dpctl/tests/elementwise/test_abs.py @@ -76,6 +76,15 @@ def test_abs_usm_type(usm_type): assert np.allclose(dpt.asnumpy(Y), expected_Y) +def test_abs_types_prop(): + types = dpt.abs.types_ + assert types is None + types = dpt.abs.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.abs.types_ + + @pytest.mark.parametrize("dtype", _all_dtypes[1:]) def test_abs_order(dtype): q = get_queue_or_skip() diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py index 891dda5252..2f5fd7c02a 100644 --- a/dpctl/tests/elementwise/test_add.py +++ b/dpctl/tests/elementwise/test_add.py @@ -258,6 +258,15 @@ def __sycl_usm_array_interface__(self): dpt.add(a, c) +def test_add_types_property(): + types = dpt.add.types_ + assert types is None + types = dpt.add.types + assert isinstance(types, list) + assert len(types) > 0 + assert types == dpt.add.types_ + + def test_add_errors(): get_queue_or_skip() try: From ddcb0aec41d98a2ac500f81ab40306a563d32cdc Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 21 Aug 2023 20:16:44 -0500 Subject: [PATCH 05/44] Corrected messsage text in two exceptions --- dpctl/tensor/_elementwise_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 08c09f0a7c..7d46a3d814 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -468,7 +468,7 @@ def __call__(self, o1, o2, out=None, order="K"): o1_dtype = _get_dtype(o1, sycl_dev) o2_dtype = _get_dtype(o2, sycl_dev) if not all(_validate_dtype(o) for o 
in (o1_dtype, o2_dtype)): - raise ValueError("Operands of unsupported types") + raise ValueError("Operands have unsupported data types") o1_dtype, o2_dtype = _resolve_weak_types(o1_dtype, o2_dtype, sycl_dev) @@ -498,7 +498,7 @@ def __call__(self, o1, o2, out=None, order="K"): if out.shape != res_shape: raise ValueError( "The shape of input and output arrays are inconsistent. " - f"Expected output shape is {o1_shape}, got {out.shape}" + f"Expected output shape is {res_shape}, got {out.shape}" ) if res_dt != out.dtype: From c8326021c7562cd70183fe0874fc9021a688568f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 22 Aug 2023 19:57:36 -0500 Subject: [PATCH 06/44] Closes gh-1295 Since numba-dpex has dropped Py 3.8 from its matrix of builds, we can do the same now with dpnp to follow. --- .github/workflows/conda-package.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index f717f4b309..9298742ac6 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -18,7 +18,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v3 with: @@ -63,7 +63,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] env: conda-bld: C:\Miniconda\conda-bld\win-64\ steps: @@ -102,7 +102,7 @@ jobs: strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] experimental: [false] runner: [ubuntu-20.04] continue-on-error: ${{ matrix.experimental }} @@ -185,7 +185,7 @@ jobs: shell: cmd /C CALL {0} strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] experimental: [false] runner: [windows-latest] continue-on-error: ${{ matrix.experimental }} @@ -300,7 +300,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - name: Download artifact uses: actions/download-artifact@v3 @@ -324,7 +324,7 @@ jobs: runs-on: windows-latest strategy: matrix: - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11'] steps: - name: Download artifact uses: actions/download-artifact@v3 From 68ad0540a079a2b6f0e07ddcd85dc2b274143860 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 22 Aug 2023 19:59:07 -0500 Subject: [PATCH 07/44] Using Py 3.11 in OS-LLVM-SYCL-BUILD workflow Also install ninja from pip instead of apt. 
--- .github/workflows/os-llvm-sycl-build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml index 55d57653f4..d12747f3a9 100644 --- a/.github/workflows/os-llvm-sycl-build.yml +++ b/.github/workflows/os-llvm-sycl-build.yml @@ -97,18 +97,18 @@ jobs: - name: Install system components shell: bash -l {0} run: | - sudo apt-get install ninja-build libtinfo5 + sudo apt-get install libtinfo5 - name: Setup Python uses: actions/setup-python@v4 with: - python-version: '3.9' + python-version: '3.11' architecture: x64 - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest scikit-build cmake + pip install numpy cython setuptools pytest scikit-build cmake ninja - name: Checkout repo uses: actions/checkout@v3 From 6f22fa04400ca10a060c591420e117cef9998f8c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 23 Aug 2023 01:24:39 -0500 Subject: [PATCH 08/44] Use older version of Sphinx, see it fixes documentation build --- .github/workflows/generate-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml index 768d958e02..bae12e4373 100644 --- a/.github/workflows/generate-docs.yml +++ b/.github/workflows/generate-docs.yml @@ -49,7 +49,7 @@ jobs: if: ${{ !github.event.pull_request || github.event.action != 'closed' }} shell: bash -l {0} run: | - pip install numpy cython setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics + pip install numpy cython setuptools scikit-build cmake sphinx"<7.2" sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics - name: Checkout repo uses: actions/checkout@v3 with: From 467d133a82ee44135f70c029f4ea081d989ec9a2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 24 Aug 2023 09:44:17 -0500 Subject: [PATCH 09/44] Separate copy_usm_ndarray_for_reshape and copy_usm_ndarray_for_roll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `copy_usm_ndarray_for_reshape` allowed shift parameter which allowed to double its use for implementing `roll` function. It was suboptimal though, since for `roll` both source and destination array have the same shape, and stride simplification applies. It also makes sense to create dedicated kernel to implement `roll` for contiguous inputs, makings computations measurably faster. This PR removes support for `shift` parameter from _tensor_impl._copy_usm_ndarray_for_reshape and introduces _tensor_impl._copy_usm_ndarray_for_roll. This latter function ensures same shape, applies stride simplification and dispatches to specialized kernels for contiguous inputs. Even for strided inputs less metadata should be copied for the kernel to use (the shape is common, unlike in reshape). The result of this change is that _copy_usm_ndarray_for_roll runs about 4.5x faster in an input array with about a million elements than priovious call to _copy_usm_ndarray_for_reshape with shift parameter set: ``` In [1]: import numpy as np, dpctl.tensor as dpt In [2]: a = np.ones((3,4,5,6,7,8)) In [3]: b = dpt.ones((3,4,5,6,7,8)) In [4]: w = dpt.empty_like(b) In [5]: import dpctl.tensor._tensor_impl as ti In [6]: %timeit ti._copy_usm_ndarray_for_roll(src=b, dst=w, shift=2, sycl_queue=b.sycl_queue)[0].wait() 161 µs ± 12.4 µs per loop (mean ± std. dev. 
of 7 runs, 10,000 loops each) In [7]: b.size Out[7]: 20160 In [8]: b = dpt.ones((30,40,5,6,7,80)) In [9]: w = dpt.empty_like(b) In [10]: %timeit ti._copy_usm_ndarray_for_roll(src=b, dst=w, shift=2, sycl_queue=b.sycl_queue)[0].wait() 4.91 ms ± 90.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [11]: a = np.ones(b.shape) In [12]: %timeit np.roll(a,2) 23 ms ± 367 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` Previously: ``` In [8]: %timeit ti._copy_usm_ndarray_for_reshape(src=b, dst=w, shift=2, sycl_queue=b.sycl_queue)[0].wait() 20.1 ms ± 492 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [9]: %timeit ti._copy_usm_ndarray_for_reshape(src=b, dst=w, shift=2, sycl_queue=b.sycl_queue)[0].wait() 19.9 ms ± 410 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [10]: %timeit ti._copy_usm_ndarray_for_reshape(src=b, dst=w, shift=0, sycl_queue=b.sycl_queue)[0].wait() 19.7 ms ± 488 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) In [11]: b.shape Out[11]: (30, 40, 5, 6, 7, 80) ``` --- dpctl/tensor/CMakeLists.txt | 1 + .../include/kernels/copy_and_cast.hpp | 291 +++++++++++++++--- .../libtensor/source/copy_for_reshape.cpp | 7 +- .../libtensor/source/copy_for_reshape.hpp | 1 - .../tensor/libtensor/source/copy_for_roll.cpp | 267 ++++++++++++++++ .../tensor/libtensor/source/copy_for_roll.hpp | 51 +++ dpctl/tensor/libtensor/source/tensor_py.cpp | 14 + 7 files changed, 586 insertions(+), 46 deletions(-) create mode 100644 dpctl/tensor/libtensor/source/copy_for_roll.cpp create mode 100644 dpctl/tensor/libtensor/source/copy_for_roll.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index ca83b8350b..49f25aef6a 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -37,6 +37,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index f1e63ccc60..4be31578ee 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -56,9 +56,6 @@ class copy_cast_contig_kernel; template class copy_cast_from_host_kernel; -template -class copy_for_reshape_generic_kernel; - template class Caster { public: @@ -629,27 +626,24 @@ struct CopyAndCastFromHostFactory // =============== Copying for reshape ================== // +template +class copy_for_reshape_generic_kernel; + template class GenericCopyForReshapeFunctor { private: - py::ssize_t offset = 0; - py::ssize_t size = 1; - // USM array of size 2*(src_nd + dst_nd) - // [ src_shape; src_strides; dst_shape; dst_strides ] - Ty *src_p = nullptr; + const Ty *src_p = nullptr; Ty *dst_p = nullptr; SrcIndexerT src_indexer_; DstIndexerT dst_indexer_; public: - GenericCopyForReshapeFunctor(py::ssize_t shift, - py::ssize_t nelems, - char *src_ptr, + GenericCopyForReshapeFunctor(const char *src_ptr, char *dst_ptr, SrcIndexerT src_indexer, DstIndexerT dst_indexer) - : offset(shift), 
size(nelems), src_p(reinterpret_cast(src_ptr)), + : src_p(reinterpret_cast(src_ptr)), dst_p(reinterpret_cast(dst_ptr)), src_indexer_(src_indexer), dst_indexer_(dst_indexer) { @@ -657,40 +651,31 @@ class GenericCopyForReshapeFunctor void operator()(sycl::id<1> wiid) const { - py::ssize_t this_src_offset = src_indexer_(wiid.get(0)); - const Ty *in = src_p + this_src_offset; - - py::ssize_t shifted_wiid = - (static_cast(wiid.get(0)) + offset) % size; - shifted_wiid = (shifted_wiid >= 0) ? shifted_wiid : shifted_wiid + size; + const py::ssize_t src_offset = src_indexer_(wiid.get(0)); + const py::ssize_t dst_offset = dst_indexer_(wiid.get(0)); - py::ssize_t this_dst_offset = dst_indexer_(shifted_wiid); - - Ty *out = dst_p + this_dst_offset; - *out = *in; + dst_p[dst_offset] = src_p[src_offset]; } }; // define function type typedef sycl::event (*copy_for_reshape_fn_ptr_t)( sycl::queue, - py::ssize_t, // shift - size_t, // num_elements - int, - int, // src_nd, dst_nd + size_t, // num_elements + int, // src_nd + int, // dst_nd py::ssize_t *, // packed shapes and strides - char *, // src_data_ptr + const char *, // src_data_ptr char *, // dst_data_ptr const std::vector &); /*! * @brief Function to copy content of array while reshaping. * - * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * Submits a kernel to perform a copy `dst[unravel_index(i, * dst.shape)] = src[unravel_undex(i, src.shape)]`. * * @param q The execution queue where kernel is submitted. - * @param shift The shift in flat indexing. * @param nelems The number of elements to copy * @param src_nd Array dimension of the source array * @param dst_nd Array dimension of the destination array @@ -708,31 +693,40 @@ typedef sycl::event (*copy_for_reshape_fn_ptr_t)( template sycl::event copy_for_reshape_generic_impl(sycl::queue q, - py::ssize_t shift, size_t nelems, int src_nd, int dst_nd, py::ssize_t *packed_shapes_and_strides, - char *src_p, + const char *src_p, char *dst_p, const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { - StridedIndexer src_indexer{ - src_nd, 0, - const_cast(packed_shapes_and_strides)}; - StridedIndexer dst_indexer{ - dst_nd, 0, - const_cast(packed_shapes_and_strides + - (2 * src_nd))}; cgh.depends_on(depends); - cgh.parallel_for>( + + // packed_shapes_and_strides: + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + + const py::ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); + + const py::ssize_t *dst_shape_and_strides = + const_cast(packed_shapes_and_strides + + (2 * src_nd)); + + StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; + StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; + + using KernelName = + copy_for_reshape_generic_kernel; + + cgh.parallel_for( sycl::range<1>(nelems), GenericCopyForReshapeFunctor( - shift, nelems, src_p, dst_p, src_indexer, dst_indexer)); + src_p, dst_p, src_indexer, dst_indexer)); }); return copy_for_reshape_ev; @@ -752,6 +746,221 @@ template struct CopyForReshapeGenericFactory } }; +// =============== Copying for reshape ================== // + +template +class copy_for_roll_strided_kernel; + +template +class StridedCopyForRollFunctor +{ +private: + size_t offset = 0; + size_t size = 1; + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + StridedCopyForRollFunctor(size_t shift, + size_t 
nelems, + const Ty *src_ptr, + Ty *dst_ptr, + SrcIndexerT src_indexer, + DstIndexerT dst_indexer) + : offset(shift), size(nelems), src_p(src_ptr), dst_p(dst_ptr), + src_indexer_(src_indexer), dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const size_t gid = wiid.get(0); + const size_t shifted_gid = + ((gid < offset) ? gid + size - offset : gid - offset); + + const py::ssize_t src_offset = src_indexer_(shifted_gid); + const py::ssize_t dst_offset = dst_indexer_(gid); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( + sycl::queue, + size_t, // shift + size_t, // num_elements + int, // common_nd + const py::ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +template class copy_for_roll_contig_kernel; + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param nd Array dimensionality of the destination and source arrays + * @param packed_shapes_and_strides Kernel accessible USM array of size + * `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of first element of src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of first element of dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event +copy_for_roll_strided_impl(sycl::queue q, + size_t shift, + size_t nelems, + int nd, + const py::ssize_t *packed_shapes_and_strides, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 3 * nd + // [ common_shape; src_strides; dst_strides ] + + StridedIndexer src_indexer{nd, src_offset, packed_shapes_and_strides}; + UnpackedStridedIndexer dst_indexer{nd, dst_offset, + packed_shapes_and_strides, + packed_shapes_and_strides + 2 * nd}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + shift, nelems, src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +// define function type +typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( + sycl::queue, + size_t, // shift + size_t, // num_elements + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +/*! + * @brief Function to copy content of array with a shift. 
+ * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of the start of array src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of the start of array dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_contig_impl(sycl::queue q, + size_t shift, + size_t nelems, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + NoOpIndexer src_indexer{}; + NoOpIndexer dst_indexer{}; + + using KernelName = copy_for_roll_contig_kernel; + + const Ty *src_tp = reinterpret_cast(src_p) + src_offset; + Ty *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + shift, nelems, src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + } // namespace copy_and_cast } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp index 8edf982b16..7114d87c47 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp @@ -60,7 +60,6 @@ static copy_for_reshape_fn_ptr_t std::pair copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, sycl::queue exec_q, const std::vector &depends) { @@ -109,7 +108,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, if (src_nelems == 1) { // handle special case of 1-element array int src_elemsize = src.get_elemsize(); - char *src_data = src.get_data(); + const char *src_data = src.get_data(); char *dst_data = dst.get_data(); sycl::event copy_ev = exec_q.copy(src_data, dst_data, src_elemsize); @@ -146,7 +145,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, } sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); - char *src_data = src.get_data(); + const char *src_data = src.get_data(); char *dst_data = dst.get_data(); std::vector all_deps(depends.size() + 1); @@ -154,7 +153,7 @@ copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); sycl::event copy_for_reshape_event = - fn(exec_q, shift, src_nelems, src_nd, dst_nd, shape_strides, src_data, + fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, dst_data, all_deps); auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp index 09caddf824..32d41fc159 100644 --- a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp +++ b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp @@ -40,7 +40,6 @@ namespace py_internal extern std::pair copy_usm_ndarray_for_reshape(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, sycl::queue exec_q, const std::vector &depends = {}); diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 0000000000..d8ae3059b0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,267 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "copy_for_roll.hpp" +#include "dpctl4pybind11.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/type_dispatch.hpp" +#include + +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. + * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair +copy_usm_ndarray_for_roll(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + py::ssize_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = 
dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + size_t offset = + (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (both_c_contig || both_f_contig) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + constexpr py::ssize_t zero_offset = 0; + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape_ptr; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data, + dst_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto fn = copy_for_roll_strided_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + if (shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_for_roll_event); + auto ctx = exec_q.get_context(); + cgh.host_task( + [shape_strides, ctx]() { sycl::free(shape_strides, ctx); }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +void init_copy_for_roll_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector); + + using 
dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.hpp b/dpctl/tensor/libtensor/source/copy_for_roll.hpp new file mode 100644 index 0000000000..61f0b3baca --- /dev/null +++ b/dpctl/tensor/libtensor/source/copy_for_roll.hpp @@ -0,0 +1,51 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair +copy_usm_ndarray_for_roll(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 9b4ba6cdad..9decdf3a9b 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -37,6 +37,7 @@ #include "boolean_reductions.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" +#include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" #include "elementwise_functions.hpp" @@ -68,6 +69,10 @@ using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; +/* =========================== Copy for roll ============================= */ + +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll; + /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; @@ -120,6 +125,7 @@ void init_dispatch_vectors(void) using namespace dpctl::tensor::py_internal; init_copy_for_reshape_dispatch_vectors(); + init_copy_for_roll_dispatch_vectors(); init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_eye_ctor_dispatch_vectors(); @@ -221,6 +227,14 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " "number of elements using underlying 'C'-contiguous order for flat " + "traversal. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_copy_usm_ndarray_for_roll", ©_usm_ndarray_for_roll, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for flat " "traversal with shift. " "Returns a tuple of events: (ht_event, comp_event)", py::arg("src"), py::arg("dst"), py::arg("shift"), From 67cab69eb137d02fa686d0433c9e908cd6da2745 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 24 Aug 2023 09:52:36 -0500 Subject: [PATCH 10/44] Deploy _copy_usm_ndarray_for_roll Remove use of `shift=0` argument to `_copy_usm_ndarray_for_reshape` in _reshape.py Used `_copy_usm_ndarray_for_roll` in `roll` implementation. --- dpctl/tensor/_manipulation_functions.py | 3 +-- dpctl/tensor/_reshape.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 9406e386af..450bb83aaa 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -429,7 +429,7 @@ def roll(X, shift, axis=None): res = dpt.empty( X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue ) - hev, _ = ti._copy_usm_ndarray_for_reshape( + hev, _ = ti._copy_usm_ndarray_for_roll( src=X, dst=res, shift=shift, sycl_queue=X.sycl_queue ) hev.wait() @@ -550,7 +550,6 @@ def _concat_axis_None(arrays): hev, _ = ti._copy_usm_ndarray_for_reshape( src=src_, dst=res[fill_start:fill_end], - shift=0, sycl_queue=exec_q, ) fill_start = fill_end diff --git a/dpctl/tensor/_reshape.py b/dpctl/tensor/_reshape.py index ac4a04cac4..b363c063de 100644 --- a/dpctl/tensor/_reshape.py +++ b/dpctl/tensor/_reshape.py @@ -165,7 +165,7 @@ def reshape(X, shape, order="C", copy=None): ) if order == "C": hev, _ = _copy_usm_ndarray_for_reshape( - src=X, dst=flat_res, shift=0, sycl_queue=X.sycl_queue + src=X, dst=flat_res, sycl_queue=X.sycl_queue ) hev.wait() else: From f7eee1ebe8d5a0a2a17eb26c04bf3e05ee753fe2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 24 Aug 2023 13:28:11 -0500 Subject: [PATCH 11/44] Conversion from raw to multi_ptr should be done with address_space_cast We used `sycl::multi_ptr` constructor instead of `sycl::address_space_cast` previsously, and change in https://github.com/KhronosGroup/SYCL-Docs/pull/432 introduced `sycl::access:decorated::legacy` as the default which is deprecated in SYCL 2020 standard which highlighted the problem. In using `sycl::address_space_cast` we specify `sycl::access::decorated::yes`. 
--- .../include/kernels/copy_and_cast.hpp | 21 ++- .../kernels/elementwise_functions/common.hpp | 177 +++++++++--------- .../elementwise_functions/common_inplace.hpp | 65 ++++--- .../libtensor/include/kernels/where.hpp | 30 +-- 4 files changed, 148 insertions(+), 145 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index f1e63ccc60..33969ec24a 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -244,25 +244,26 @@ class ContigCopyFunctor if (base + n_vecs * vec_sz * sgSize < nelems && sgSize == max_sgSize) { - using src_ptrT = - sycl::multi_ptr; - using dst_ptrT = - sycl::multi_ptr; sycl::vec src_vec; sycl::vec dst_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - src_vec = - sg.load(src_ptrT(&src_p[base + it * sgSize])); + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &src_p[base + it * sgSize]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &dst_p[base + it * sgSize]); + + src_vec = sg.load(src_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; k++) { dst_vec[k] = fn(src_vec[k]); } - sg.store(dst_ptrT(&dst_p[base + it * sgSize]), - dst_vec); + sg.store(dst_multi_ptr, dst_vec); } } else { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 855d5479c1..797a7f2534 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -65,9 +65,6 @@ struct UnaryContigFunctor if constexpr (UnaryOperatorT::is_constant::value) { // value of operator is known to be a known constant constexpr resT const_val = UnaryOperatorT::constant_value; - using out_ptrT = - sycl::multi_ptr; auto sg = ndit.get_sub_group(); std::uint8_t sgSize = sg.get_local_range()[0]; @@ -80,8 +77,11 @@ struct UnaryContigFunctor sycl::vec res_vec(const_val); #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + sg.store(out_multi_ptr, res_vec); } } else { @@ -94,13 +94,6 @@ struct UnaryContigFunctor else if constexpr (UnaryOperatorT::supports_sg_loadstore::value && UnaryOperatorT::supports_vec::value) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; - auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_local_range()[0]; std::uint16_t max_sgSize = sg.get_max_local_range()[0]; @@ -113,10 +106,16 @@ struct UnaryContigFunctor #pragma unroll for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - x = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + x = sg.load(in_multi_ptr); sycl::vec res_vec = op(x); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + 
sg.store(out_multi_ptr, res_vec); } } else { @@ -141,23 +140,23 @@ struct UnaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (maxsgSize == sgSize)) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg_vec = sg.load(in_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { arg_vec[k] = op(arg_vec[k]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - arg_vec); + sg.store(out_multi_ptr, arg_vec); } } else { @@ -179,24 +178,24 @@ struct UnaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (maxsgSize == sgSize)) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg_vec = sg.load(in_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { res_vec[k] = op(arg_vec[k]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -365,28 +364,26 @@ struct BinaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg1_vec = - sg.load(in_ptrT1(&in1[base + it * sgSize])); - arg2_vec = - sg.load(in_ptrT2(&in2[base + it * sgSize])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[base + it * sgSize]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg1_vec = sg.load(in1_multi_ptr); + arg2_vec = sg.load(in2_multi_ptr); res_vec = op(arg1_vec, arg2_vec); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -407,32 +404,30 @@ struct BinaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg1_vec = - sg.load(in_ptrT1(&in1[base + it * sgSize])); - arg2_vec = - sg.load(in_ptrT2(&in2[base + it * sgSize])); + auto in1_multi_ptr = 
sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[base + it * sgSize]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg1_vec = sg.load(in1_multi_ptr); + arg2_vec = sg.load(in2_multi_ptr); #pragma unroll for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { res_vec[vec_id] = op(arg1_vec[vec_id], arg2_vec[vec_id]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -530,22 +525,24 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; - - const argT1 mat_el = sg.load(in_ptrT1(&mat[base])); - const argT2 vec_el = sg.load(in_ptrT2(&padded_vec[base % n1])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT1 mat_el = sg.load(in1_multi_ptr); + const argT2 vec_el = sg.load(in2_multi_ptr); resT res_el = op(mat_el, vec_el); - sg.store(res_ptrT(&res[base]), res_el); + sg.store(out_multi_ptr, res_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; @@ -592,22 +589,24 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; - - const argT2 mat_el = sg.load(in_ptrT2(&mat[base])); - const argT1 vec_el = sg.load(in_ptrT1(&padded_vec[base % n1])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT2 mat_el = sg.load(in2_multi_ptr); + const argT1 vec_el = sg.load(in1_multi_ptr); resT res_el = op(vec_el, mat_el); - sg.store(res_ptrT(&res[base]), res_el); + sg.store(out_multi_ptr, res_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index a41029b27c..505a40acc5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -76,24 +76,24 @@ struct BinaryInplaceContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using rhs_ptrT = - sycl::multi_ptr; - using lhs_ptrT = - sycl::multi_ptr; + sycl::vec arg_vec; 
sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = - sg.load(rhs_ptrT(&rhs[base + it * sgSize])); - res_vec = - sg.load(lhs_ptrT(&lhs[base + it * sgSize])); + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[base + it * sgSize]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[base + it * sgSize]); + + arg_vec = sg.load(rhs_multi_ptr); + res_vec = sg.load(lhs_multi_ptr); op(res_vec, arg_vec); - sg.store(lhs_ptrT(&lhs[base + it * sgSize]), - res_vec); + + sg.store(lhs_multi_ptr, res_vec); } } else { @@ -115,27 +115,25 @@ struct BinaryInplaceContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using rhs_ptrT = - sycl::multi_ptr; - using lhs_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = - sg.load(rhs_ptrT(&rhs[base + it * sgSize])); - res_vec = - sg.load(lhs_ptT(&lhs[base + it * sgSize])); + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[base + it * sgSize]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[base + it * sgSize]); + + arg_vec = sg.load(rhs_multi_ptr); + res_vec = sg.load(lhs_multi_ptr); #pragma unroll for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { op(res_vec[vec_id], arg_vec[vec_id]); } - sg.store(lhs_ptrT(&lhs[base + it * sgSize]), - res_vec); + sg.store(lhs_multi_ptr, res_vec); } } else { @@ -223,19 +221,20 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); - const argT vec_el = sg.load(in_ptrT(&padded_vec[base % n1])); - resT mat_el = sg.load(res_ptrT(&mat[base])); + const argT vec_el = sg.load(in_multi_ptr); + resT mat_el = sg.load(out_multi_ptr); op(mat_el, vec_el); - sg.store(res_ptrT(&mat[base]), mat_el); + sg.store(out_multi_ptr, mat_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp index 67ce2ca1f0..9da5466dbe 100644 --- a/dpctl/tensor/libtensor/include/kernels/where.hpp +++ b/dpctl/tensor/libtensor/include/kernels/where.hpp @@ -100,15 +100,6 @@ class WhereContigFunctor if (base + n_vecs * vec_sz * sgSize < nelems && sgSize == max_sgSize) { - using dst_ptrT = - sycl::multi_ptr; - using x_ptrT = - sycl::multi_ptr; - using cond_ptrT = - sycl::multi_ptr; sycl::vec dst_vec; sycl::vec x1_vec; sycl::vec x2_vec; @@ -117,14 +108,27 @@ class WhereContigFunctor #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { auto idx = base + it * sgSize; - x1_vec = sg.load(x_ptrT(&x1_p[idx])); - x2_vec = sg.load(x_ptrT(&x2_p[idx])); - cond_vec = sg.load(cond_ptrT(&cond_p[idx])); + auto x1_multi_ptr = sycl::address_space_cast< + 
sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x1_p[idx]); + auto x2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x2_p[idx]); + auto cond_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cond_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + x1_vec = sg.load(x1_multi_ptr); + x2_vec = sg.load(x2_multi_ptr); + cond_vec = sg.load(cond_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k]; } - sg.store(dst_ptrT(&dst_p[idx]), dst_vec); + sg.store(dst_multi_ptr, dst_vec); } } else { From e0aea2896533fba5af7b66d46270622f370a989b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 25 Aug 2023 04:51:57 -0500 Subject: [PATCH 12/44] Resolved SYCL-2020 deprecation warning ``` In file included from ~/dpctl/dpctl/tensor/libtensor/source/elementwise_functions.cpp:56: ~/dpctl/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp:118:42: warning: 'sincos' is deprecated: SYCL builtin functions with raw pointer arguments have been deprecated. Please use multi_ptr. [-Wdeprecated-declarations] 118 | const realT sinY_val = sycl::sincos(y, &cosY_val); ``` The resolution is to convert raw pointer to multi-pointer using `sycl::address_space_cast`. --- .../include/kernels/elementwise_functions/expm1.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index b996a6d0ec..3e69aa5464 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -115,7 +115,10 @@ template struct Expm1Functor // x, y finite numbers realT cosY_val; - const realT sinY_val = sycl::sincos(y, &cosY_val); + auto cosY_val_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cosY_val); + const realT sinY_val = sycl::sincos(y, cosY_val_multi_ptr); const realT sinhalfY_val = std::sin(y / 2); const realT res_re = From 29fd9e5b109fe18ad5fe3af4d69874c1b8a72e51 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 25 Aug 2023 08:31:43 -0500 Subject: [PATCH 13/44] Address NumPy 1.25 deprecation warnings Ensure that ndarray that we converted usm_ndarray single element instance into is 0d before calling __int__, __float__, __complex__, __index__. 
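For reference, a minimal reproducer of the warning being addressed, together with the 0d-view
workaround this change adopts (assumes NumPy >= 1.25; illustration only, not part of the patch):

```
import warnings

import numpy as np

a = np.array([3], dtype="i4")  # single element, but 1-dimensional
with warnings.catch_warnings():
    warnings.simplefilter("error")
    try:
        int(a)  # DeprecationWarning under NumPy 1.25
    except DeprecationWarning as w:
        print("deprecated:", w)

v = a.view(a.dtype)
v.shape = tuple()  # reinterpret as a 0d array, as _as_zero_dim_ndarray does
print(int(v))      # converts cleanly, no warning
```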
--- dpctl/tensor/_usmarray.pyx | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index d4c0a69dfd..d6e15f9339 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -52,6 +52,15 @@ cdef class InternalUSMArrayError(Exception): pass +cdef object _as_zero_dim_ndarray(object usm_ary): + "Convert size-1 array to NumPy 0d array" + mem_view = dpmem.as_usm_memory(usm_ary) + host_buf = mem_view.copy_to_host() + view = host_buf.view(usm_ary.dtype) + view.shape = tuple() + return view + + cdef class usm_ndarray: """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \ offset=0, order="C", buffer_ctor_kwargs=dict(), \ @@ -840,9 +849,7 @@ cdef class usm_ndarray: def __bool__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__bool__() if self.size == 0: @@ -857,9 +864,7 @@ cdef class usm_ndarray: def __float__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__float__() raise ValueError( @@ -868,9 +873,7 @@ cdef class usm_ndarray: def __complex__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__complex__() raise ValueError( @@ -879,9 +882,7 @@ cdef class usm_ndarray: def __int__(self): if self.size == 1: - mem_view = dpmem.as_usm_memory(self) - host_buf = mem_view.copy_to_host() - view = host_buf.view(self.dtype) + view = _as_zero_dim_ndarray(self) return view.__int__() raise ValueError( From 60d1f26c836b1301e792bbaaf15989e1eb779832 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 25 Aug 2023 09:17:35 -0500 Subject: [PATCH 14/44] Addressed Very high Coverity issue Ensured that `create_property_list` always returns an not-null unique pointer by creating an default-constructed property_list for the fall-through. With this change we no longer need to branches for call to sycl::queue constructor, since propList is always available. 
--- .../source/dpctl_sycl_queue_interface.cpp | 26 +++++-------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp index 387374a032..703f58cb7b 100644 --- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp @@ -121,7 +121,7 @@ bool set_kernel_arg(handler &cgh, return arg_set; } -std::unique_ptr create_property_list(int properties) +static std::unique_ptr create_property_list(int properties) { std::unique_ptr propList; int _prop = properties; @@ -143,6 +143,9 @@ std::unique_ptr create_property_list(int properties) propList = std::make_unique(sycl::property::queue::in_order()); } + else { + propList = std::make_unique(); + } if (_prop) { std::stringstream ss; @@ -185,7 +188,7 @@ DPCTLQueue_Create(__dpctl_keep const DPCTLSyclContextRef CRef, } auto propList = create_property_list(properties); - if (propList && handler) { + if (handler) { try { auto Queue = new queue(*ctx, *dev, DPCTL_AsyncErrorHandler(handler), *propList); @@ -194,26 +197,9 @@ DPCTLQueue_Create(__dpctl_keep const DPCTLSyclContextRef CRef, error_handler(e, __FILE__, __func__, __LINE__); } } - else if (properties) { - try { - auto Queue = new queue(*ctx, *dev, *propList); - q = wrap(Queue); - } catch (std::exception const &e) { - error_handler(e, __FILE__, __func__, __LINE__); - } - } - else if (handler) { - try { - auto Queue = - new queue(*ctx, *dev, DPCTL_AsyncErrorHandler(handler)); - q = wrap(Queue); - } catch (std::exception const &e) { - error_handler(e, __FILE__, __func__, __LINE__); - } - } else { try { - auto Queue = new queue(*ctx, *dev); + auto Queue = new queue(*ctx, *dev, *propList); q = wrap(Queue); } catch (std::exception const &e) { error_handler(e, __FILE__, __func__, __LINE__); From a653eb34807e00d1ab5025fd1dc28b955754e288 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 25 Aug 2023 12:04:41 -0500 Subject: [PATCH 15/44] Avoid calling int/float/complex/operator.index on 1d ndarray This is to address NumPy 1.25 deprecation warnings. 
--- dpctl/tests/test_usm_ndarray_ctor.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index 8e71f3931d..82b4303460 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -239,8 +239,9 @@ def test_copy_scalar_with_func(func, shape, dtype): X = dpt.usm_ndarray(shape, dtype=dtype) except dpctl.SyclDeviceCreationError: pytest.skip("No SYCL devices available") - Y = np.arange(1, X.size + 1, dtype=dtype).reshape(shape) - X.usm_data.copy_from_host(Y.reshape(-1).view("|u1")) + Y = np.arange(1, X.size + 1, dtype=dtype) + X.usm_data.copy_from_host(Y.view("|u1")) + Y.shape = tuple() assert func(X) == func(Y) @@ -254,8 +255,9 @@ def test_copy_scalar_with_method(method, shape, dtype): X = dpt.usm_ndarray(shape, dtype=dtype) except dpctl.SyclDeviceCreationError: pytest.skip("No SYCL devices available") - Y = np.arange(1, X.size + 1, dtype=dtype).reshape(shape) - X.usm_data.copy_from_host(Y.reshape(-1).view("|u1")) + Y = np.arange(1, X.size + 1, dtype=dtype) + X.usm_data.copy_from_host(Y.view("|u1")) + Y.shape = tuple() assert getattr(X, method)() == getattr(Y, method)() From 55bb70f5c124fde10a0f627a3819fcfee89aaa35 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 27 Aug 2023 07:31:41 -0500 Subject: [PATCH 16/44] Turn comparison call into assertion Moved the assertion about comparison with generic types before other assertions. This change is made in reference to Coverity scan issue. --- dpctl/tests/test_usm_ndarray_ctor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index 82b4303460..72f5aabebb 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -2136,6 +2136,8 @@ def test_flags(): except dpctl.SyclDeviceCreationError: pytest.skip("No SYCL devices available") f = x.flags + # check comparison with generic types + assert f != Ellipsis f.__repr__() assert f.c_contiguous == f["C"] assert f.f_contiguous == f["F"] @@ -2144,8 +2146,6 @@ def test_flags(): assert f.forc == f["FORC"] assert f.fnc == f["FNC"] assert f.writable == f["W"] - # check comparison with generic types - f == Ellipsis def test_asarray_uint64(): From 02cfdb463f72bc9e9e0c10dfd57a059482d5045b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 27 Aug 2023 11:30:22 -0500 Subject: [PATCH 17/44] Initialized env to empty dictionary instead of None This resolves two Coverity reported issues. 
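A sketch of the kind of pattern Coverity flags here (the downstream use shown is hypothetical,
not the actual script code): once `env` starts out as a dict, later code may update it or pass
it along on every path without a None check.

```
import os


def build_env(bin_llvm=None):
    env = dict()  # was: env = None
    if bin_llvm:
        env = {
            "PATH": ":".join((os.environ.get("PATH", ""), bin_llvm)),
        }
    # safe on every path, since env is always a dictionary
    env.setdefault("VERBOSE", "1")  # hypothetical extra setting
    return env


print(build_env())
print(build_env("/opt/llvm/bin"))  # hypothetical path
```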
--- scripts/gen_coverage.py | 2 +- scripts/gen_docs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gen_coverage.py b/scripts/gen_coverage.py index 41b30d00d9..93328f1b85 100644 --- a/scripts/gen_coverage.py +++ b/scripts/gen_coverage.py @@ -57,7 +57,7 @@ def run( "-DDPCTL_BUILD_CAPI_TESTS=ON", "-DDPCTL_COVERAGE_REPORT_OUTPUT_DIR=" + setup_dir, ] - env = None + env = dict() if bin_llvm: env = { "PATH": ":".join((os.environ.get("PATH", ""), bin_llvm)), diff --git a/scripts/gen_docs.py b/scripts/gen_docs.py index 377b37f6d8..9e2285a477 100644 --- a/scripts/gen_docs.py +++ b/scripts/gen_docs.py @@ -59,7 +59,7 @@ def run( cmake_args.append("-DDPCTL_ENABLE_DOXYREST=ON") cmake_args.append("-DDoxyrest_DIR=" + doxyrest_dir) - env = None + env = dict() if bin_llvm: env = { "PATH": ":".join((os.environ.get("PATH", ""), bin_llvm)), From 29c52b48a0f6a7b0f8fc1c6ede75768f06866669 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 27 Aug 2023 11:58:50 -0500 Subject: [PATCH 18/44] Addressed coverity issue about handling bools in floor_divide --- .../elementwise_functions/floor_divide.hpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index 268c679f00..32e97df58d 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -57,7 +57,12 @@ struct FloorDivideFunctor resT operator()(const argT1 &in1, const argT2 &in2) { - if constexpr (std::is_integral_v || std::is_integral_v) { + if constexpr (std::is_same_v && + std::is_same_v) { + return (in2) ? static_cast(in1) : resT(0); + } + else if constexpr (std::is_integral_v || + std::is_integral_v) { if (in2 == argT2(0)) { return resT(0); } @@ -81,7 +86,16 @@ struct FloorDivideFunctor sycl::vec operator()(const sycl::vec &in1, const sycl::vec &in2) { - if constexpr (std::is_integral_v) { + if constexpr (std::is_same_v && + std::is_same_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + res[i] = (in2[i]) ? static_cast(in1[i]) : resT(0); + } + return res; + } + else if constexpr (std::is_integral_v) { sycl::vec res; #pragma unroll for (int i = 0; i < vec_sz; ++i) { From b2cd5c18e6bcd9f3e81b19b1c39d798e0f3836b6 Mon Sep 17 00:00:00 2001 From: PiotrekB416 Date: Mon, 28 Aug 2023 00:08:13 +0200 Subject: [PATCH 19/44] Add support for lowercase order in tensor.copy and tensor.astype --- dpctl/tensor/_copy_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 3eae29f057..321c1393c4 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -517,6 +517,7 @@ def copy(usm_ary, order="K"): - "K": match the layout of `usm_ary` as closely as possible. """ + order = order.upper() if not isinstance(usm_ary, dpt.usm_ndarray): return TypeError( f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" @@ -581,6 +582,7 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): A view can be returned, if possible, when `copy=False` is used. 
""" + order = order.upper() if not isinstance(usm_ary, dpt.usm_ndarray): return TypeError( f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" From 5390456db2b1958f44e89ad20f57c65384d6a2bb Mon Sep 17 00:00:00 2001 From: PiotrekB416 Date: Mon, 28 Aug 2023 00:18:09 +0200 Subject: [PATCH 20/44] Refactored --- dpctl/tensor/_copy_utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index 321c1393c4..ad5b956851 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -517,7 +517,11 @@ def copy(usm_ary, order="K"): - "K": match the layout of `usm_ary` as closely as possible. """ - order = order.upper() + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() if not isinstance(usm_ary, dpt.usm_ndarray): return TypeError( f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" @@ -582,16 +586,15 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True): A view can be returned, if possible, when `copy=False` is used. """ - order = order.upper() if not isinstance(usm_ary, dpt.usm_ndarray): return TypeError( f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" ) - if not isinstance(order, str) or order not in ["A", "C", "F", "K"]: + if len(order) == 0 or order[0] not in "KkAaCcFf": raise ValueError( - "Unrecognized value of the order keyword. " - "Recognized values are 'A', 'C', 'F', or 'K'" + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." ) + order = order[0].upper() ary_dtype = usm_ary.dtype target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): From 8a9ebe650dca967453cd1745cb771e53b3bb5ace Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 27 Aug 2023 23:04:37 -0500 Subject: [PATCH 21/44] Only attempt to leave comments on PRs from this repo Also adds a step to output array-api-test summary into the log (step which works for PRs regardless whether they are opened from a fork, or from a branch in this repo). --- .github/workflows/conda-package.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 9298742ac6..d0ad1c8205 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -608,7 +608,11 @@ jobs: echo "Array API standard conformance tests failed to run for dpctl=$PACKAGE_VERSION." exit 1 fi + - name: Output API summary + shell: bash -l {0} + run: echo "::notice ${{ env.MESSAGE }}" - name: Post result to PR + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }} uses: mshick/add-pr-comment@v1 with: message: | From e06bf4044be256ab9cbe4fb2d0db85d3820bf0ea Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 27 Aug 2023 23:06:25 -0500 Subject: [PATCH 22/44] Generate docs made friendly for PRs from forks Only publish sources and comments with link to them for PRs opened from branches in this repo. PRs from forks would have artifacts with rendered docs uploaded. 
--- .github/workflows/generate-docs.yml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml index bae12e4373..84bbed4622 100644 --- a/.github/workflows/generate-docs.yml +++ b/.github/workflows/generate-docs.yml @@ -76,7 +76,7 @@ jobs: mv ../cmake-install/docs/docs ~/docs git clean -dfx - name: Publish docs - if: ${{ github.ref == 'refs/heads/master' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.ref == 'refs/heads/master' }} shell: bash -l {0} run: | git remote add tokened_docs https://IntelPython:${{ secrets.GITHUB_TOKEN }}@github.com/IntelPython/dpctl.git @@ -91,8 +91,15 @@ jobs: git config --global user.email 'github-actions[doc-deploy-bot]@users.noreply.github.com' git commit -m "Latest docs." git push tokened_docs gh-pages + - name: Save built docs as an artifact + if: ${{ github.event.pull_request && github.event.pull_request.head.repo.fork && github.event.action != 'closed'}} + uses: actions/upload-artifact@v3 + with: + name: ${{ env.PACKAGE_NAME }} rendered documentation + path: ~/docs + - name: Publish pull-request docs - if: ${{ github.event.pull_request && github.event.action != 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action != 'closed' }} env: PR_NUM: ${{ github.event.number }} shell: bash -l {0} @@ -111,7 +118,7 @@ jobs: git commit -m "Docs for pull request ${PR_NUM}" git push tokened_docs gh-pages - name: Unpublish pull-request docs - if: ${{ github.event.pull_request && github.event.action == 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action == 'closed' }} env: PR_NUM: ${{ github.event.number }} shell: bash -l {0} @@ -128,7 +135,7 @@ jobs: git commit -m "Removing docs for closed pull request ${PR_NUM}" git push tokened_docs gh-pages - name: Comment with URL to published pull-request docs - if: ${{ github.event.pull_request && github.event.action != 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action != 'closed' }} env: PR_NUM: ${{ github.event.number }} uses: mshick/add-pr-comment@v1 @@ -138,7 +145,7 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} repo-token-user-login: 'github-actions[bot]' - name: Comment with URL about removal of PR docs - if: ${{ github.event.pull_request && github.event.action == 'closed' }} + if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork && github.event.action == 'closed' }} env: PR_NUM: ${{ github.event.number }} uses: mshick/add-pr-comment@v1 From f49889c4d263b69dd8cad0d7a4a866f119865344 Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Mon, 28 Aug 2023 09:06:09 -0700 Subject: [PATCH 23/44] Fixes boolean indexing for strided masks (#1370) * Corrected boolean indexing cumsum - The cumulative sum was being calculated incorrectly -- the offset from stride simplification was unused and the result was incorrect for some cases with non-C-contiguous strides - To fix this, new functions ``compact_iteration_space`` and complementary function ``compact_iteration`` have been implemented * Add a test for nonzero where dimension compacting occurs * Added tests for the corrected behavior of boolean indexing and nonzero * Removed dead branch in py_mask_positions Compacting strides can reduce dimensionality of the array, but it can 
not turn an input that is not already C-contiguous into a C-contiguous one. Hence the branch checking if the input became C-contiguous after compacting is effectively dead. * Added docstring for compact_iteration --------- Co-authored-by: Oleksandr Pavlyk --- .../kernels/boolean_advanced_indexing.hpp | 4 +- .../libtensor/include/utils/strided_iters.hpp | 49 +++++++++++++++++++ .../source/boolean_advanced_indexing.cpp | 22 +++------ .../source/simplify_iteration_space.cpp | 39 +++++++++++++++ .../source/simplify_iteration_space.hpp | 7 +++ dpctl/tests/test_usm_ndarray_indexing.py | 45 +++++++++++++++++ 6 files changed, 147 insertions(+), 19 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index bb2ddc5ad6..43c546860b 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -424,7 +424,6 @@ typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( size_t, const char *, int, - py::ssize_t, const py::ssize_t *, char *, std::vector const &); @@ -434,7 +433,6 @@ size_t mask_positions_strided_impl(sycl::queue q, size_t n_elems, const char *mask, int nd, - py::ssize_t input_offset, const py::ssize_t *shape_strides, char *cumsum, std::vector const &depends = {}) @@ -444,7 +442,7 @@ size_t mask_positions_strided_impl(sycl::queue q, cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); size_t wg_size = 128; - StridedIndexer strided_indexer{nd, input_offset, shape_strides}; + StridedIndexer strided_indexer{nd, 0, shape_strides}; NonZeroIndicator non_zero_indicator{}; sycl::event comp_ev = diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp index f654087281..bd174e3f90 100644 --- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp +++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp @@ -909,6 +909,55 @@ contract_iter4(vecT shape, out_strides3, disp3, out_strides4, disp4); } +/* + For purposes of iterating over elements of an array with `shape` and + strides `strides` given as pointers `compact_iteration(nd, shape, strides)` + may modify memory and returns the new length of the array. + + The new shape and new strides `(new_shape, new_strides)` are such that + iterating over them will traverse the same elements in the same order, + possibly with reduced dimensionality. 
+ */ +template +int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides) +{ + if (nd < 2) + return nd; + + bool contractable = true; + for (int i = 0; i < nd; ++i) { + if (strides[i] < 0) { + contractable = false; + } + } + + int nd_ = nd; + while (contractable) { + bool changed = false; + for (int i = 0; i + 1 < nd_; ++i) { + StridesTy str = strides[i + 1]; + StridesTy jump = strides[i] - (shape[i + 1] - 1) * str; + + if (jump == str) { + changed = true; + shape[i] *= shape[i + 1]; + for (int j = i; j < nd_; ++j) { + strides[j] = strides[j + 1]; + } + for (int j = i + 1; j + 1 < nd_; ++j) { + shape[j] = shape[j + 1]; + } + --nd_; + break; + } + } + if (!changed) + break; + } + + return nd_; +} + } // namespace strides } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index 0dd63fe973..d37967daef 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -205,24 +205,14 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, auto const &strides_vector = mask.get_strides_vector(); using shT = std::vector; - shT simplified_shape; - shT simplified_strides; - py::ssize_t offset(0); + shT compact_shape; + shT compact_strides; int mask_nd = mask.get_ndim(); int nd = mask_nd; - dpctl::tensor::py_internal::simplify_iteration_space_1( - nd, shape, strides_vector, simplified_shape, simplified_strides, - offset); - - if (nd == 1 && simplified_strides[0] == 1) { - auto fn = (use_i32) - ? mask_positions_contig_i32_dispatch_vector[mask_typeid] - : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); - } + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); // Strided implementation auto strided_fn = @@ -232,7 +222,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, using dpctl::tensor::offset_utils::device_allocate_and_pack; const auto &ptr_size_event_tuple = device_allocate_and_pack( - exec_q, host_task_events, simplified_shape, simplified_strides); + exec_q, host_task_events, compact_shape, compact_strides); py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); if (shape_strides == nullptr) { sycl::event::wait(host_task_events); @@ -253,7 +243,7 @@ size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, dependent_events.insert(dependent_events.end(), depends.begin(), depends.end()); - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, offset, + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, cumsum_data, dependent_events); sycl::event::wait(host_task_events); diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index 2fb2d6078e..31f5250f8a 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -369,6 +369,45 @@ void simplify_iteration_space_4( } } +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + 
compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + py::ssize_t _ravel_multi_index_c(std::vector const &mi, std::vector const &shape) { diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 1bd8ff5aa0..275129ad5c 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -90,6 +90,13 @@ void simplify_iteration_space_4(int &, py::ssize_t &, py::ssize_t &); +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + py::ssize_t _ravel_multi_index_c(std::vector const &, std::vector const &); py::ssize_t _ravel_multi_index_f(std::vector const &, diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py index 9d166226e7..9183226be2 100644 --- a/dpctl/tests/test_usm_ndarray_indexing.py +++ b/dpctl/tests/test_usm_ndarray_indexing.py @@ -1044,6 +1044,19 @@ def test_extract_all_1d(): res2 = dpt.extract(sel, x) assert (dpt.asnumpy(res2) == expected_res).all() + # test strided case + x = dpt.arange(15, dtype="i4") + sel_np = np.zeros(15, dtype="?") + np.put(sel_np, np.random.choice(sel_np.size, size=7), True) + sel = dpt.asarray(sel_np) + + res = x[sel[::-1]] + expected_res = dpt.asnumpy(x)[sel_np[::-1]] + assert (dpt.asnumpy(res) == expected_res).all() + + res2 = dpt.extract(sel[::-1], x) + assert (dpt.asnumpy(res2) == expected_res).all() + def test_extract_all_2d(): get_queue_or_skip() @@ -1287,6 +1300,38 @@ def test_nonzero(): assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all() +def test_nonzero_f_contig(): + "See gh-1370" + get_queue_or_skip + + mask = dpt.zeros((5, 5), dtype="?", order="F") + mask[2, 3] = True + + expected_res = (2, 3) + res = dpt.nonzero(mask) + + assert expected_res == res + assert mask[res] + + +def test_nonzero_compacting(): + """See gh-1370. 
+ Test with input where dimensionality + of iteration space is compacted from 3d to 2d + """ + get_queue_or_skip + + mask = dpt.zeros((5, 5, 5), dtype="?", order="F") + mask[3, 2, 1] = True + mask_view = mask[..., :3] + + expected_res = (3, 2, 1) + res = dpt.nonzero(mask_view) + + assert expected_res == res + assert mask_view[res] + + def test_assign_scalar(): get_queue_or_skip() x = dpt.arange(-5, 5, dtype="i8") From dd6a01473a9355a39384d6a3f41003c1aa378ff2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 28 Aug 2023 12:38:27 -0500 Subject: [PATCH 24/44] Update libsyclinterface/source/dpctl_sycl_queue_interface.cpp Remove unneeded static keyword in definition of create_property_list function in anonymous namespace --- libsyclinterface/source/dpctl_sycl_queue_interface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp index 703f58cb7b..ce318ce37e 100644 --- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp @@ -121,7 +121,7 @@ bool set_kernel_arg(handler &cgh, return arg_set; } -static std::unique_ptr create_property_list(int properties) +std::unique_ptr create_property_list(int properties) { std::unique_ptr propList; int _prop = properties; From a81d9c809cc4bd36cebcd7a78821cb8cb0d327a7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 22 Aug 2023 01:55:16 -0500 Subject: [PATCH 25/44] Improved performance of reduction kernel with atomics 1. Contig implementation kernel gets a dedicated name (easier to spot in the output of onetrace) 2. Increase work-group multiple 3. Change the order in which workgroups tile the array from 'along reduction axis' moves fastest to 'along iteration axis' moves fastests. 
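To illustrate item 3 with made-up sizes (a sketch only, stated at work-group granularity),
the linear group id is now decomposed so that the iteration index varies fastest, i.e.
neighboring work-groups handle neighboring outputs:

```
iter_gws, n_reduction_groups = 4, 3  # hypothetical launch geometry

for g in range(iter_gws * n_reduction_groups):
    old = (g // n_reduction_groups, g % n_reduction_groups)  # reduction moves fastest
    new = (g % iter_gws, g // iter_gws)                      # iteration moves fastest
    print(g, "old (iter, batch):", old, "new (iter, batch):", new)
```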
This last change contributes to significant performance improvement: ``` ================= Before change In [1]: import dpctl.tensor as dpt In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f4")), (1282200, 128)) In [3]: %time y = dpt.sum(x, axis=0) CPU times: user 309 ms, sys: 128 ms, total: 437 ms Wall time: 473 ms In [4]: %time y = dpt.sum(x, axis=0) CPU times: user 132 ms, sys: 160 ms, total: 292 ms Wall time: 316 ms In [5]: %time y = dpt.sum(x, axis=0) CPU times: user 104 ms, sys: 185 ms, total: 289 ms Wall time: 312 ms ``` ``` ===== After change In [1]: import dpctl.tensor as dpt In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f4")), (1282200, 128)) In [3]: %time y = dpt.sum(x, axis=0) CPU times: user 150 ms, sys: 32.9 ms, total: 183 ms Wall time: 198 ms In [4]: %time y = dpt.sum(x, axis=0) CPU times: user 20 ms, sys: 22.7 ms, total: 42.7 ms Wall time: 49.4 ms In [5]: %time y = dpt.sum(x, axis=0) CPU times: user 10.2 ms, sys: 28.9 ms, total: 39.1 ms Wall time: 41.4 ms In [6]: %time y = dpt.sum(x, axis=0) CPU times: user 23 ms, sys: 18 ms, total: 41 ms Wall time: 43.5 ms ``` --- .../libtensor/include/kernels/reductions.hpp | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index c8aae0a3b9..1257574016 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -146,9 +146,9 @@ struct ReductionOverGroupWithAtomicFunctor void operator()(sycl::nd_item<1> it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t iter_gid = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg @@ -204,14 +204,6 @@ struct ReductionOverGroupWithAtomicFunctor } } } - -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const - { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; - } }; typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( @@ -241,6 +233,9 @@ class sum_reduction_seq_strided_krn; template class sum_reduction_seq_contig_krn; +template +class sum_reduction_over_group_with_atomics_contig_krn; + using dpctl::tensor::sycl_utils::choose_workgroup_size; template @@ -417,7 +412,7 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<2>(reduction_nelems, sg_sizes); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); if (reduction_nelems < wg) { sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -499,9 +494,10 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + using KernelName = + class sum_reduction_over_group_with_atomics_contig_krn< + 
argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), From 8ecb43b4c457f0e18e3bb98d2cca3b8ab29b2166 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 23 Aug 2023 04:04:24 -0500 Subject: [PATCH 26/44] Introduce reduce_over_axis0 kernel for contiguous inputs This achieves additional savings over the prior commit: ``` In [1]: import dpctl.tensor as dpt In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f4")), (1282200, 128)) In [3]: %time y = dpt.sum(x, axis=0) CPU times: user 136 ms, sys: 9.52 ms, total: 145 ms Wall time: 158 ms In [4]: %time y = dpt.sum(x, axis=0) CPU times: user 18.8 ms, sys: 17.3 ms, total: 36.1 ms Wall time: 42 ms In [5]: %time y = dpt.sum(x, axis=0) CPU times: user 19.2 ms, sys: 16.9 ms, total: 36.1 ms Wall time: 38.4 ms In [6]: %time y = dpt.sum(x, axis=0) CPU times: user 1.69 ms, sys: 35.2 ms, total: 36.9 ms Wall time: 39.4 ms In [7]: quit ``` Prior to this the wall time stood at 49 ms. --- .../libtensor/include/kernels/reductions.hpp | 131 +++++++++++++++++- .../libtensor/source/sum_reductions.cpp | 112 +++++++++++---- 2 files changed, 211 insertions(+), 32 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 1257574016..ca9d8cc664 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -234,7 +234,10 @@ template class sum_reduction_seq_contig_krn; template -class sum_reduction_over_group_with_atomics_contig_krn; +class sum_reduction_axis0_over_group_with_atomics_contig_krn; + +template +class sum_reduction_axis1_over_group_with_atomics_contig_krn; using dpctl::tensor::sycl_utils::choose_workgroup_size; @@ -390,7 +393,7 @@ typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( /* @brief Reduce rows in a matrix */ template -sycl::event sum_reduction_over_group_with_atomics_contig_impl( +sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -458,11 +461,11 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( RowsIndexerT, NoOpIndexerT>; using ReductionIndexerT = NoOpIndexerT; - RowsIndexerT columns_indexer{ + RowsIndexerT rows_indexer{ 0, static_cast(iter_nelems), static_cast(reduction_nelems)}; NoOpIndexerT result_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, result_indexer}; ReductionIndexerT reduction_indexer{}; @@ -495,7 +498,102 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl( auto localRange = sycl::range<1>{wg}; using KernelName = - class sum_reduction_over_group_with_atomics_contig_krn< + class sum_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +/* @brief Reduce rows in a matrix */ +template +sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. 
of cols in a matrix + // when reducing over cols) + size_t reduction_nelems, // size of each reduction (length of cols, i.e. + // number of rows) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + constexpr size_t preferrered_reductions_per_wi = 8; + size_t reductions_per_wi = + (reduction_nelems < preferrered_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + if (reduction_groups > 1) { + const size_t &max_wg = + d.get_info(); + + if (reduction_nelems < preferrered_reductions_per_wi * max_wg) { + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + } + } + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = + class sum_reduction_axis0_over_group_with_atomics_contig_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; @@ -1075,7 +1173,25 @@ struct SumOverAxisTempsStridedFactory }; template -struct SumOverAxisAtomicContigFactory +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_axis1_over_group_with_atomics_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory { fnT get() const { @@ -1083,7 +1199,8 @@ struct SumOverAxisAtomicContigFactory srcTy, dstTy>::is_defined) { return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_contig_impl; + sum_reduction_axis0_over_group_with_atomics_contig_impl; } else { return nullptr; diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp index 3502a81a0e..7628813c6d 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ b/dpctl/tensor/libtensor/source/sum_reductions.cpp @@ -88,8 +88,11 @@ static sum_reduction_strided_impl_fn_ptr using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; 
static sum_reduction_contig_impl_fn_ptr - sum_over_axis_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static sum_reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; std::pair py_sum_over_axis( dpctl::tensor::usm_ndarray src, @@ -194,8 +197,30 @@ std::pair py_sum_over_axis( if ((is_src_c_contig && is_dst_c_contig) || (is_src_f_contig && dst_nelems == 1)) { - auto fn = sum_over_axis_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; + auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event sum_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, sum_over_axis_contig_ev); + } + } + else if (is_src_f_contig & is_dst_c_contig) { + auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; if (fn != nullptr) { size_t iter_nelems = dst_nelems; @@ -271,27 +296,58 @@ std::pair py_sum_over_axis( iteration_src_offset, iteration_dst_offset); } - if (supports_atomics && (reduction_nd == 1) && - (simplified_reduction_src_strides[0] == 1) && (iteration_nd == 1) && - ((simplified_iteration_shape[0] == 1) || - ((simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems)))) - { - auto fn = - sum_over_axis_contig_atomic_dispatch_table[src_typeid][dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; + if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + sycl::event sum_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, iteration_dst_offset, - reduction_src_offset, depends); + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + sum_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] + [dst_typeid]; + if (fn != nullptr) { + 
sycl::event sum_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {sum_over_axis0_contig_ev}); - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); + return std::make_pair(keep_args_event, + sum_over_axis0_contig_ev); + } } } @@ -451,11 +507,17 @@ void populate_sum_over_axis_dispatch_table(void) dtb2; dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); - using dpctl::tensor::kernels::SumOverAxisAtomicContigFactory; + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; DispatchTableBuilder + SumOverAxis1AtomicContigFactory, num_types> dtb3; - dtb3.populate_dispatch_table(sum_over_axis_contig_atomic_dispatch_table); + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); } namespace py = pybind11; From c79445f6439ebad1091ffdf5ef3077a6ad26d915 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 26 Aug 2023 01:53:21 -0500 Subject: [PATCH 27/44] Remove logic to select max_work_group_size The logic was misguided, and based on the idea that if using max-work-group-size can lead to launching just a single work-group, then we can reduce everything within the work-group and not use atomics altogether. This lead to problems on CPU, where max-work-group-size is 8192, and max-work-group size was selected, but the total number of work-groups launched was high due to large iteration space size, and this resulted in severe underutilization of the device (low ocupancy). 
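A rough sketch of the effect with illustrative (made-up) sizes; the helper below only mirrors
the work-group selection logic being removed and is not dpctl code:

```
from math import ceil


def launch(iter_nelems, reduction_nelems, wg, use_removed_branch,
           max_wg=8192, preferred=8):
    # mirrors the reductions_per_wi / reduction_groups selection in the kernel
    if reduction_nelems < preferred * wg:
        reductions_per_wi = max(1, ceil(reduction_nelems / wg))
    else:
        reductions_per_wi = preferred
    reduction_groups = ceil(reduction_nelems / (reductions_per_wi * wg))
    if use_removed_branch and reduction_groups > 1 and \
            reduction_nelems < preferred * max_wg:
        wg = max_wg
        reductions_per_wi = max(1, ceil(reduction_nelems / wg))
        reduction_groups = ceil(reduction_nelems / (reductions_per_wi * wg))
    return iter_nelems * reduction_groups * wg, wg


# hypothetical case: many outputs, modest reduction length, CPU max_wg of 8192
print(launch(10**6, 4096, wg=256, use_removed_branch=True))
# -> (8192000000, 8192): each 8192-item group has only 4096 elements to reduce
print(launch(10**6, 4096, wg=256, use_removed_branch=False))
# -> (512000000, 256): launch size tracks the actual amount of work
```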
--- .../libtensor/include/kernels/reductions.hpp | 42 ------------------- 1 file changed, 42 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index ca9d8cc664..0480fb9037 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -342,20 +342,6 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); - if (reduction_groups > 1) { - const size_t &max_wg = - d.get_info(); - - if (reduction_nelems < preferrered_reductions_per_wi * max_wg) { - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - } - } - auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; @@ -479,20 +465,6 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); - if (reduction_groups > 1) { - const size_t &max_wg = - d.get_info(); - - if (reduction_nelems < preferrered_reductions_per_wi * max_wg) { - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - } - } - auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; @@ -574,20 +546,6 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( (reduction_nelems + reductions_per_wi * wg - 1) / (reductions_per_wi * wg); - if (reduction_groups > 1) { - const size_t &max_wg = - d.get_info(); - - if (reduction_nelems < preferrered_reductions_per_wi * max_wg) { - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - } - } - auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; From 32d4419a00b851e07348ade431f5f00456f83f3b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 29 Aug 2023 04:27:21 -0500 Subject: [PATCH 28/44] Change WG traversal pattern in tree reduction kernel Made changes similar to those made in kernels for atomic reduction. The WG's location change along iteration dimension the fastest (previously along reduction dimension the fastest). 
Due to this change performance of reduction increases 7-8x: ``` In [1]: import dpctl.tensor as dpt In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f2")), (1282200, 128)) In [3]: %time y = dpt.sum(x, axis=0, dtype="f2") CPU times: user 284 ms, sys: 3.68 ms, total: 287 ms Wall time: 316 ms In [4]: %time y = dpt.sum(x, axis=0, dtype="f2") CPU times: user 18.6 ms, sys: 18.9 ms, total: 37.5 ms Wall time: 43 ms In [5]: quit ``` While in the main branch: ``` In [1]: import dpctl.tensor as dpt In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f2")), (1282200, 128)) In [3]: %time y = dpt.sum(x, axis=0, dtype="f2") CPU times: user 440 ms, sys: 129 ms, total: 569 ms Wall time: 514 ms In [4]: %time y = dpt.sum(x, axis=0, dtype="f2") CPU times: user 142 ms, sys: 159 ms, total: 301 ms Wall time: 325 ms In [5]: %time y = dpt.sum(x, axis=0, dtype="f2") CPU times: user 142 ms, sys: 154 ms, total: 296 ms Wall time: 325 ms In [6]: quit ``` --- dpctl/tensor/libtensor/include/kernels/reductions.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 0480fb9037..30cd3fad42 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -610,14 +610,13 @@ struct ReductionOverGroupNoAtomicFunctor void operator()(sycl::nd_item<1> it) const { - - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t iter_gid = it.get_global_id(0) / red_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + // work-items sums over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid From 2c3f748723842aa7e4dec2e32137703e505116e6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 29 Aug 2023 10:38:47 -0500 Subject: [PATCH 29/44] Call operator of all indexers must return py::ssize_t --- .../libtensor/include/utils/offset_utils.hpp | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp index ac02d26bf0..c8fe1f3cd3 100644 --- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -144,12 +144,12 @@ struct StridedIndexer { } - size_t operator()(py::ssize_t gid) const + py::ssize_t operator()(py::ssize_t gid) const { return compute_offset(gid); } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { return compute_offset(static_cast(gid)); } @@ -159,7 +159,7 @@ struct StridedIndexer py::ssize_t starting_offset; py::ssize_t const *shape_strides; - size_t compute_offset(py::ssize_t gid) const + py::ssize_t compute_offset(py::ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; @@ -185,12 +185,12 @@ struct UnpackedStridedIndexer { } - size_t operator()(py::ssize_t gid) const + py::ssize_t operator()(py::ssize_t 
gid) const { return compute_offset(gid); } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { return compute_offset(static_cast(gid)); } @@ -201,7 +201,7 @@ struct UnpackedStridedIndexer py::ssize_t const *shape; py::ssize_t const *strides; - size_t compute_offset(py::ssize_t gid) const + py::ssize_t compute_offset(py::ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; @@ -223,11 +223,10 @@ struct Strided1DIndexer { } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { // ensure 0 <= gid < size - return static_cast(offset + - std::min(gid, size - 1) * step); + return offset + std::min(gid, size - 1) * step; } private: @@ -245,9 +244,9 @@ struct Strided1DCyclicIndexer { } - size_t operator()(size_t gid) const + py::ssize_t operator()(size_t gid) const { - return static_cast(offset + (gid % size) * step); + return offset + (gid % size) * step; } private: From aea79dd4614595517be35e8bbb7b438f51ee3b5f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 29 Aug 2023 10:43:24 -0500 Subject: [PATCH 30/44] Add method CIndexer_vector::get_left_rolled_displacement This is used to compute displacement for a[(i0 - shifts[0]) % shape[0], (i1 - shifts[1]) % shape[1], ... ] --- .../libtensor/include/utils/strided_iters.hpp | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp index f654087281..c0de44f058 100644 --- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp +++ b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp @@ -238,6 +238,30 @@ template class CIndexer_vector } return; } + + template + void get_left_rolled_displacement(indT i, + ShapeTy shape, + StridesTy stride, + StridesTy shifts, + indT &disp) const + { + indT i_ = i; + indT d = 0; + for (int dim = nd; --dim > 0;) { + const indT si = shape[dim]; + const indT q = i_ / si; + const indT r = (i_ - q * si); + // assumes si > shifts[dim] >= 0 + const indT shifted_r = + (r < shifts[dim] ? r + si - shifts[dim] : r - shifts[dim]); + d += shifted_r * stride[dim]; + i_ = q; + } + const indT shifted_r = + (i_ < shifts[0] ? i_ + shape[0] - shifts[0] : i_ - shifts[0]); + disp = d + shifted_r * stride[0]; + } }; /* From 558765f7b276faad87a919b48c7108aaf5d4983b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 30 Aug 2023 02:03:31 -0500 Subject: [PATCH 31/44] Introduced dedicated function for rolling with nd-shift Function for flattened rolling is renamed: _copy_usm_ndarray_for_roll -> _copy_usm_ndarray_for_roll_1d _copy_usm_ndarray_for_roll_1d has the same signature: _copy_usm_ndarray_for_roll_1d( src : usm_ndarray, dst : usm_ndarray, shift: Int, sycl_queue: dpctl.SyclQueue) -> Tuple[dpctl.SyclEvent, dpctl.SyclEvent] Introduced _copy_usm_ndarray_for_roll_nd( src : usm_ndarray, dst : usm_ndarray, shifts: Tuple[Int], sycl_queue: dpctl.SyclQueue) -> Tuple[dpctl.SyclEvent, dpctl.SyclEvent] The length of shifts tuple must be the same as the dimensionality of src and dst arrays, which are supposed to have the same shape and the same data type. 
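A minimal usage sketch of the new nd-shift entry point (an illustration only, not part of the commit; the array shape, dtype and shift values below are made up, and the call assumes the signature listed above):

```
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti

x = dpt.reshape(dpt.arange(12, dtype="i4"), (3, 4))
res = dpt.empty(
    x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=x.sycl_queue
)
# one integral shift per dimension of x
ht_ev, _ = ti._copy_usm_ndarray_for_roll_nd(
    src=x, dst=res, shifts=(1, 2), sycl_queue=x.sycl_queue
)
ht_ev.wait()
```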
--- .../include/kernels/copy_and_cast.hpp | 206 ++++++++++++++++-- .../tensor/libtensor/source/copy_for_roll.cpp | 168 +++++++++++++- .../tensor/libtensor/source/copy_for_roll.hpp | 17 +- dpctl/tensor/libtensor/source/tensor_py.cpp | 13 +- 4 files changed, 369 insertions(+), 35 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index 4be31578ee..ce69afdbca 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -746,7 +746,85 @@ template struct CopyForReshapeGenericFactory } }; -// =============== Copying for reshape ================== // +// ================== Copying for roll ================== // + +/*! @brief Functor to cyclically roll global_id to the left */ +struct LeftRolled1DTransformer +{ + LeftRolled1DTransformer(size_t offset, size_t size) + : offset_(offset), size_(size) + { + } + + size_t operator()(size_t gid) const + { + const size_t shifted_gid = + ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); + return shifted_gid; + } + +private: + size_t offset_ = 0; + size_t size_ = 1; +}; + +/*! @brief Indexer functor to compose indexer and transformer */ +template struct CompositionIndexer +{ + CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} + + auto operator()(size_t gid) const + { + return f_(t_(gid)); + } + +private: + IndexerT f_; + TransformerT t_; +}; + +/*! @brief Indexer functor to find offset for nd-shifted indices lifted from + * iteration id */ +struct RolledNDIndexer +{ + RolledNDIndexer(int nd, + const py::ssize_t *shape, + const py::ssize_t *strides, + const py::ssize_t *ndshifts, + py::ssize_t starting_offset) + : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), + starting_offset_(starting_offset) + { + } + + py::ssize_t operator()(size_t gid) const + { + return compute_offset(gid); + } + +private: + int nd_ = -1; + const py::ssize_t *shape_ = nullptr; + const py::ssize_t *strides_ = nullptr; + const py::ssize_t *ndshifts_ = nullptr; + py::ssize_t starting_offset_ = 0; + + py::ssize_t compute_offset(py::ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd_); + py::ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( + gid, + shape_, // shape ptr + strides_, // strides ptr + ndshifts_, // shifts ptr + relative_offset_); + return starting_offset_ + relative_offset_; + } +}; template class copy_for_roll_strided_kernel; @@ -755,32 +833,26 @@ template class StridedCopyForRollFunctor { private: - size_t offset = 0; - size_t size = 1; const Ty *src_p = nullptr; Ty *dst_p = nullptr; SrcIndexerT src_indexer_; DstIndexerT dst_indexer_; public: - StridedCopyForRollFunctor(size_t shift, - size_t nelems, - const Ty *src_ptr, + StridedCopyForRollFunctor(const Ty *src_ptr, Ty *dst_ptr, SrcIndexerT src_indexer, DstIndexerT dst_indexer) - : offset(shift), size(nelems), src_p(src_ptr), dst_p(dst_ptr), - src_indexer_(src_indexer), dst_indexer_(dst_indexer) + : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) { } void operator()(sycl::id<1> wiid) const { const size_t gid = wiid.get(0); - const size_t shifted_gid = - ((gid < offset) ? 
gid + size - offset : gid - offset); - const py::ssize_t src_offset = src_indexer_(shifted_gid); + const py::ssize_t src_offset = src_indexer_(gid); const py::ssize_t dst_offset = dst_indexer_(gid); dst_p[dst_offset] = src_p[src_offset]; @@ -800,8 +872,6 @@ typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( py::ssize_t, // dst_offset const std::vector &); -template class copy_for_roll_contig_kernel; - /*! * @brief Function to copy content of array with a shift. * @@ -812,8 +882,8 @@ template class copy_for_roll_contig_kernel; * @param shift The shift in flat indexing, must be non-negative. * @param nelems The number of elements to copy * @param nd Array dimensionality of the destination and source arrays - * @param packed_shapes_and_strides Kernel accessible USM array of size - * `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param packed_shapes_and_strides Kernel accessible USM array + * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`. * @param src_p Typeless USM pointer to the buffer of the source array * @param src_offset Displacement of first element of src relative src_p in * elements @@ -849,11 +919,19 @@ copy_for_roll_strided_impl(sycl::queue q, // [ common_shape; src_strides; dst_strides ] StridedIndexer src_indexer{nd, src_offset, packed_shapes_and_strides}; + LeftRolled1DTransformer left_roll_transformer{shift, nelems}; + + using CompositeIndexerT = + CompositionIndexer; + + CompositeIndexerT rolled_src_indexer(src_indexer, + left_roll_transformer); + UnpackedStridedIndexer dst_indexer{nd, dst_offset, packed_shapes_and_strides, packed_shapes_and_strides + 2 * nd}; - using KernelName = copy_for_roll_strided_kernel; const Ty *src_tp = reinterpret_cast(src_p); @@ -861,9 +939,9 @@ copy_for_roll_strided_impl(sycl::queue q, cgh.parallel_for( sycl::range<1>(nelems), - StridedCopyForRollFunctor( - shift, nelems, src_tp, dst_tp, src_indexer, dst_indexer)); + src_tp, dst_tp, rolled_src_indexer, dst_indexer)); }); return copy_for_roll_ev; @@ -880,6 +958,8 @@ typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( py::ssize_t, // dst_offset const std::vector &); +template class copy_for_roll_contig_kernel; + /*! * @brief Function to copy content of array with a shift. 
* @@ -917,6 +997,10 @@ sycl::event copy_for_roll_contig_impl(sycl::queue q, cgh.depends_on(depends); NoOpIndexer src_indexer{}; + LeftRolled1DTransformer roller{shift, nelems}; + + CompositionIndexer + left_rolled_src_indexer{src_indexer, roller}; NoOpIndexer dst_indexer{}; using KernelName = copy_for_roll_contig_kernel; @@ -926,8 +1010,10 @@ sycl::event copy_for_roll_contig_impl(sycl::queue q, cgh.parallel_for( sycl::range<1>(nelems), - StridedCopyForRollFunctor( - shift, nelems, src_tp, dst_tp, src_indexer, dst_indexer)); + StridedCopyForRollFunctor< + Ty, CompositionIndexer, + NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer, + dst_indexer)); }); return copy_for_roll_ev; @@ -961,6 +1047,86 @@ template struct CopyForRollContigFactory } }; +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue, + size_t, // num_elements + int, // common_nd + const py::ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + py::ssize_t, // src_offset + char *, // dst_data_ptr + py::ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue q, + size_t nelems, + int nd, + const py::ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + py::ssize_t src_offset, + char *dst_p, + py::ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const py::ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const py::ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const py::ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const py::ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, shifts_ptr, + src_offset}; + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + } // namespace copy_and_cast } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp index d8ae3059b0..eee129932f 100644 --- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp +++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp @@ -44,6 +44,8 @@ namespace py_internal namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; using dpctl::utils::keep_args_alive; @@ -54,6 +56,9 @@ static copy_for_roll_strided_fn_ptr_t static copy_for_roll_contig_fn_ptr_t copy_for_roll_contig_dispatch_vector[td_ns::num_types]; +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + /* * Copies src into dst (same data type) of different shapes by using flat * iterations. @@ -64,11 +69,11 @@ static copy_for_roll_contig_fn_ptr_t * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] */ std::pair -copy_usm_ndarray_for_roll(dpctl::tensor::usm_ndarray src, - dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, - sycl::queue exec_q, - const std::vector &depends) +copy_usm_ndarray_for_roll_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends) { int src_nd = src.get_ndim(); int dst_nd = dst.get_ndim(); @@ -76,7 +81,7 @@ copy_usm_ndarray_for_roll(dpctl::tensor::usm_ndarray src, // Must have the same number of dimensions if (src_nd != dst_nd) { throw py::value_error( - "copy_usm_ndarray_for_roll requires src and dst to " + "copy_usm_ndarray_for_roll_1d requires src and dst to " "have the same number of dimensions."); } @@ -85,7 +90,7 @@ copy_usm_ndarray_for_roll(dpctl::tensor::usm_ndarray src, if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { throw py::value_error( - "copy_usm_ndarray_for_roll requires src and dst to " + "copy_usm_ndarray_for_roll_1d requires src and dst to " "have the same shape."); } @@ -97,7 +102,7 @@ copy_usm_ndarray_for_roll(dpctl::tensor::usm_ndarray src, // typenames must be the same if (src_typenum != dst_typenum) { throw py::value_error( - "copy_usm_ndarray_for_reshape requires src and dst to " + "copy_usm_ndarray_for_roll_1d requires src and dst to " "have the same type."); } @@ -245,6 +250,147 @@ copy_usm_ndarray_for_roll(dpctl::tensor::usm_ndarray src, copy_for_roll_event); } +std::pair +copy_usm_ndarray_for_roll_nd(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + const std::vector &shifts, + sycl::queue exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same number of dimensions."); + } + + if (static_cast(src_nd) != shifts.size()) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires shifts to " + "contain an integral shift for each array dimension."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + 
if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + py::ssize_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < src_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + // check for compatible queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + std::vector normalized_shifts{}; + normalized_shifts.reserve(src_nd); + + for (int i = 0; i < src_nd; ++i) { + // normalize shift parameter to be 0 <= offset < dim + py::ssize_t dim = src_shape_ptr[i]; + size_t offset = + (shifts[i] > 0) ? 
(shifts[i] % dim) : dim + (shifts[i] % dim); + + normalized_shifts.push_back(offset); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + auto const &common_shape = src.get_shape_vector(); + + constexpr py::ssize_t src_offset = 0; + constexpr py::ssize_t dst_offset = 0; + + auto fn = copy_for_roll_ndshift_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, common_shape, src_strides, dst_strides, + normalized_shifts); + + py::ssize_t *shape_strides_shifts = std::get<0>(ptr_size_event_tuple); + if (shape_strides_shifts == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(copy_for_roll_event); + auto ctx = exec_q.get_context(); + cgh.host_task([shape_strides_shifts, ctx]() { + sycl::free(shape_strides_shifts, ctx); + }); + }); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + void init_copy_for_roll_dispatch_vectors(void) { using namespace td_ns; @@ -260,6 +406,12 @@ void init_copy_for_roll_dispatch_vectors(void) CopyForRollContigFactory, num_types> dvb2; dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory; + DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector); } } // namespace py_internal diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.hpp b/dpctl/tensor/libtensor/source/copy_for_roll.hpp index 61f0b3baca..0c00710e11 100644 --- a/dpctl/tensor/libtensor/source/copy_for_roll.hpp +++ b/dpctl/tensor/libtensor/source/copy_for_roll.hpp @@ -38,11 +38,18 @@ namespace py_internal { extern std::pair -copy_usm_ndarray_for_roll(dpctl::tensor::usm_ndarray src, - dpctl::tensor::usm_ndarray dst, - py::ssize_t shift, - sycl::queue exec_q, - const std::vector &depends = {}); +copy_usm_ndarray_for_roll_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + py::ssize_t shift, + sycl::queue exec_q, + const std::vector &depends = {}); + +extern std::pair +copy_usm_ndarray_for_roll_nd(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + const std::vector &shifts, + sycl::queue exec_q, + const std::vector &depends = {}); extern void init_copy_for_roll_dispatch_vectors(); diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 9decdf3a9b..3f44f90d9b 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -71,7 +71,8 @@ using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll 
============================= */ -using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ @@ -232,7 +233,7 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_copy_usm_ndarray_for_roll", ©_usm_ndarray_for_roll, + m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " "shapes using underlying 'C'-contiguous order for flat " "traversal with shift. " @@ -240,6 +241,14 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), py::arg("shift"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for " + "traversal with shifts along each axis. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shifts"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " "specified by " From 987af9e68e7b7169d3516b46a75765c9ca77f6e5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 30 Aug 2023 02:10:31 -0500 Subject: [PATCH 32/44] Change name of _tensor_impl function in roll implementation --- dpctl/tensor/_manipulation_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 450bb83aaa..fd3b41e641 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -429,7 +429,7 @@ def roll(X, shift, axis=None): res = dpt.empty( X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue ) - hev, _ = ti._copy_usm_ndarray_for_roll( + hev, _ = ti._copy_usm_ndarray_for_roll_1d( src=X, dst=res, shift=shift, sycl_queue=X.sycl_queue ) hev.wait() From 64fa95ed7c6cc590c404fbc27398091782e138f6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 30 Aug 2023 04:04:08 -0500 Subject: [PATCH 33/44] Deployed _copy_usm_ndarray_for_roll_nd in dpt.roll --- dpctl/tensor/_manipulation_functions.py | 33 +++++++++---------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index fd3b41e641..45cdf69c42 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -15,7 +15,8 @@ # limitations under the License. 
-from itertools import chain, product, repeat +import operator +from itertools import chain, repeat import numpy as np from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple @@ -426,6 +427,7 @@ def roll(X, shift, axis=None): if not isinstance(X, dpt.usm_ndarray): raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") if axis is None: + shift = operator.index(shift) res = dpt.empty( X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue ) @@ -438,31 +440,20 @@ def roll(X, shift, axis=None): broadcasted = np.broadcast(shift, axis) if broadcasted.ndim > 1: raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") - shifts = {ax: 0 for ax in range(X.ndim)} + shifts = [ + 0, + ] * X.ndim for sh, ax in broadcasted: shifts[ax] += sh - rolls = [((np.s_[:], np.s_[:]),)] * X.ndim - for ax, offset in shifts.items(): - offset %= X.shape[ax] or 1 - if offset: - # (original, result), (original, result) - rolls[ax] = ( - (np.s_[:-offset], np.s_[offset:]), - (np.s_[-offset:], np.s_[:offset]), - ) + exec_q = X.sycl_queue res = dpt.empty( - X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=X.sycl_queue + X.shape, dtype=X.dtype, usm_type=X.usm_type, sycl_queue=exec_q ) - hev_list = [] - for indices in product(*rolls): - arr_index, res_index = zip(*indices) - hev, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=X[arr_index], dst=res[res_index], sycl_queue=X.sycl_queue - ) - hev_list.append(hev) - - dpctl.SyclEvent.wait_for(hev_list) + ht_e, _ = ti._copy_usm_ndarray_for_roll_nd( + src=X, dst=res, shifts=shifts, sycl_queue=exec_q + ) + ht_e.wait() return res From 3f6d78e9a36c8200685d17e1b906e0cdad3c43c0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 2 Sep 2023 02:43:53 -0500 Subject: [PATCH 34/44] run_test files to output verbose listing of platform config --- conda-recipe/run_test.bat | 4 ++-- conda-recipe/run_test.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda-recipe/run_test.bat b/conda-recipe/run_test.bat index 833ddfd21c..85cac031e7 100644 --- a/conda-recipe/run_test.bat +++ b/conda-recipe/run_test.bat @@ -3,8 +3,8 @@ "%PYTHON%" -c "import dpctl; print(dpctl.__version__)" if errorlevel 1 exit 1 -"%PYTHON%" -c "import dpctl; dpctl.lsplatform()" +"%PYTHON%" -m dpctl -f if errorlevel 1 exit 1 -python -m pytest -q -p no:faulthandler -ra --disable-warnings --pyargs dpctl -vv +python -m pytest -q -ra --disable-warnings --pyargs dpctl -vv if errorlevel 1 exit 1 diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh index 4b3566d7bd..63ae7996cc 100644 --- a/conda-recipe/run_test.sh +++ b/conda-recipe/run_test.sh @@ -3,5 +3,5 @@ set -e ${PYTHON} -c "import dpctl; print(dpctl.__version__)" -${PYTHON} -c "import dpctl; dpctl.lsplatform(verbosity=2)" -${PYTHON} -m pytest -q -ra --disable-warnings -p no:faulthandler --cov dpctl --cov-report term-missing --pyargs dpctl -vv +${PYTHON} -m dpctl -f +${PYTHON} -m pytest -q -ra --disable-warnings --cov dpctl --cov-report term-missing --pyargs dpctl -vv From 7997fd5fb5d786461884b038c49978aef66fcf1b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sat, 2 Sep 2023 20:51:01 -0500 Subject: [PATCH 35/44] Spelling fixes pointed out by codespell --- dpctl/_sycl_context.pyx | 10 +++++----- dpctl/_sycl_device.pyx | 2 +- dpctl/_sycl_event.pyx | 2 +- dpctl/_sycl_queue.pyx | 2 +- dpctl/_sycl_timer.py | 6 +++--- dpctl/apis/include/dpctl4pybind11.hpp | 4 ++-- dpctl/memory/_memory.pyx | 2 +- dpctl/tensor/_copy_utils.py | 2 +- dpctl/tensor/_device.py | 2 +- 
dpctl/tensor/_elementwise_funcs.py | 2 +- dpctl/tensor/_manipulation_functions.py | 4 ++-- dpctl/tensor/_print.py | 2 +- dpctl/tensor/_slicing.pxi | 4 ++-- dpctl/tensor/_stride_utils.pxi | 2 +- dpctl/tensor/_usmarray.pyx | 10 +++++----- dpctl/tensor/include/dlpack/README.md | 2 +- dpctl/tensor/include/dlpack/dlpack.h | 2 +- .../tensor/libtensor/include/kernels/copy_and_cast.hpp | 4 ++-- dpctl/tensor/libtensor/include/kernels/reductions.hpp | 2 +- dpctl/tensor/libtensor/include/utils/offset_utils.hpp | 2 +- .../source/copy_numpy_ndarray_into_usm_ndarray.cpp | 2 +- .../tensor/libtensor/source/device_support_queries.cpp | 2 +- dpctl/tensor/libtensor/source/tensor_py.cpp | 2 +- dpctl/tests/test_tensor_asarray.py | 2 +- libsyclinterface/include/dpctl_data_types.h | 8 ++++---- libsyclinterface/include/dpctl_device_selection.hpp | 3 ++- .../include/dpctl_sycl_device_selector_interface.h | 2 +- libsyclinterface/include/dpctl_sycl_event_interface.h | 2 +- libsyclinterface/include/dpctl_sycl_queue_interface.h | 8 ++++---- libsyclinterface/include/dpctl_vector.h | 4 ++-- 30 files changed, 52 insertions(+), 51 deletions(-) diff --git a/dpctl/_sycl_context.pyx b/dpctl/_sycl_context.pyx index 9a9f7c5277..71c2fb7591 100644 --- a/dpctl/_sycl_context.pyx +++ b/dpctl/_sycl_context.pyx @@ -114,7 +114,7 @@ cdef class SyclContext(_SyclContext): ctx = dpctl.SyclContext() print(ctx.get_devices()) - - Invoking the constuctor with a specific filter string that creates a + - Invoking the constructor with a specific filter string that creates a context for the device corresponding to the filter string. :Example: @@ -127,7 +127,7 @@ cdef class SyclContext(_SyclContext): d = ctx.get_devices()[0] assert(d.is_gpu) - - Invoking the constuctor with a :class:`dpctl.SyclDevice` object + - Invoking the constructor with a :class:`dpctl.SyclDevice` object creates a context for that device. :Example: @@ -141,7 +141,7 @@ cdef class SyclContext(_SyclContext): d = ctx.get_devices()[0] assert(d.is_gpu) - - Invoking the constuctor with a list of :class:`dpctl.SyclDevice` + - Invoking the constructor with a list of :class:`dpctl.SyclDevice` objects creates a common context for all the devices. This constructor call is especially useful when creation a context for multiple sub-devices. @@ -159,7 +159,7 @@ cdef class SyclContext(_SyclContext): ctx = dpctl.SyclContext(sub_devices) assert(len(ctx.get_devices) == len(sub_devices)) - - Invoking the constuctor with a named ``PyCapsule`` with name + - Invoking the constructor with a named ``PyCapsule`` with name **"SyclContextRef"** that carries a pointer to a ``sycl::context`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by @@ -430,7 +430,7 @@ cdef class SyclContext(_SyclContext): return num_devs else: raise ValueError( - "An error was encountered quering the number of devices " + "An error was encountered querying the number of devices " "associated with this context" ) diff --git a/dpctl/_sycl_device.pyx b/dpctl/_sycl_device.pyx index 6814dec677..50ce94d94c 100644 --- a/dpctl/_sycl_device.pyx +++ b/dpctl/_sycl_device.pyx @@ -1163,7 +1163,7 @@ cdef class SyclDevice(_SyclDevice): def create_sub_devices(self, **kwargs): """create_sub_devices(partition=parition_spec) Creates a list of sub-devices by partitioning a root device based on the - provided partion specifier. + provided partition specifier. A partition specifier must be provided using a "partition" keyword argument. 
Possible values for the specifier are: an int, a diff --git a/dpctl/_sycl_event.pyx b/dpctl/_sycl_event.pyx index 90165b4547..34576a2ef7 100644 --- a/dpctl/_sycl_event.pyx +++ b/dpctl/_sycl_event.pyx @@ -119,7 +119,7 @@ cdef class SyclEvent(_SyclEvent): # Create a default SyclEvent e = dpctl.SyclEvent() - - Invoking the constuctor with a named ``PyCapsule`` with name + - Invoking the constructor with a named ``PyCapsule`` with name **"SyclEventRef"** that carries a pointer to a ``sycl::event`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index dbf0ae8385..6acf3396e1 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -246,7 +246,7 @@ cdef class SyclQueue(_SyclQueue): # create a queue for each sub-device using the common context queues = [dpctl.SyclQueue(ctx, sub_d) for sub_d in sub_devices] - - Invoking the constuctor with a named ``PyCapsule`` with the name + - Invoking the constructor with a named ``PyCapsule`` with the name **"SyclQueueRef"** that carries a pointer to a ``sycl::queue`` object. The capsule will be renamed upon successful consumption to ensure one-time use. A new named capsule can be constructed by diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 0137549251..322272df2d 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -37,14 +37,14 @@ class SyclTimer: q = dpctl.SyclQueue(property='enable_profiling') # create the timer - miliseconds_sc = 1e-3 - timer = dpctl.SyclTimer(time_scale = miliseconds_sc) + milliseconds_sc = 1e-3 + timer = dpctl.SyclTimer(time_scale = milliseconds_sc) # use the timer with timer(queue=q): code_block - # retrieve elapsed times in miliseconds + # retrieve elapsed times in milliseconds sycl_dt, wall_dt = timer.dt Remark: diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp index 1434da1f32..b529c41599 100644 --- a/dpctl/apis/include/dpctl4pybind11.hpp +++ b/dpctl/apis/include/dpctl4pybind11.hpp @@ -742,12 +742,12 @@ class usm_memory : public py::object return nullptr; } - auto convertor = + auto converter = ::dpctl::detail::dpctl_capi::get().as_usm_memory_pyobj(); py::object res; try { - res = convertor(py::handle(o)); + res = converter(py::handle(o)); } catch (const py::error_already_set &e) { return nullptr; } diff --git a/dpctl/memory/_memory.pyx b/dpctl/memory/_memory.pyx index 4ea19bfba0..e0900f870e 100644 --- a/dpctl/memory/_memory.pyx +++ b/dpctl/memory/_memory.pyx @@ -92,7 +92,7 @@ cdef void copy_via_host(void *dest_ptr, SyclQueue dest_queue, void *src_ptr, SyclQueue src_queue, size_t nbytes): """ Copies `nbytes` bytes from `src_ptr` USM memory to - `dest_ptr` USM memory using host as the intemediary. + `dest_ptr` USM memory using host as the intermediary. This is useful when `src_ptr` and `dest_ptr` are bound to incompatible SYCL contexts. 
diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index ad5b956851..bc1b071460 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -56,7 +56,7 @@ def _copy_to_numpy(ary): def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): "Copies numpy array `np_ary` into a new usm_ndarray" - # This may peform a copy to meet stated requirements + # This may perform a copy to meet stated requirements Xnp = np.require(np_ary, requirements=["A", "E"]) alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None) dt = Xnp.dtype diff --git a/dpctl/tensor/_device.py b/dpctl/tensor/_device.py index 63e9cee80f..30afacb435 100644 --- a/dpctl/tensor/_device.py +++ b/dpctl/tensor/_device.py @@ -96,7 +96,7 @@ def sycl_context(self): @property def sycl_device(self): """ - :class:`dpctl.SyclDevice` targed by this :class:`.Device`. + :class:`dpctl.SyclDevice` targeted by this :class:`.Device`. """ return self.sycl_queue_.sycl_device diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index fe85a183ba..8e2abee837 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -1628,7 +1628,7 @@ _subtract_docstring_ = """ subtract(x1, x2, out=None, order='K') -Calculates the difference bewteen each element `x1_i` of the input +Calculates the difference between each element `x1_i` of the input array `x1` and the respective element `x2_i` of the input array `x2`. Args: diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 9406e386af..d76f33af94 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -315,7 +315,7 @@ def broadcast_to(X, shape): raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") # Use numpy.broadcast_to to check the validity of the input - # parametr 'shape'. Raise ValueError if 'X' is not compatible + # parameter 'shape'. Raise ValueError if 'X' is not compatible # with 'shape' according to NumPy's broadcasting rules. new_array = np.broadcast_to( np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape @@ -757,7 +757,7 @@ def iinfo(dtype): Returns: iinfo_object: - An object with the followign attributes + An object with the following attributes * bits: int number of bits occupied by the data type * max: int diff --git a/dpctl/tensor/_print.py b/dpctl/tensor/_print.py index 30bb353df7..97e485a7e7 100644 --- a/dpctl/tensor/_print.py +++ b/dpctl/tensor/_print.py @@ -148,7 +148,7 @@ def set_print_options( suppress (bool, optional): If `True,` numbers equal to zero in the current precision will print as zero. Default: `False`. - nanstr (str, optional): String used to repesent nan. + nanstr (str, optional): String used to represent nan. Raises `TypeError` if nanstr is not a string. Default: `"nan"`. infstr (str, optional): String used to represent infinity. diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi index 4f45d57391..4289284251 100644 --- a/dpctl/tensor/_slicing.pxi +++ b/dpctl/tensor/_slicing.pxi @@ -104,7 +104,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): Give basic slicing index `ind` and array layout information produce a 5-tuple (resulting_shape, resulting_strides, resulting_offset, advanced_ind, resulting_advanced_ind_pos) - used to contruct a view into underlying array over which advanced + used to construct a view into underlying array over which advanced indexing, if any, is to be performed. 
Raises IndexError for invalid index `ind`. @@ -201,7 +201,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int): raise TypeError if ellipses_count > 1: raise IndexError( - "an index can only have a sinlge ellipsis ('...')") + "an index can only have a single ellipsis ('...')") if axes_referenced > len(shape): raise IndexError( "too many indices for an array, array is " diff --git a/dpctl/tensor/_stride_utils.pxi b/dpctl/tensor/_stride_utils.pxi index bc7349a5e6..4f12b989dc 100644 --- a/dpctl/tensor/_stride_utils.pxi +++ b/dpctl/tensor/_stride_utils.pxi @@ -55,7 +55,7 @@ cdef int _from_input_shape_strides( nelems - Number of elements in array min_disp = min( dot(strides, index), index for shape) max_disp = max( dor(strides, index), index for shape) - contig = enumation for array contiguity + contig = enumeration for array contiguity Returns: 0 on success, error code otherwise. On success pointers point to allocated arrays, Otherwise they are set to NULL diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index d6e15f9339..ba18600135 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -425,7 +425,7 @@ cdef class usm_ndarray: cdef char *ary_ptr = NULL if (not isinstance(self.base_, dpmem._memory._Memory)): raise InternalUSMArrayError( - "Invalid instance of usm_ndarray ecountered. " + "Invalid instance of usm_ndarray encountered. " "Private field base_ has an unexpected type {}.".format( type(self.base_) ) @@ -566,7 +566,7 @@ cdef class usm_ndarray: elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS): return _f_contig_strides(self.nd_, self.shape_) else: - raise ValueError("Inconsitent usm_ndarray data") + raise ValueError("Inconsistent usm_ndarray data") @property def flags(self): @@ -653,7 +653,7 @@ cdef class usm_ndarray: @property def T(self): - """ Returns tranposed array for 2D array, raises `ValueError` + """ Returns transposed array for 2D array, raises `ValueError` otherwise. """ if self.nd_ == 2: @@ -1376,8 +1376,8 @@ cdef api object UsmNDArray_MakeSimpleFromMemory( QRef: DPCTLSyclQueueRef associated with the allocation offset: distance between element with zero multi-index and the start of allocation - oder: Memory layout of the array. Use 'C' for C-contiguous or - row-major layout; 'F' for F-contiguous or column-major layout + order: Memory layout of the array. Use 'C' for C-contiguous or + row-major layout; 'F' for F-contiguous or column-major layout Returns: Created usm_ndarray instance """ diff --git a/dpctl/tensor/include/dlpack/README.md b/dpctl/tensor/include/dlpack/README.md index 02c594c0fe..2c22e9aa8d 100644 --- a/dpctl/tensor/include/dlpack/README.md +++ b/dpctl/tensor/include/dlpack/README.md @@ -4,4 +4,4 @@ The header `dlpack.h` downloaded from `https://github.com/dmlc/dlpack.git` remot The file can also be viewed using github web interface at https://github.com/dmlc/dlpack/blob/e2bdd3bee8cb6501558042633fa59144cc8b7f5f/include/dlpack/dlpack.h -License file was retrived from https://github.com/dmlc/dlpack/blob/main/LICENSE +License file was retrieved from https://github.com/dmlc/dlpack/blob/main/LICENSE diff --git a/dpctl/tensor/include/dlpack/dlpack.h b/dpctl/tensor/include/dlpack/dlpack.h index 6d51801123..672448d1c6 100644 --- a/dpctl/tensor/include/dlpack/dlpack.h +++ b/dpctl/tensor/include/dlpack/dlpack.h @@ -168,7 +168,7 @@ typedef struct { * `byte_offset` field should be used to point to the beginning of the data. 
* * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, - * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * TVM, perhaps others) do not adhere to this 256 byte alignment requirement * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed * (after which this note will be updated); at the moment it is recommended * to not rely on the data pointer being correctly aligned. diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index 33969ec24a..99c356aeb9 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -118,7 +118,7 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` to `dstTy`. - Both arrays have array dimensionality specied via argument `nd`. The + Both arrays have array dimensionality specified via argument `nd`. The `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the first `nd` elements encode common shape, second `nd` elements contain strides of `src` array, and the trailing `nd` elements contain strides of `dst` array. @@ -696,7 +696,7 @@ typedef sycl::event (*copy_for_reshape_fn_ptr_t)( * @param src_nd Array dimension of the source array * @param dst_nd Array dimension of the destination array * @param packed_shapes_and_strides Kernel accessible USM array of size - * `2*src_nd + 2*dst_nd` with contant `[src_shape, src_strides, dst_shape, + * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape, * dst_strides]`. * @param src_p Typeless USM pointer to the buffer of the source array * @param dst_p Typeless USM pointer to the buffer of the destination array diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index c8aae0a3b9..f6657fa0af 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -642,7 +642,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( size_t reductions_per_wi(preferrered_reductions_per_wi); if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + // reduction only requires 1 work-group, can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp index ac02d26bf0..99e56b850d 100644 --- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -91,7 +91,7 @@ device_allocate_and_pack(sycl::queue q, { // memory transfer optimization, use USM-host for temporary speeds up - // tranfer to device, especially on dGPUs + // transfer to device, especially on dGPUs using usm_host_allocatorT = sycl::usm_allocator; using shT = std::vector; diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp index 4097e76178..0e7bc195e9 100644 --- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp +++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -210,7 +210,7 @@ void 
copy_numpy_ndarray_into_usm_ndarray( simplified_dst_strides.push_back(1); } - // Minumum and maximum element offsets for source np.ndarray + // Minimum and maximum element offsets for source np.ndarray py::ssize_t npy_src_min_nelem_offset(src_offset); py::ssize_t npy_src_max_nelem_offset(src_offset); for (int i = 0; i < nd; ++i) { diff --git a/dpctl/tensor/libtensor/source/device_support_queries.cpp b/dpctl/tensor/libtensor/source/device_support_queries.cpp index 946c36ad26..d04c9c9ed2 100644 --- a/dpctl/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl/tensor/libtensor/source/device_support_queries.cpp @@ -51,7 +51,7 @@ std::string _default_device_fp_type(sycl::device d) std::string _default_device_int_type(sycl::device) { - return "l"; // code for numpy.dtype('long') to be consisent + return "l"; // code for numpy.dtype('long') to be consistent // with NumPy's default integer type across // platforms. } diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 9b4ba6cdad..662ca464c4 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -245,7 +245,7 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_copy_numpy_ndarray_into_usm_ndarray", ©_numpy_ndarray_into_usm_ndarray, - "Copy fom numpy array `src` into usm_ndarray `dst` synchronously.", + "Copy from numpy array `src` into usm_ndarray `dst` synchronously.", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); diff --git a/dpctl/tests/test_tensor_asarray.py b/dpctl/tests/test_tensor_asarray.py index f9bc31972c..e73c35ce26 100644 --- a/dpctl/tests/test_tensor_asarray.py +++ b/dpctl/tests/test_tensor_asarray.py @@ -71,7 +71,7 @@ def test_asarray_from_numpy(): assert type(Y) is dpt.usm_ndarray assert Y.shape == Xnp.shape assert Y.dtype == Xnp.dtype - # Fortan contiguous case + # Fortran contiguous case Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="f4", order="F") Y = dpt.asarray(Xnp, usm_type="shared") assert type(Y) is dpt.usm_ndarray diff --git a/libsyclinterface/include/dpctl_data_types.h b/libsyclinterface/include/dpctl_data_types.h index c8c812b86d..9bafac93d2 100644 --- a/libsyclinterface/include/dpctl_data_types.h +++ b/libsyclinterface/include/dpctl_data_types.h @@ -53,7 +53,7 @@ "__STDC_CONSTANT_MACROS before #including llvm-c/DataTypes.h" #endif -/* Note that includes , if this is a C99 system. */ +/* Note thatt includes , if this is a C99 system. */ #include #ifdef _AIX @@ -81,21 +81,21 @@ typedef signed int ssize_t; #endif /* _MSC_VER */ /*! - @brief Represents tha largest possible value of a 64 bit signed integer. + @brief Represents the largest possible value of a 64 bit signed integer. */ #if !defined(INT64_MAX) #define INT64_MAX 9223372036854775807LL #endif /*! - @brief Represents tha smallest possible value of a 64 bit signed integer. + @brief Represents the smallest possible value of a 64 bit signed integer. */ #if !defined(INT64_MIN) #define INT64_MIN ((-INT64_MAX) - 1) #endif /*! - @brief Represents tha largest possible value of a 64bit unsigned integer. + @brief Represents the largest possible value of a 64bit unsigned integer. 
*/ #if !defined(UINT64_MAX) #define UINT64_MAX 0xffffffffffffffffULL diff --git a/libsyclinterface/include/dpctl_device_selection.hpp b/libsyclinterface/include/dpctl_device_selection.hpp index 6aa4e69ec7..9da0072ab1 100644 --- a/libsyclinterface/include/dpctl_device_selection.hpp +++ b/libsyclinterface/include/dpctl_device_selection.hpp @@ -1,4 +1,5 @@ -//===-- dpctl_device_selection.h - Device selector class declar. --*-C++-*- =// +//===-- dpctl_device_selection.h - +// Device selector class declaration --*-C++-*- =// // // // Data Parallel Control (dpctl) diff --git a/libsyclinterface/include/dpctl_sycl_device_selector_interface.h b/libsyclinterface/include/dpctl_sycl_device_selector_interface.h index c1cd5fcd5c..f439f3281a 100644 --- a/libsyclinterface/include/dpctl_sycl_device_selector_interface.h +++ b/libsyclinterface/include/dpctl_sycl_device_selector_interface.h @@ -19,7 +19,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This header declares C contructors for the various SYCL device_selector +/// This header declares C constructors for the various SYCL device_selector /// classes. /// //===----------------------------------------------------------------------===// diff --git a/libsyclinterface/include/dpctl_sycl_event_interface.h b/libsyclinterface/include/dpctl_sycl_event_interface.h index a49d17f12e..6bb4d0412c 100644 --- a/libsyclinterface/include/dpctl_sycl_event_interface.h +++ b/libsyclinterface/include/dpctl_sycl_event_interface.h @@ -44,7 +44,7 @@ DPCTL_C_EXTERN_C_BEGIN DPCTL_DECLARE_VECTOR(Event) /*! - * @brief A wrapper for ``sycl::event`` contructor to construct a new event. + * @brief A wrapper for ``sycl::event`` constructor to construct a new event. * * @return An opaque DPCTLSyclEventRef pointer wrapping a ``sycl::event``. * @ingroup EventInterface diff --git a/libsyclinterface/include/dpctl_sycl_queue_interface.h b/libsyclinterface/include/dpctl_sycl_queue_interface.h index 8dd07280d2..1c5e53a395 100644 --- a/libsyclinterface/include/dpctl_sycl_queue_interface.h +++ b/libsyclinterface/include/dpctl_sycl_queue_interface.h @@ -42,8 +42,8 @@ DPCTL_C_EXTERN_C_BEGIN */ /*! - * @brief A wrapper for sycl::queue contructor to construct a new queue from the - * provided context, device, async handler and propertis bit flags. + * @brief A wrapper for sycl::queue constructor to construct a new queue from + * the provided context, device, async handler and propertis bit flags. * * @param CRef An opaque pointer to a sycl::context. * @param DRef An opaque pointer to a sycl::device @@ -362,7 +362,7 @@ DPCTL_API size_t DPCTLQueue_Hash(__dpctl_keep const DPCTLSyclQueueRef QRef); /*! - * @brief C-API wraper for ``sycl::queue::submit_barrier()``. + * @brief C-API wrapper for ``sycl::queue::submit_barrier()``. * * @param QRef An opaque pointer to the ``sycl::queue``. * @return An opaque pointer to the ``sycl::event`` returned by the @@ -373,7 +373,7 @@ __dpctl_give DPCTLSyclEventRef DPCTLQueue_SubmitBarrier(__dpctl_keep const DPCTLSyclQueueRef QRef); /*! - * @brief C-API wraper for ``sycl::queue::submit_barrier(event_vector)``. + * @brief C-API wrapper for ``sycl::queue::submit_barrier(event_vector)``. * * @param QRef An opaque pointer to the ``sycl::queue``. 
* @param DepEvents List of dependent DPCTLSyclEventRef objects (events) diff --git a/libsyclinterface/include/dpctl_vector.h b/libsyclinterface/include/dpctl_vector.h index fea49925ea..792368bbc3 100644 --- a/libsyclinterface/include/dpctl_vector.h +++ b/libsyclinterface/include/dpctl_vector.h @@ -83,8 +83,8 @@ DPCTL_C_EXTERN_C_BEGIN /*! \ @brief Returns the element at the specified index. \ @param VRef Opaque pointer to a vector. \ - @param index The index postion of the element to be returned. \ - @return The element at the specified postion, if the index position is \ + @param index The index position of the element to be returned. \ + @return The element at the specified position, if the index position is \ out of bounds then a nullptr is returned. \ */ \ DPCTL_API \ From 2f36893bda56365be094b2cec4b6f89f125fe943 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 3 Sep 2023 09:27:52 -0500 Subject: [PATCH 36/44] Update libsyclinterface/include/dpctl_data_types.h Co-authored-by: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> --- libsyclinterface/include/dpctl_data_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libsyclinterface/include/dpctl_data_types.h b/libsyclinterface/include/dpctl_data_types.h index 9bafac93d2..2e644f1327 100644 --- a/libsyclinterface/include/dpctl_data_types.h +++ b/libsyclinterface/include/dpctl_data_types.h @@ -53,7 +53,7 @@ "__STDC_CONSTANT_MACROS before #including llvm-c/DataTypes.h" #endif -/* Note thatt includes , if this is a C99 system. */ +/* Note that includes , if this is a C99 system. */ #include #ifdef _AIX From 845b4bb21382803f1f07622e23ff6d7fc2c985d1 Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Tue, 5 Sep 2023 21:42:48 +0200 Subject: [PATCH 37/44] Remove deprecated FindPythonLibs --- cmake/FindDpctl.cmake | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/cmake/FindDpctl.cmake b/cmake/FindDpctl.cmake index e917aaa194..fe75f3767f 100644 --- a/cmake/FindDpctl.cmake +++ b/cmake/FindDpctl.cmake @@ -17,15 +17,8 @@ # if(NOT Dpctl_FOUND) - set(_find_extra_args) - if(Dpctl_FIND_REQUIRED) - list(APPEND _find_extra_args REQUIRED) - endif() - if(Dpctl_FIND_QUIET) - list(APPEND _find_extra_args QUIET) - endif() - find_package(PythonInterp ${_find_extra_args}) - find_package(PythonLibs ${_find_extra_args}) + find_package(Python 3.9 REQUIRED + COMPONENTS Interpreter Development.Module) if(PYTHON_EXECUTABLE) execute_process(COMMAND "${PYTHON_EXECUTABLE}" From 03e7f264f403b94479f10a9d50bd7826580cca3d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Sep 2023 00:52:21 -0500 Subject: [PATCH 38/44] address_space_cast on local variable To create the multi_ptr from a local variable (in private memory) we should be using address_space::private_space.
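For context, a minimal standalone sketch of this pattern (an illustration only, not part of the patch; assumes a SYCL 2020 compiler such as DPC++):

```
#include <sycl/sycl.hpp>

int main() {
    sycl::queue q;
    float *res = sycl::malloc_device<float>(2, q);
    q.single_task([=]() {
         // cos_val is a stack variable, i.e. it resides in private memory,
         // so the multi_ptr passed to sycl::sincos must be created with
         // private_space rather than global_space
         float cos_val;
         auto cos_ptr = sycl::address_space_cast<
             sycl::access::address_space::private_space,
             sycl::access::decorated::yes>(&cos_val);
         res[0] = sycl::sincos(0.5f, cos_ptr);
         res[1] = cos_val;
     }).wait();
    sycl::free(res, q);
    return 0;
}
```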
--- .../libtensor/include/kernels/elementwise_functions/expm1.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index 3e69aa5464..e1c23113c6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -116,7 +116,7 @@ template struct Expm1Functor // x, y finite numbers realT cosY_val; auto cosY_val_multi_ptr = sycl::address_space_cast< - sycl::access::address_space::global_space, + sycl::access::address_space::private_space, sycl::access::decorated::yes>(&cosY_val); const realT sinY_val = sycl::sincos(y, cosY_val_multi_ptr); const realT sinhalfY_val = std::sin(y / 2); From fed0e9bfca4bb85e4ff471a89941fb99bcd5d658 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 31 Aug 2023 00:55:28 -0500 Subject: [PATCH 39/44] Fixed constexpr significant bits value for double Renamed variable for clarity. --- .../libtensor/include/kernels/elementwise_functions/sqrt.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 808e82539e..9b9fa95061 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -150,11 +150,11 @@ template struct SqrtFunctor int get_normal_scale_double(const double &v) const { - constexpr int float_significant_bits = 53; + constexpr int double_significant_bits = 52; constexpr std::uint64_t exponent_mask = 0x7ff; constexpr int exponent_bias = 1023; const int scale = static_cast( - (sycl::bit_cast(v) >> float_significant_bits) & + (sycl::bit_cast(v) >> double_significant_bits) & exponent_mask); return scale - exponent_bias; } From 36bc04ac28ec7742ca35a87b68d22d37567d0241 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Sep 2023 05:28:08 -0500 Subject: [PATCH 40/44] Restricted use of reduce_over_axis0 special kernels The kernel is applicable if both inputs are F-contiguous, or if the first input is F-contiguous and we are reducing to a 1d C-contiguous array.
Closes gh-1391 --- dpctl/tensor/libtensor/source/sum_reductions.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp index 7628813c6d..13ab268b55 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ b/dpctl/tensor/libtensor/source/sum_reductions.cpp @@ -218,7 +218,9 @@ std::pair py_sum_over_axis( return std::make_pair(keep_args_event, sum_over_axis_contig_ev); } } - else if (is_src_f_contig & is_dst_c_contig) { + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] [dst_typeid]; if (fn != nullptr) { From 8d8ef0b74ff97744c26d411aabbffd65bfe6770e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Sep 2023 07:33:16 -0500 Subject: [PATCH 41/44] Add test based on gh-1391 --- dpctl/tests/test_tensor_sum.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index fc2a0ec8de..403a823324 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -172,3 +172,18 @@ def test_largish_reduction(arg_dtype, n): assert dpt.all(dpt.equal(y1, y2)) assert dpt.all(dpt.equal(y1, n * m)) + + +def test_axis0_bug(): + "gh-1391" + get_queue_or_skip() + + sh = (1, 2, 3) + a = dpt.arange(sh[0] * sh[1] * sh[2], dtype="i4") + a = dpt.reshape(a, sh) + aT = dpt.permute_dims(a, (2, 1, 0)) + + s = dpt.sum(aT, axis=2) + expected = dpt.asarray([[0, 3], [1, 4], [2, 5]]) + + assert dpt.all(s == expected) From 172394fc7e75625a1d904f374f93e0ad7d3a0c6f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Sep 2023 14:53:13 -0500 Subject: [PATCH 42/44] Fixed qualifier name typo --- dpctl/tests/test_sycl_queue_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_sycl_queue_manager.py b/dpctl/tests/test_sycl_queue_manager.py index c01dca0139..28800334b8 100644 --- a/dpctl/tests/test_sycl_queue_manager.py +++ b/dpctl/tests/test_sycl_queue_manager.py @@ -1,6 +1,6 @@ # Data Parallel Control (dpctl) # -# Copyright 2020-2022 Intel Corporation +# Copyright 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -69,7 +69,7 @@ def test_is_in_device_context_inside_nested_device_ctxt_cpu(): n = cpu.max_compute_units n_half = n // 2 try: - d0, d1 = cpu.create_subdevices(partition=[n_half, n - n_half]) + d0, d1 = cpu.create_sub_devices(partition=[n_half, n - n_half]) except Exception: pytest.skip("Could not create subdevices") assert 0 == dpctl.get_num_activated_queues() From 49002cc5927b87db4d86ec49700008b899efb6cd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 6 Sep 2023 15:52:27 -0500 Subject: [PATCH 43/44] Made except pattern more specific in the test --- dpctl/tests/test_sycl_queue_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tests/test_sycl_queue_manager.py b/dpctl/tests/test_sycl_queue_manager.py index 28800334b8..d640042eb9 100644 --- a/dpctl/tests/test_sycl_queue_manager.py +++ b/dpctl/tests/test_sycl_queue_manager.py @@ -70,7 +70,7 @@ def test_is_in_device_context_inside_nested_device_ctxt_cpu(): n_half = n // 2 try: d0, d1 = cpu.create_sub_devices(partition=[n_half, n - n_half]) - except Exception: + except dpctl.SyclSubDeviceCreationError: pytest.skip("Could not create subdevices") assert 0 == dpctl.get_num_activated_queues() with dpctl.device_context(d0): From ac331bb98ab7c2fe0b51830cb0fbfc5c8e9f802f Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 7 Sep 2023 22:14:08 -0700 Subject: [PATCH 44/44] Added a test for `roll` input validation - Will cover lines missed by test suite --- dpctl/tests/test_usm_ndarray_manipulation.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py index 1cee5e6c8f..6152a15aae 100644 --- a/dpctl/tests/test_usm_ndarray_manipulation.py +++ b/dpctl/tests/test_usm_ndarray_manipulation.py @@ -648,6 +648,19 @@ def test_roll_2d(data): assert_array_equal(Ynp, dpt.asnumpy(Y)) +def test_roll_validation(): + get_queue_or_skip() + + X = dict() + with pytest.raises(TypeError): + dpt.roll(X) + + X = dpt.empty((1, 2, 3)) + shift = ((2, 3, 1), (1, 2, 3)) + with pytest.raises(ValueError): + dpt.roll(X, shift=shift, axis=(0, 1, 2)) + + def test_concat_incorrect_type(): Xnp = np.ones((2, 2)) pytest.raises(TypeError, dpt.concat)