From c5fe3146cc3fcdd153273d70458a397593c57ce0 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 2 Aug 2023 16:17:01 -0500
Subject: [PATCH 01/26] Changed behavior of __array_namespace__

usm_ndarray constructor's array_namespace argument's default value of None
is now interpreted to mean that usm_ndarray.__array_namespace__ returns
dpctl.tensor, rather than None.

This directly affects how Python operator special methods of usm_ndarray
behave. They would now look up corresponding attribute in dpctl.tensor
namespace.
---
 dpctl/tensor/_usmarray.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 5b1bd5f6a3..1e8cf8d433 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -884,7 +884,7 @@ cdef class usm_ndarray:
         Returns array namespace, member functions of which
         implement data API.
         """
-        return self.array_namespace_
+        return self.array_namespace_ if self.array_namespace_ is not None else dpctl.tensor
 
     def __bool__(self):
         if self.size == 1:

From 4555bf1a3acf25c40f9c38137e4d3304b7724489 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 2 Aug 2023 16:22:19 -0500
Subject: [PATCH 02/26] Corrected text of exception message

---
 dpctl/tensor/_usmarray.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 1e8cf8d433..1667c174e7 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -898,7 +898,7 @@ cdef class usm_ndarray:
 
         raise ValueError(
             "The truth value of an array with more than one element is "
-            "ambiguous. Use a.any() or a.all()"
+            "ambiguous. Use dpctl.tensor.any() or dpctl.tensor.all()"
         )
 
     def __float__(self):

From 964db08258b43b3dafb182f45555c5ee21adfb86 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 2 Aug 2023 16:22:45 -0500
Subject: [PATCH 03/26] Corrected operator true_divide with divide

---
 dpctl/tensor/_usmarray.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 1667c174e7..0dc4b3395f 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -1202,9 +1202,9 @@ cdef class usm_ndarray:
     def __truediv__(first, other):
         "See comment in __add__"
         if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "true_divide", other)
+            return _dispatch_binary_elementwise(first, "divide", other)
         elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "true_divide", other)
+            return _dispatch_binary_elementwise2(first, "divide", other)
         return NotImplemented
 
     def __xor__(first, other):
@@ -1249,7 +1249,7 @@ cdef class usm_ndarray:
         return _dispatch_binary_elementwise2(other, "subtract", self)
 
     def __rtruediv__(self, other):
-        return _dispatch_binary_elementwise2(other, "true_divide", self)
+        return _dispatch_binary_elementwise2(other, "divide", self)
 
     def __rxor__(self, other):
         return _dispatch_binary_elementwise2(other, "logical_xor", self)

From 78aa99b4598c0164c3d6fc5421d113e7390362ec Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 2 Aug 2023 19:48:44 -0500
Subject: [PATCH 04/26] Fixed test per change in dpctl implementation

---
 dpctl/tests/test_usm_ndarray_operators.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dpctl/tests/test_usm_ndarray_operators.py b/dpctl/tests/test_usm_ndarray_operators.py
index 92dc38bd47..def22562c0 100644
--- a/dpctl/tests/test_usm_ndarray_operators.py
+++ b/dpctl/tests/test_usm_ndarray_operators.py
@@ -47,7 +47,7 @@ def multiply(a, b):
             return b
 
 
-@pytest.mark.parametrize("namespace", [None, Dummy()])
+@pytest.mark.parametrize("namespace", [dpt, Dummy()])
 def test_fp_ops(namespace):
     try:
         X = dpt.ones(1)
@@ -81,7 +81,7 @@ def test_fp_ops(namespace):
     X.__ifloordiv__(1.0)
 
 
-@pytest.mark.parametrize("namespace", [None, Dummy()])
+@pytest.mark.parametrize("namespace", [dpt, Dummy()])
 def test_int_ops(namespace):
     try:
         X = dpt.usm_ndarray(1, "i4")
@@ -113,7 +113,7 @@ def test_int_ops(namespace):
     X.__ipow__(2)
 
 
-@pytest.mark.parametrize("namespace", [None, Dummy()])
+@pytest.mark.parametrize("namespace", [dpt, Dummy()])
 def test_mat_ops(namespace):
     try:
         M = dpt.eye(3, 3)

From 51bf9e1283d9b5e553a4c283d87be50e5efc1ae0 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 3 Aug 2023 09:20:53 -0500
Subject: [PATCH 05/26] Fixed _slice_len for vacuous slices

```
In [1]: dpctl.tensor._usmarray as ua

ua._basic_slice_meta((0, slice(1, 0, None)), (4, 4), (4, 1), 0)
```

The issue was discovered by array API conformance tests.
---
 dpctl/tensor/_slicing.pxi | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi
index 361dd906c3..259248b323 100644
--- a/dpctl/tensor/_slicing.pxi
+++ b/dpctl/tensor/_slicing.pxi
@@ -33,9 +33,13 @@ cdef Py_ssize_t _slice_len(
     if sl_start == sl_stop:
         return 0
     if sl_step > 0:
+        if sl_start > sl_stop:
+            return 0
         # 1 + argmax k such htat sl_start + sl_step*k < sl_stop
         return 1 + ((sl_stop - sl_start - 1) // sl_step)
     else:
+        if sl_start < sl_stop:
+            return 0
         return 1 + ((sl_stop - sl_start + 1) // sl_step)
 
 
@@ -236,6 +240,7 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
                     new_offset = new_offset + sl_start * strides[k]
                 if sh_i == 0:
                     is_empty = True
+                    new_offset = offset
                 k = k_new
             elif _is_boolean(ind_i):
                 new_shape.append(1 if ind_i else 0)

From f4a31bb4f63edbd07dd0603f695d5c9c23f7cb36 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 3 Aug 2023 11:56:46 -0500
Subject: [PATCH 06/26] Fixed issue discovered by array API tests

```
import dpctl.tensor as dpt
x = dpt.empy((0,2), dtype='i4')
x[Ellipsis, 1]
     returns empty 1d array
```
---
 dpctl/tensor/_slicing.pxi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi
index 259248b323..4f45d57391 100644
--- a/dpctl/tensor/_slicing.pxi
+++ b/dpctl/tensor/_slicing.pxi
@@ -225,6 +225,9 @@ def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
                 k_new = k + ellipses_count
                 new_shape.extend(shape[k:k_new])
                 new_strides.extend(strides[k:k_new])
+                if any(dim == 0 for dim in shape[k:k_new]):
+                    is_empty = True
+                    new_offset = offset
                 k = k_new
             elif ind_i is None:
                 new_shape.append(1)

From 05aa952de6acae3c088dfc44b82678eabea68d8d Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Thu, 3 Aug 2023 10:51:16 -0700
Subject: [PATCH 07/26] Corrected remaining operators in _usmarray.pyx -
 "power" corrected to "pow, "mod" corrected to "remainder" - logical functions
 corrected to bitwise

---
 dpctl/tensor/_usmarray.pyx | 42 +++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 0dc4b3395f..3a04f0f3d5 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -957,9 +957,9 @@ cdef class usm_ndarray:
     def __and__(first, other):
         "See comment in __add__"
         if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "logical_and", other)
+            return _dispatch_binary_elementwise(first, "bitwise_and", other)
         elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "logical_and", other)
+            return _dispatch_binary_elementwise2(first, "bitwise_and", other)
         return NotImplemented
 
     def __dlpack__(self, stream=None):
@@ -1037,9 +1037,9 @@ cdef class usm_ndarray:
     def __lshift__(first, other):
         "See comment in __add__"
         if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "left_shift", other)
+            return _dispatch_binary_elementwise(first, "bitwise_left_shift", other)
         elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "left_shift", other)
+            return _dispatch_binary_elementwise2(first, "bitwise_left_shift", other)
         return NotImplemented
 
     def __lt__(self, other):
@@ -1056,9 +1056,9 @@ cdef class usm_ndarray:
     def __mod__(first, other):
         "See comment in __add__"
         if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "mod", other)
+            return _dispatch_binary_elementwise(first, "remainder", other)
         elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "mod", other)
+            return _dispatch_binary_elementwise2(first, "remainder", other)
         return NotImplemented
 
     def __mul__(first, other):
@@ -1078,9 +1078,9 @@ cdef class usm_ndarray:
     def __or__(first, other):
         "See comment in __add__"
         if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "logical_or", other)
+            return _dispatch_binary_elementwise(first, "bitwise_or", other)
         elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "logical_or", other)
+            return _dispatch_binary_elementwise2(first, "bitwise_or", other)
         return NotImplemented
 
     def __pos__(self):
@@ -1090,17 +1090,17 @@ cdef class usm_ndarray:
         "See comment in __add__"
         if mod is None:
             if isinstance(first, usm_ndarray):
-                return _dispatch_binary_elementwise(first, "power", other)
+                return _dispatch_binary_elementwise(first, "pow", other)
             elif isinstance(other, usm_ndarray):
-                return _dispatch_binary_elementwise(first, "power", other)
+                return _dispatch_binary_elementwise(first, "pow", other)
         return NotImplemented
 
     def __rshift__(first, other):
         "See comment in __add__"
         if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "right_shift", other)
+            return _dispatch_binary_elementwise(first, "bitwise_right_shift", other)
         elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "right_shift", other)
+            return _dispatch_binary_elementwise2(first, "bitwise_right_shift", other)
         return NotImplemented
 
     def __setitem__(self, key, rhs):
@@ -1210,40 +1210,40 @@ cdef class usm_ndarray:
     def __xor__(first, other):
         "See comment in __add__"
         if isinstance(first, usm_ndarray):
-            return _dispatch_binary_elementwise(first, "logical_xor", other)
+            return _dispatch_binary_elementwise(first, "bitwise_xor", other)
         elif isinstance(other, usm_ndarray):
-            return _dispatch_binary_elementwise2(first, "logical_xor", other)
+            return _dispatch_binary_elementwise2(first, "bitwise_xor", other)
         return NotImplemented
 
     def __radd__(self, other):
         return _dispatch_binary_elementwise(self, "add", other)
 
     def __rand__(self, other):
-        return _dispatch_binary_elementwise(self, "logical_and", other)
+        return _dispatch_binary_elementwise(self, "bitwise_and", other)
 
     def __rfloordiv__(self, other):
         return _dispatch_binary_elementwise2(other, "floor_divide", self)
 
     def __rlshift__(self, other):
-        return _dispatch_binary_elementwise2(other, "left_shift", self)
+        return _dispatch_binary_elementwise2(other, "bitwise_left_shift", self)
 
     def __rmatmul__(self, other):
         return _dispatch_binary_elementwise2(other, "matmul", self)
 
     def __rmod__(self, other):
-        return _dispatch_binary_elementwise2(other, "mod", self)
+        return _dispatch_binary_elementwise2(other, "remainder", self)
 
     def __rmul__(self, other):
         return _dispatch_binary_elementwise(self, "multiply", other)
 
     def __ror__(self, other):
-        return _dispatch_binary_elementwise(self, "logical_or", other)
+        return _dispatch_binary_elementwise(self, "bitwise_or", other)
 
     def __rpow__(self, other):
-        return _dispatch_binary_elementwise2(other, "power", self)
+        return _dispatch_binary_elementwise2(other, "pow", self)
 
     def __rrshift__(self, other):
-        return _dispatch_binary_elementwise2(other, "right_shift", self)
+        return _dispatch_binary_elementwise2(other, "bitwise_right_shift", self)
 
     def __rsub__(self, other):
         return _dispatch_binary_elementwise2(other, "subtract", self)
@@ -1252,7 +1252,7 @@ cdef class usm_ndarray:
         return _dispatch_binary_elementwise2(other, "divide", self)
 
     def __rxor__(self, other):
-        return _dispatch_binary_elementwise2(other, "logical_xor", self)
+        return _dispatch_binary_elementwise2(other, "bitwise_xor", self)
 
     def __iadd__(self, other):
         from ._elementwise_funcs import add

From 25547e75e1d3604fbbfcb1abfd6745fa667a122e Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 3 Aug 2023 14:53:34 -0500
Subject: [PATCH 08/26] Fixed bug in contiguity flag computation found by
 array-api-tests

```
import dpctl.tensor as dpt
import dpctl.tensor._usmarray as ua

x = dpt.asarray([[0, 0], [1, 0]], dtype='int8')
key = (None, 0, slice(1, None, -1))

meta = ua._basic_slice_meta(key, x.shape, x.strides, x._element_offset)

y = dpt.usm_ndarray.__new__(dpt.usm_ndarray, meta[0], dtype=x.dtype, strides=meta[1], buffer=x.usm_data, offset=meta[2])

print(y.flags)
```

Both 'C' and 'F' flags of the created array used to be set, now both
are correctly unset.
---
 dpctl/tensor/_stride_utils.pxi | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/dpctl/tensor/_stride_utils.pxi b/dpctl/tensor/_stride_utils.pxi
index 896c31e65a..ea59ec5402 100644
--- a/dpctl/tensor/_stride_utils.pxi
+++ b/dpctl/tensor/_stride_utils.pxi
@@ -64,6 +64,8 @@ cdef int _from_input_shape_strides(
     cdef int j
     cdef bint all_incr = 1
     cdef bint all_decr = 1
+    cdef bint all_incr_modified = 0
+    cdef bint all_decr_modified = 0
     cdef Py_ssize_t elem_count = 1
     cdef Py_ssize_t min_shift = 0
     cdef Py_ssize_t max_shift = 0
@@ -166,12 +168,14 @@ cdef int _from_input_shape_strides(
                     j = j + 1
                 if j < nd:
                     if all_incr:
+                        all_incr_modified = 1
                         all_incr = (
                             (strides_arr[i] > 0) and
                             (strides_arr[j] > 0) and
                             (strides_arr[i] <= strides_arr[j])
                         )
                     if all_decr:
+                        all_decr_modified = 1
                         all_decr = (
                             (strides_arr[i] > 0) and
                             (strides_arr[j] > 0) and
@@ -180,6 +184,10 @@ cdef int _from_input_shape_strides(
                     i = j
                 else:
                     break
+            # should only set contig flags on actually obtained
+            # values, rather than default values
+            all_incr = all_incr and all_incr_modified
+            all_decr = all_decr and all_decr_modified
             if all_incr and all_decr:
                 contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
             elif all_incr:

From 22c95b6207ae12863192c52857004411c237aa8a Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Thu, 3 Aug 2023 13:37:39 -0700
Subject: [PATCH 09/26] Fixed flags test case for changes to contiguity flag
 computation - Incorrect test logic was being hidden by the bug

---
 dpctl/tests/test_usm_ndarray_ctor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index 5772968d64..604ed9d87f 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -84,7 +84,7 @@ def test_usm_ndarray_flags():
     assert f.forc
     assert f.fnc
 
-    f = dpt.usm_ndarray((5, 1, 1), dtype="i4", strides=(1, 0, 1)).flags
+    f = dpt.usm_ndarray((5, 0, 1), dtype="i4", strides=(1, 0, 1)).flags
     assert f.fc
     assert f.forc
     assert not dpt.usm_ndarray(

From 4803f130ce8204c1e960e043c7f9773edf26e1c9 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 3 Aug 2023 14:57:59 -0500
Subject: [PATCH 10/26] unpacked chained method calls

---
 dpctl/tensor/_usmarray.pyx | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 3a04f0f3d5..65c18d7a37 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -889,7 +889,9 @@ cdef class usm_ndarray:
     def __bool__(self):
         if self.size == 1:
             mem_view = dpmem.as_usm_memory(self)
-            return mem_view.copy_to_host().view(self.dtype).__bool__()
+            host_buf = mem_view.copy_to_host()
+            view = host_buf.view(self.dtype)
+            return view.__bool__()
 
         if self.size == 0:
             raise ValueError(
@@ -904,7 +906,9 @@ cdef class usm_ndarray:
     def __float__(self):
         if self.size == 1:
             mem_view = dpmem.as_usm_memory(self)
-            return mem_view.copy_to_host().view(self.dtype).__float__()
+            host_buf = mem_view.copy_to_host()
+            view = host_buf.view(self.dtype)
+            return view.__float__()
 
         raise ValueError(
             "only size-1 arrays can be converted to Python scalars"
@@ -913,7 +917,9 @@ cdef class usm_ndarray:
     def __complex__(self):
         if self.size == 1:
             mem_view = dpmem.as_usm_memory(self)
-            return mem_view.copy_to_host().view(self.dtype).__complex__()
+            host_buf = mem_view.copy_to_host()
+            view = host_buf.view(self.dtype)
+            return view.__complex__()
 
         raise ValueError(
             "only size-1 arrays can be converted to Python scalars"
@@ -922,7 +928,9 @@ cdef class usm_ndarray:
     def __int__(self):
         if self.size == 1:
             mem_view = dpmem.as_usm_memory(self)
-            return mem_view.copy_to_host().view(self.dtype).__int__()
+            host_buf = mem_view.copy_to_host()
+            view = host_buf.view(self.dtype)
+            return view.__int__()
 
         raise ValueError(
             "only size-1 arrays can be converted to Python scalars"

From cc08b5d9e187d31d3797d256583a70b24b28db9f Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sat, 5 Aug 2023 21:33:58 -0500
Subject: [PATCH 11/26] Fixed Cython warning

---
 dpctl/_sycl_queue.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd
index 729adfc3cb..c906ada4d6 100644
--- a/dpctl/_sycl_queue.pxd
+++ b/dpctl/_sycl_queue.pxd
@@ -29,7 +29,7 @@ from ._sycl_event cimport SyclEvent
 from .program._program cimport SyclKernel
 
 
-cdef void default_async_error_handler(int) nogil except *
+cdef void default_async_error_handler(int) except * nogil
 
 cdef public api class _SyclQueue [
     object Py_SyclQueueObject, type Py_SyclQueueType

From 5c1a961afee913e675c92a7ecc7d15ab308f61d8 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sat, 5 Aug 2023 21:34:52 -0500
Subject: [PATCH 12/26] Fixed array API test failure by adding validation

---
 dpctl/tensor/_copy_utils.py |  2 ++
 dpctl/tensor/_usmarray.pyx  | 19 ++++++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index e759571790..c6cbd2a173 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -492,6 +492,8 @@ def _extract_impl(ary, ary_mask, axis=0):
     dst = dpt.empty(
         dst_shape, dtype=ary.dtype, usm_type=ary.usm_type, device=ary.device
     )
+    if dst.size == 0:
+        return dst
     hev, _ = ti._extract(
         src=ary,
         cumsum=cumsum,
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 65c18d7a37..7ebf81657c 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -764,6 +764,8 @@ cdef class usm_ndarray:
             ind, (<object>self).shape, (<object> self).strides,
             self.get_offset())
         cdef usm_ndarray res
+        cdef int i = 0
+        cdef bint matching = 1
 
         if len(_meta) < 5:
             raise RuntimeError
@@ -787,7 +789,20 @@ cdef class usm_ndarray:
 
         from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index
         if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
-            return _extract_impl(res, adv_ind[0], axis=adv_ind_start_p)
+            key_ = adv_ind[0]
+            adv_ind_end_p = key_.ndim + adv_ind_start_p
+            if adv_ind_end_p > res.ndim:
+                raise IndexError("too many indices for the array")
+            key_shape = key_.shape
+            arr_shape = res.shape[adv_ind_start_p:adv_ind_end_p]
+            for i in range(key_.ndim):
+                if matching:
+                    if not key_shape[i] == arr_shape[i] and key_shape[i] > 0:
+                        matching = 0
+            if not matching:
+                raise IndexError("boolean index did not match indexed array in dimensions")
+            res = _extract_impl(res, key_, axis=adv_ind_start_p)
+            return res
 
         if any(ind.dtype == dpt_bool for ind in adv_ind):
             adv_ind_int = list()
@@ -1152,6 +1167,8 @@ cdef class usm_ndarray:
         if adv_ind_start_p < 0:
             # basic slicing
             if isinstance(rhs, usm_ndarray):
+                if Xv.size == 0:
+                    return
                 _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs)
             else:
                 if hasattr(rhs, "__sycl_usm_array_interface__"):

From 07faf2b666bd6b3eaed49649bba4202140e8c741 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sun, 6 Aug 2023 08:13:41 -0500
Subject: [PATCH 13/26] Use bitwise_invert for __invert__

---
 dpctl/tensor/_usmarray.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 7ebf81657c..57dd1e47f0 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -1046,7 +1046,7 @@ cdef class usm_ndarray:
         return _dispatch_binary_elementwise(self, "greater", other)
 
     def __invert__(self):
-        return _dispatch_unary_elementwise(self, "invert")
+        return _dispatch_unary_elementwise(self, "bitwise_invert")
 
     def __le__(self, other):
         return _dispatch_binary_elementwise(self, "less_equal", other)

From 3ddf51c0498f9d1139b1cb550f6b58de8eb76eb4 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sun, 6 Aug 2023 08:14:21 -0500
Subject: [PATCH 14/26] Corrected order='K' support in astype

Array API tests pointed out an error in implementation of order='K'
in dpctl.tensor.astype. Moved _empty_like_orderK and fried from
_type_utils to _copy_utils and used it to implement astype.

Modified import statement in _elementwise_common where _empty_like*
are used.
---
 dpctl/tensor/_copy_utils.py                | 114 ++++++++++++++++++---
 dpctl/tensor/_elementwise_common.py        |   3 +-
 dpctl/tensor/_type_utils.py                |  94 -----------------
 dpctl/tests/elementwise/test_type_utils.py |   9 +-
 4 files changed, 103 insertions(+), 117 deletions(-)

diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index c6cbd2a173..6ad4f6a154 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -13,6 +13,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import builtins
 import operator
 
 import numpy as np
@@ -361,6 +362,96 @@ def copy(usm_ary, order="K"):
     return R
 
 
+def _empty_like_orderK(X, dt, usm_type=None, dev=None):
+    """Returns empty array like `x`, using order='K'
+
+    For an array `x` that was obtained by permutation of a contiguous
+    array the returned array will have the same shape and the same
+    strides as `x`.
+    """
+    if not isinstance(X, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(X)}")
+    if usm_type is None:
+        usm_type = X.usm_type
+    if dev is None:
+        dev = X.device
+    fl = X.flags
+    if fl["C"] or X.size <= 1:
+        return dpt.empty_like(
+            X, dtype=dt, usm_type=usm_type, device=dev, order="C"
+        )
+    elif fl["F"]:
+        return dpt.empty_like(
+            X, dtype=dt, usm_type=usm_type, device=dev, order="F"
+        )
+    st = list(X.strides)
+    perm = sorted(
+        range(X.ndim), key=lambda d: builtins.abs(st[d]), reverse=True
+    )
+    inv_perm = sorted(range(X.ndim), key=lambda i: perm[i])
+    st_sorted = [st[i] for i in perm]
+    sh = X.shape
+    sh_sorted = tuple(sh[i] for i in perm)
+    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
+    if min(st_sorted) < 0:
+        sl = tuple(
+            slice(None, None, -1)
+            if st_sorted[i] < 0
+            else slice(None, None, None)
+            for i in range(X.ndim)
+        )
+        R = R[sl]
+    return dpt.permute_dims(R, inv_perm)
+
+
+def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
+    if not isinstance(X1, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(X1)}")
+    if not isinstance(X2, dpt.usm_ndarray):
+        raise TypeError(f"Expected usm_ndarray, got {type(X2)}")
+    nd1 = X1.ndim
+    nd2 = X2.ndim
+    if nd1 > nd2 and X1.shape == res_shape:
+        return _empty_like_orderK(X1, dt, usm_type, dev)
+    elif nd1 < nd2 and X2.shape == res_shape:
+        return _empty_like_orderK(X2, dt, usm_type, dev)
+    fl1 = X1.flags
+    fl2 = X2.flags
+    if fl1["C"] or fl2["C"]:
+        return dpt.empty(
+            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
+        )
+    if fl1["F"] and fl2["F"]:
+        return dpt.empty(
+            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
+        )
+    st1 = list(X1.strides)
+    st2 = list(X2.strides)
+    max_ndim = max(nd1, nd2)
+    st1 += [0] * (max_ndim - len(st1))
+    st2 += [0] * (max_ndim - len(st2))
+    perm = sorted(
+        range(max_ndim),
+        key=lambda d: (builtins.abs(st1[d]), builtins.abs(st2[d])),
+        reverse=True,
+    )
+    inv_perm = sorted(range(max_ndim), key=lambda i: perm[i])
+    st1_sorted = [st1[i] for i in perm]
+    st2_sorted = [st2[i] for i in perm]
+    sh = res_shape
+    sh_sorted = tuple(sh[i] for i in perm)
+    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
+    if max(min(st1_sorted), min(st2_sorted)) < 0:
+        sl = tuple(
+            slice(None, None, -1)
+            if (st1_sorted[i] < 0 and st2_sorted[i] < 0)
+            else slice(None, None, None)
+            for i in range(nd1)
+        )
+        R = R[sl]
+    return dpt.permute_dims(R, inv_perm)
+
+
 def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
     """ astype(array, new_dtype, order="K", casting="unsafe", \
             copy=True)
@@ -432,26 +523,15 @@ def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
             "Unrecognized value of the order keyword. "
             "Recognized values are 'A', 'C', 'F', or 'K'"
         )
-    R = dpt.usm_ndarray(
-        usm_ary.shape,
-        dtype=target_dtype,
-        buffer=usm_ary.usm_type,
-        order=copy_order,
-        buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
-    )
-    if order == "K" and (not c_contig and not f_contig):
-        original_strides = usm_ary.strides
-        ind = sorted(
-            range(usm_ary.ndim),
-            key=lambda i: abs(original_strides[i]),
-            reverse=True,
-        )
-        new_strides = tuple(R.strides[ind[i]] for i in ind)
+    if order == "K":
+        R = _empty_like_orderK(usm_ary, target_dtype)
+    else:
         R = dpt.usm_ndarray(
             usm_ary.shape,
             dtype=target_dtype,
-            buffer=R.usm_data,
-            strides=new_strides,
+            buffer=usm_ary.usm_type,
+            order=copy_order,
+            buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
         )
     _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary)
     return R
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 78c79fb2ad..f924ee31cd 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -26,10 +26,9 @@
 from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer
 from dpctl.utils import ExecutionPlacementError
 
+from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
 from ._type_utils import (
     _acceptance_fn_default,
-    _empty_like_orderK,
-    _empty_like_pair_orderK,
     _find_buf_dtype,
     _find_buf_dtype2,
     _find_inplace_dtype,
diff --git a/dpctl/tensor/_type_utils.py b/dpctl/tensor/_type_utils.py
index fb2223f292..b576764689 100644
--- a/dpctl/tensor/_type_utils.py
+++ b/dpctl/tensor/_type_utils.py
@@ -14,8 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import builtins
-
 import dpctl.tensor as dpt
 import dpctl.tensor._tensor_impl as ti
 
@@ -116,96 +114,6 @@ def _can_cast(from_: dpt.dtype, to_: dpt.dtype, _fp16: bool, _fp64: bool):
     return can_cast_v
 
 
-def _empty_like_orderK(X, dt, usm_type=None, dev=None):
-    """Returns empty array like `x`, using order='K'
-
-    For an array `x` that was obtained by permutation of a contiguous
-    array the returned array will have the same shape and the same
-    strides as `x`.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(X)}")
-    if usm_type is None:
-        usm_type = X.usm_type
-    if dev is None:
-        dev = X.device
-    fl = X.flags
-    if fl["C"] or X.size <= 1:
-        return dpt.empty_like(
-            X, dtype=dt, usm_type=usm_type, device=dev, order="C"
-        )
-    elif fl["F"]:
-        return dpt.empty_like(
-            X, dtype=dt, usm_type=usm_type, device=dev, order="F"
-        )
-    st = list(X.strides)
-    perm = sorted(
-        range(X.ndim), key=lambda d: builtins.abs(st[d]), reverse=True
-    )
-    inv_perm = sorted(range(X.ndim), key=lambda i: perm[i])
-    st_sorted = [st[i] for i in perm]
-    sh = X.shape
-    sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
-    if min(st_sorted) < 0:
-        sl = tuple(
-            slice(None, None, -1)
-            if st_sorted[i] < 0
-            else slice(None, None, None)
-            for i in range(X.ndim)
-        )
-        R = R[sl]
-    return dpt.permute_dims(R, inv_perm)
-
-
-def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
-    if not isinstance(X1, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(X1)}")
-    if not isinstance(X2, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(X2)}")
-    nd1 = X1.ndim
-    nd2 = X2.ndim
-    if nd1 > nd2 and X1.shape == res_shape:
-        return _empty_like_orderK(X1, dt, usm_type, dev)
-    elif nd1 < nd2 and X2.shape == res_shape:
-        return _empty_like_orderK(X2, dt, usm_type, dev)
-    fl1 = X1.flags
-    fl2 = X2.flags
-    if fl1["C"] or fl2["C"]:
-        return dpt.empty(
-            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
-        )
-    if fl1["F"] and fl2["F"]:
-        return dpt.empty(
-            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
-        )
-    st1 = list(X1.strides)
-    st2 = list(X2.strides)
-    max_ndim = max(nd1, nd2)
-    st1 += [0] * (max_ndim - len(st1))
-    st2 += [0] * (max_ndim - len(st2))
-    perm = sorted(
-        range(max_ndim),
-        key=lambda d: (builtins.abs(st1[d]), builtins.abs(st2[d])),
-        reverse=True,
-    )
-    inv_perm = sorted(range(max_ndim), key=lambda i: perm[i])
-    st1_sorted = [st1[i] for i in perm]
-    st2_sorted = [st2[i] for i in perm]
-    sh = res_shape
-    sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
-    if max(min(st1_sorted), min(st2_sorted)) < 0:
-        sl = tuple(
-            slice(None, None, -1)
-            if (st1_sorted[i] < 0 and st2_sorted[i] < 0)
-            else slice(None, None, None)
-            for i in range(nd1)
-        )
-        R = R[sl]
-    return dpt.permute_dims(R, inv_perm)
-
-
 def _to_device_supported_dtype(dt, dev):
     has_fp16 = dev.has_aspect_fp16
     has_fp64 = dev.has_aspect_fp64
@@ -339,8 +247,6 @@ def _find_inplace_dtype(lhs_dtype, rhs_dtype, query_fn, sycl_dev):
     "_find_buf_dtype",
     "_find_buf_dtype2",
     "_find_inplace_dtype",
-    "_empty_like_orderK",
-    "_empty_like_pair_orderK",
     "_to_device_supported_dtype",
     "_acceptance_fn_default",
     "_acceptance_fn_divide",
diff --git a/dpctl/tests/elementwise/test_type_utils.py b/dpctl/tests/elementwise/test_type_utils.py
index 403e455c2e..2ca9c547d4 100644
--- a/dpctl/tests/elementwise/test_type_utils.py
+++ b/dpctl/tests/elementwise/test_type_utils.py
@@ -18,6 +18,7 @@
 
 import dpctl
 import dpctl.tensor as dpt
+import dpctl.tensor._copy_utils as cu
 import dpctl.tensor._type_utils as tu
 
 from .utils import _all_dtypes, _map_to_device_dtype
@@ -73,15 +74,15 @@ def test_type_utils_empty_like_orderK():
         a = dpt.empty((10, 10), dtype=dpt.int32, order="F")
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
-    X = tu._empty_like_orderK(a, dpt.int32, a.usm_type, a.device)
+    X = cu._empty_like_orderK(a, dpt.int32, a.usm_type, a.device)
     assert X.flags["F"]
 
 
 def test_type_utils_empty_like_orderK_invalid_args():
     with pytest.raises(TypeError):
-        tu._empty_like_orderK([1, 2, 3], dpt.int32, "device", None)
+        cu._empty_like_orderK([1, 2, 3], dpt.int32, "device", None)
     with pytest.raises(TypeError):
-        tu._empty_like_pair_orderK(
+        cu._empty_like_pair_orderK(
             [1, 2, 3],
             (
                 1,
@@ -98,7 +99,7 @@ def test_type_utils_empty_like_orderK_invalid_args():
     except dpctl.SyclDeviceCreationError:
         pytest.skip("No SYCL devices available")
     with pytest.raises(TypeError):
-        tu._empty_like_pair_orderK(
+        cu._empty_like_pair_orderK(
             a,
             (
                 1,

From 51e3f15d76aa0721566f26b9a4f6448deb79b4aa Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sun, 6 Aug 2023 20:12:30 -0500
Subject: [PATCH 15/26] Moved 2 tests from test_type_utils to
 elementwise/test_type_utils

The dpctl/tests/test_type_utils.py has been removed.
---
 dpctl/tests/elementwise/test_type_utils.py | 44 ++++++++++++++
 dpctl/tests/test_type_utils.py             | 68 ----------------------
 2 files changed, 44 insertions(+), 68 deletions(-)
 delete mode 100644 dpctl/tests/test_type_utils.py

diff --git a/dpctl/tests/elementwise/test_type_utils.py b/dpctl/tests/elementwise/test_type_utils.py
index 2ca9c547d4..f6e09b9b4e 100644
--- a/dpctl/tests/elementwise/test_type_utils.py
+++ b/dpctl/tests/elementwise/test_type_utils.py
@@ -186,3 +186,47 @@ def test_binary_func_arg_validation():
     with pytest.raises(ValueError):
         dpt.add(a, Ellipsis)
     dpt.add(a, a, order="invalid")
+
+
+def test_all_data_types():
+    fp16_fp64_types = set([dpt.float16, dpt.float64, dpt.complex128])
+    fp64_types = set([dpt.float64, dpt.complex128])
+
+    all_dts = tu._all_data_types(True, True)
+    assert fp16_fp64_types.issubset(all_dts)
+
+    all_dts = tu._all_data_types(True, False)
+    assert dpt.float16 in all_dts
+    assert not fp64_types.issubset(all_dts)
+
+    all_dts = tu._all_data_types(False, True)
+    assert dpt.float16 not in all_dts
+    assert fp64_types.issubset(all_dts)
+
+    all_dts = tu._all_data_types(False, False)
+    assert not fp16_fp64_types.issubset(all_dts)
+
+
+@pytest.mark.parametrize("fp16", [True, False])
+@pytest.mark.parametrize("fp64", [True, False])
+def test_maximal_inexact_types(fp16, fp64):
+    assert not tu._is_maximal_inexact_type(dpt.int32, fp16, fp64)
+    assert fp64 == tu._is_maximal_inexact_type(dpt.float64, fp16, fp64)
+    assert fp64 == tu._is_maximal_inexact_type(dpt.complex128, fp16, fp64)
+    assert fp64 != tu._is_maximal_inexact_type(dpt.float32, fp16, fp64)
+    assert fp64 != tu._is_maximal_inexact_type(dpt.complex64, fp16, fp64)
+
+
+def test_can_cast_device():
+    assert tu._can_cast(dpt.int64, dpt.float64, True, True)
+    # if f8 is available, can't cast i8 to f4
+    assert not tu._can_cast(dpt.int64, dpt.float32, True, True)
+    assert not tu._can_cast(dpt.int64, dpt.float32, False, True)
+    # should be able to cast to f8 when f2 unavailable
+    assert tu._can_cast(dpt.int64, dpt.float64, False, True)
+    # casting to f4 acceptable when f8 unavailable
+    assert tu._can_cast(dpt.int64, dpt.float32, True, False)
+    assert tu._can_cast(dpt.int64, dpt.float32, False, False)
+    # can't safely cast inexact type to inexact type of lesser precision
+    assert not tu._can_cast(dpt.float32, dpt.float16, True, False)
+    assert not tu._can_cast(dpt.float64, dpt.float32, False, True)
diff --git a/dpctl/tests/test_type_utils.py b/dpctl/tests/test_type_utils.py
deleted file mode 100644
index 882478a2ce..0000000000
--- a/dpctl/tests/test_type_utils.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import (
-    _all_data_types,
-    _can_cast,
-    _is_maximal_inexact_type,
-)
-
-
-def test_all_data_types():
-    fp16_fp64_types = set([dpt.float16, dpt.float64, dpt.complex128])
-    fp64_types = set([dpt.float64, dpt.complex128])
-
-    all_dts = _all_data_types(True, True)
-    assert fp16_fp64_types.issubset(all_dts)
-
-    all_dts = _all_data_types(True, False)
-    assert dpt.float16 in all_dts
-    assert not fp64_types.issubset(all_dts)
-
-    all_dts = _all_data_types(False, True)
-    assert dpt.float16 not in all_dts
-    assert fp64_types.issubset(all_dts)
-
-    all_dts = _all_data_types(False, False)
-    assert not fp16_fp64_types.issubset(all_dts)
-
-
-@pytest.mark.parametrize("fp16", [True, False])
-@pytest.mark.parametrize("fp64", [True, False])
-def test_maximal_inexact_types(fp16, fp64):
-    assert not _is_maximal_inexact_type(dpt.int32, fp16, fp64)
-    assert fp64 == _is_maximal_inexact_type(dpt.float64, fp16, fp64)
-    assert fp64 == _is_maximal_inexact_type(dpt.complex128, fp16, fp64)
-    assert fp64 != _is_maximal_inexact_type(dpt.float32, fp16, fp64)
-    assert fp64 != _is_maximal_inexact_type(dpt.complex64, fp16, fp64)
-
-
-def test_can_cast_device():
-    assert _can_cast(dpt.int64, dpt.float64, True, True)
-    # if f8 is available, can't cast i8 to f4
-    assert not _can_cast(dpt.int64, dpt.float32, True, True)
-    assert not _can_cast(dpt.int64, dpt.float32, False, True)
-    # should be able to cast to f8 when f2 unavailable
-    assert _can_cast(dpt.int64, dpt.float64, False, True)
-    # casting to f4 acceptable when f8 unavailable
-    assert _can_cast(dpt.int64, dpt.float32, True, False)
-    assert _can_cast(dpt.int64, dpt.float32, False, False)
-    # can't safely cast inexact type to inexact type of lesser precision
-    assert not _can_cast(dpt.float32, dpt.float16, True, False)
-    assert not _can_cast(dpt.float64, dpt.float32, False, True)

From e5785ca51b1518d78b50d3395d8d9efc7fbef35a Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sun, 6 Aug 2023 20:14:00 -0500
Subject: [PATCH 16/26] Fixed bug in concat uncovered by array API tests

```
import dpctl.tensor as dpt
x1 = dpt.full(tuple(), 77, dtype='u2')
x2 = dpt.zeros(2, dtype='uint8')[dpt.newaxis, :]

dpt.concat((x1, x2), axis=None)
```

The reason the exception was raised is that
_copy_usm_ndarray_for_reshape which is used in the
implementation of concat with axis=None requires
both source and destination to have the same
data-type.
---
 dpctl/tensor/_manipulation_functions.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py
index 7b066417af..4d5de3fb0f 100644
--- a/dpctl/tensor/_manipulation_functions.py
+++ b/dpctl/tensor/_manipulation_functions.py
@@ -554,8 +554,13 @@ def _concat_axis_None(arrays):
                 sycl_queue=exec_q,
             )
         else:
+            src_ = array
+            # _copy_usm_ndarray_for_reshape requires src and dst to have
+            # the same data type
+            if not array.dtype == res_dtype:
+                src_ = dpt.astype(src_, res_dtype)
             hev, _ = ti._copy_usm_ndarray_for_reshape(
-                src=array,
+                src=src_,
                 dst=res[fill_start:fill_end],
                 shift=0,
                 sycl_queue=exec_q,

From a3c00bc3ed9af0fc13f0331e526072690fb70575 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sun, 6 Aug 2023 22:11:35 -0500
Subject: [PATCH 17/26] Closes gh-1325

```

In [1]: import dpctl.tensor as dpt

In [2]: a = dpt.arange(10, dtype='int64')
   ...: b = dpt.arange(10, dtype='float32')

In [3]: dpt.concat((a,b))
Out[3]:
usm_ndarray([0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 0., 1., 2., 3., 4.,
             5., 6., 7., 8., 9.], dtype=float32)

In [4]: _.sycl_device.name
Out[4]: 'Intel(R) Graphics [0x9a49]'
```
---
 dpctl/tensor/_manipulation_functions.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py
index 4d5de3fb0f..26c1ab60cf 100644
--- a/dpctl/tensor/_manipulation_functions.py
+++ b/dpctl/tensor/_manipulation_functions.py
@@ -25,6 +25,8 @@
 import dpctl.tensor._tensor_impl as ti
 import dpctl.utils as dputils
 
+from ._type_utils import _to_device_supported_dtype
+
 __doc__ = (
     "Implementation module for array manipulation "
     "functions in :module:`dpctl.tensor`"
@@ -504,8 +506,10 @@ def _arrays_validation(arrays, check_ndim=True):
     _supported_dtype(Xi.dtype for Xi in arrays)
 
     res_dtype = X0.dtype
+    dev = exec_q.sycl_device
     for i in range(1, n):
         res_dtype = np.promote_types(res_dtype, arrays[i])
+        res_dtype = _to_device_supported_dtype(res_dtype, dev)
 
     if check_ndim:
         for i in range(1, n):

From 80eae6ee12152ceedf9538a80c3312d205cd200f Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 7 Aug 2023 07:39:07 -0700
Subject: [PATCH 18/26] Corrected order="K" support in copy

---
 dpctl/tensor/_copy_utils.py | 131 ++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 72 deletions(-)

diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index 6ad4f6a154..565a11dec7 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -290,78 +290,6 @@ def _copy_from_usm_ndarray_to_usm_ndarray(dst, src):
     _copy_same_shape(dst, src_same_shape)
 
 
-def copy(usm_ary, order="K"):
-    """copy(ary, order="K")
-
-    Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`.
-
-    Args:
-        ary (usm_ndarray):
-            Input array.
-        order ({"C", "F", "A", "K"}, optional):
-            Controls the memory layout of the output array.
-    Returns:
-        usm_ndarray:
-            A copy of the input array.
-
-    Memory layout of the copy is controlled by `order` keyword,
-    following NumPy's conventions. The `order` keywords can be
-    one of the following:
-
-       - "C": C-contiguous memory layout
-       - "F": Fortran-contiguous memory layout
-       - "A": Fortran-contiguous if the input array is also Fortran-contiguous,
-         otherwise C-contiguous
-       - "K": match the layout of `usm_ary` as closely as possible.
-
-    """
-    if not isinstance(usm_ary, dpt.usm_ndarray):
-        return TypeError(
-            f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
-        )
-    copy_order = "C"
-    if order == "C":
-        pass
-    elif order == "F":
-        copy_order = order
-    elif order == "A":
-        if usm_ary.flags.f_contiguous:
-            copy_order = "F"
-    elif order == "K":
-        if usm_ary.flags.f_contiguous:
-            copy_order = "F"
-    else:
-        raise ValueError(
-            "Unrecognized value of the order keyword. "
-            "Recognized values are 'A', 'C', 'F', or 'K'"
-        )
-    c_contig = usm_ary.flags.c_contiguous
-    f_contig = usm_ary.flags.f_contiguous
-    R = dpt.usm_ndarray(
-        usm_ary.shape,
-        dtype=usm_ary.dtype,
-        buffer=usm_ary.usm_type,
-        order=copy_order,
-        buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
-    )
-    if order == "K" and (not c_contig and not f_contig):
-        original_strides = usm_ary.strides
-        ind = sorted(
-            range(usm_ary.ndim),
-            key=lambda i: abs(original_strides[i]),
-            reverse=True,
-        )
-        new_strides = tuple(R.strides[ind[i]] for i in ind)
-        R = dpt.usm_ndarray(
-            usm_ary.shape,
-            dtype=usm_ary.dtype,
-            buffer=R.usm_data,
-            strides=new_strides,
-        )
-    _copy_same_shape(R, usm_ary)
-    return R
-
-
 def _empty_like_orderK(X, dt, usm_type=None, dev=None):
     """Returns empty array like `x`, using order='K'
 
@@ -452,6 +380,65 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
     return dpt.permute_dims(R, inv_perm)
 
 
+def copy(usm_ary, order="K"):
+    """copy(ary, order="K")
+
+    Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`.
+
+    Args:
+        ary (usm_ndarray):
+            Input array.
+        order ({"C", "F", "A", "K"}, optional):
+            Controls the memory layout of the output array.
+    Returns:
+        usm_ndarray:
+            A copy of the input array.
+
+    Memory layout of the copy is controlled by `order` keyword,
+    following NumPy's conventions. The `order` keywords can be
+    one of the following:
+
+       - "C": C-contiguous memory layout
+       - "F": Fortran-contiguous memory layout
+       - "A": Fortran-contiguous if the input array is also Fortran-contiguous,
+         otherwise C-contiguous
+       - "K": match the layout of `usm_ary` as closely as possible.
+
+    """
+    if not isinstance(usm_ary, dpt.usm_ndarray):
+        return TypeError(
+            f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
+        )
+    copy_order = "C"
+    if order == "C":
+        pass
+    elif order == "F":
+        copy_order = order
+    elif order == "A":
+        if usm_ary.flags.f_contiguous:
+            copy_order = "F"
+    elif order == "K":
+        if usm_ary.flags.f_contiguous:
+            copy_order = "F"
+    else:
+        raise ValueError(
+            "Unrecognized value of the order keyword. "
+            "Recognized values are 'A', 'C', 'F', or 'K'"
+        )
+    if order == "K":
+        R = _empty_like_orderK(usm_ary, usm_ary.dtype)
+    else:
+        R = dpt.usm_ndarray(
+            usm_ary.shape,
+            dtype=usm_ary.dtype,
+            buffer=usm_ary.usm_type,
+            order=copy_order,
+            buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
+        )
+    _copy_same_shape(R, usm_ary)
+    return R
+
+
 def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
     """ astype(array, new_dtype, order="K", casting="unsafe", \
             copy=True)

From ff1081aeb0c9555ddbd7761ef1175b562dc60272 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 7 Aug 2023 07:39:13 -0700
Subject: [PATCH 19/26] Fixed logaddexp for mixed nan and number operands

---
 .../elementwise_functions/logaddexp.hpp       | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index 39ba5c3fcf..f2d2505f2c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -28,6 +28,7 @@
 #include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -55,13 +56,14 @@ using dpctl::tensor::type_utils::vec_cast;
 
 template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 {
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = typename std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
 
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
+        if (std::isnan(in1) || std::isnan(in2)) {
+            return std::numeric_limits<resT>::quiet_NaN();
+        }
         resT max = std::max<resT>(in1, in2);
         resT min = std::min<resT>(in1, in2);
         return max + std::log1p(std::exp(min - max));
@@ -76,8 +78,13 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            resT max = std::max<resT>(in1[i], in2[i]);
-            res[i] = max + std::log1p(std::exp(std::abs(diff[i])));
+            if (std::isnan(in1[i]) || std::isnan(in2[i])) {
+                res[i] = std::numeric_limits<resT>::quiet_NaN();
+            }
+            else {
+                resT max = std::max<resT>(in1[i], in2[i]);
+                res[i] = max + std::log1p(std::exp(std::abs(diff[i])));
+            }
         }
 
         return res;

From 1b5419f19a0be887743a316170ddb7a101d8ae37 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 7 Aug 2023 10:32:29 -0700
Subject: [PATCH 20/26] logaddexp now handles both NaNs and infinities
 correctly per array API

---
 .../kernels/elementwise_functions/logaddexp.hpp      | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index f2d2505f2c..b5f48dfffb 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -61,10 +61,10 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
-        if (std::isnan(in1) || std::isnan(in2)) {
-            return std::numeric_limits<resT>::quiet_NaN();
-        }
         resT max = std::max<resT>(in1, in2);
+        if (std::isnan(max) || std::isinf(max)) {
+            return max;
+        }
         resT min = std::min<resT>(in1, in2);
         return max + std::log1p(std::exp(min - max));
     }
@@ -78,11 +78,11 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            if (std::isnan(in1[i]) || std::isnan(in2[i])) {
-                res[i] = std::numeric_limits<resT>::quiet_NaN();
+            resT max = std::max<resT>(in1[i], in2[i]);
+            if (std::isnan(max) || std::isinf(max)) {
+                res[i] = max;
             }
             else {
-                resT max = std::max<resT>(in1[i], in2[i]);
                 res[i] = max + std::log1p(std::exp(std::abs(diff[i])));
             }
         }

From 3c874338321f18f5e554074c37ef4d1a30a3539a Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 7 Aug 2023 14:56:23 -0700
Subject: [PATCH 21/26] Broke up 'or' conditional in logaddexp logic for inf
 and NaN - 'or' conditions can sometimes cause wrong results when using the OS
 compiler

---
 .../kernels/elementwise_functions/logaddexp.hpp  | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index b5f48dfffb..91fca7fcb4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -62,8 +62,13 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
         resT max = std::max<resT>(in1, in2);
-        if (std::isnan(max) || std::isinf(max)) {
-            return max;
+        if (std::isnan(max)) {
+            return std::numeric_limits<resT>::quiet_NaN();
+        }
+        else {
+            if (std::isinf(max)) {
+                return std::numeric_limits<resT>::infinity();
+            }
         }
         resT min = std::min<resT>(in1, in2);
         return max + std::log1p(std::exp(min - max));
@@ -79,8 +84,11 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
             resT max = std::max<resT>(in1[i], in2[i]);
-            if (std::isnan(max) || std::isinf(max)) {
-                res[i] = max;
+            if (std::isnan(max)) {
+                res[i] = std::numeric_limits<resT>::quiet_NaN();
+            }
+            else if (std::isinf(max)) {
+                res[i] = std::numeric_limits<resT>::infinity();
             }
             else {
                 res[i] = max + std::log1p(std::exp(std::abs(diff[i])));

From 3c0aeedfd824d6e8a4d665dd70d51e6cc65676d4 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 7 Aug 2023 11:42:10 -0500
Subject: [PATCH 22/26] Modularized logic implementing logaddexp

If both arguments are -inf, the result is also -inf.
---
 .../elementwise_functions/logaddexp.hpp       | 41 +++++++++----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index 91fca7fcb4..02375d5313 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -61,17 +61,7 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
-        resT max = std::max<resT>(in1, in2);
-        if (std::isnan(max)) {
-            return std::numeric_limits<resT>::quiet_NaN();
-        }
-        else {
-            if (std::isinf(max)) {
-                return std::numeric_limits<resT>::infinity();
-            }
-        }
-        resT min = std::min<resT>(in1, in2);
-        return max + std::log1p(std::exp(min - max));
+        return impl<resT>(in1, in2);
     }
 
     template <int vec_sz>
@@ -83,20 +73,29 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            resT max = std::max<resT>(in1[i], in2[i]);
-            if (std::isnan(max)) {
-                res[i] = std::numeric_limits<resT>::quiet_NaN();
-            }
-            else if (std::isinf(max)) {
-                res[i] = std::numeric_limits<resT>::infinity();
-            }
-            else {
-                res[i] = max + std::log1p(std::exp(std::abs(diff[i])));
-            }
+            res[i] = impl<resT>(in1[i], in2[i]);
         }
 
         return res;
     }
+
+private:
+    template <typename T> T impl(T const &in1, T const &in2)
+    {
+        T max = std::max<T>(in1, in2);
+        if (std::isnan(max)) {
+            return std::numeric_limits<T>::quiet_NaN();
+        }
+        else {
+            if (std::isinf(max)) {
+                // if both args are -inf, and hence max is -inf
+                // the result is -inf as well
+                return max;
+            }
+        }
+        T min = std::min<T>(in1, in2);
+        return max + std::log1p(std::exp(min - max));
+    }
 };
 
 template <typename argT1,

From 7c94a336e0994be73c36561e08a113a41b9d4bc6 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 7 Aug 2023 02:13:02 -0500
Subject: [PATCH 23/26] Simplified flags_ computation in to_device method

Use `self.flags_` directly, instead of getting it via temporary
Flags object.
---
 dpctl/tensor/_usmarray.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 57dd1e47f0..20a226bd3e 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -872,7 +872,7 @@ cdef class usm_ndarray:
                 strides=self.strides,
                 offset=self.get_offset()
             )
-            res.flags_ = self.flags.flags
+            res.flags_ = self.flags_
             return res
         else:
             nbytes = self.usm_data.nbytes
@@ -887,7 +887,7 @@ cdef class usm_ndarray:
                 strides=self.strides,
                 offset=self.get_offset()
             )
-            res.flags_ = self.flags.flags
+            res.flags_ = self.flags_
             return res
 
     def _set_namespace(self, mod):

From cf7d9bf11d3e269d8b43ba128a9c709a58065fa1 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 7 Aug 2023 02:14:35 -0500
Subject: [PATCH 24/26] Change to test_complex_special_cases

The change is to suppress RuntimeWarning arising from within
assert_allclose utility.
---
 dpctl/tests/elementwise/test_complex.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/dpctl/tests/elementwise/test_complex.py b/dpctl/tests/elementwise/test_complex.py
index 684b405612..85d3c0ad2e 100644
--- a/dpctl/tests/elementwise/test_complex.py
+++ b/dpctl/tests/elementwise/test_complex.py
@@ -15,6 +15,7 @@
 #  limitations under the License.
 
 import itertools
+import warnings
 
 import numpy as np
 import pytest
@@ -203,12 +204,18 @@ def test_complex_special_cases(dtype):
     Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
 
     tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(dpt.real(Xc)), np.real(Xc_np), atol=tol, rtol=tol
-    )
-    assert_allclose(
-        dpt.asnumpy(dpt.imag(Xc)), np.imag(Xc_np), atol=tol, rtol=tol
-    )
-    assert_allclose(
-        dpt.asnumpy(dpt.conj(Xc)), np.conj(Xc_np), atol=tol, rtol=tol
-    )
+
+    actual = dpt.real(Xc)
+    expected = np.real(Xc_np)
+    assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)
+
+    actual = dpt.imag(Xc)
+    expected = np.imag(Xc_np)
+    assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)
+
+    actual = dpt.conj(Xc)
+    expected = np.conj(Xc_np)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)

From 8343edc268e95837ea686f7b7050bffed4f02a08 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 7 Aug 2023 18:29:30 -0500
Subject: [PATCH 25/26] Array-API conformance testing can start as soon as
 build_linux jobs finished

---
 .github/workflows/conda-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 74e387f91a..f717f4b309 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -486,7 +486,7 @@ jobs:
           done
 
   array-api-conformity:
-    needs: test_linux
+    needs: build_linux
     runs-on:  ${{ matrix.runner }}
 
     strategy:

From ebd1fafe53a86f647bd78d015290360508cb10d3 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 7 Aug 2023 23:21:09 -0500
Subject: [PATCH 26/26] Fixed log-add-exp per review feedback

---
 .../elementwise_functions/logaddexp.hpp       | 37 +++++++++++++------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index 02375d5313..b718a5f991 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -69,11 +69,17 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
                                        const sycl::vec<argT2, vec_sz> &in2)
     {
         sycl::vec<resT, vec_sz> res;
-        auto diff = in1 - in2;
+        auto diff = in1 - in2; // take advantange of faster vec arithmetic
 
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            res[i] = impl<resT>(in1[i], in2[i]);
+            if (std::isfinite(diff[i])) {
+                res[i] = std::max<resT>(in1[i], in2[i]) +
+                         impl_finite<resT>(-std::abs(diff[i]));
+            }
+            else {
+                res[i] = impl<resT>(in1[i], in2[i]);
+            }
         }
 
         return res;
@@ -82,19 +88,28 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 private:
     template <typename T> T impl(T const &in1, T const &in2)
     {
-        T max = std::max<T>(in1, in2);
-        if (std::isnan(max)) {
-            return std::numeric_limits<T>::quiet_NaN();
+        if (in1 == in2) { // handle signed infinities
+            const T log2 = std::log(T(2));
+            return in1 + log2;
         }
         else {
-            if (std::isinf(max)) {
-                // if both args are -inf, and hence max is -inf
-                // the result is -inf as well
-                return max;
+            const T tmp = in1 - in2;
+            if (tmp > 0) {
+                return in1 + std::log1p(std::exp(-tmp));
+            }
+            else if (tmp <= 0) {
+                return in2 + std::log1p(std::exp(tmp));
+            }
+            else {
+                return std::numeric_limits<T>::quiet_NaN();
             }
         }
-        T min = std::min<T>(in1, in2);
-        return max + std::log1p(std::exp(min - max));
+    }
+
+    template <typename T> T impl_finite(T const &in)
+    {
+        return (in > 0) ? (in + std::log1p(std::exp(-in)))
+                        : std::log1p(std::exp(in));
     }
 };