From 369c500c0851e94c15b89ac197509b4eabd6dde2 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 16 Aug 2023 00:02:11 -0700
Subject: [PATCH 1/8] Binary elementwise functions can now act on any input
 in-place - A temporary will be allocated as necessary (i.e., when arrays
 overlap, are not going to be cast, and are not the same logical arrays) -
 Uses dedicated in-place kernels where they are implemented - Now called
 directly by Python operators - Removes _inplace method of
 BinaryElementwiseFunc class - Removes _find_inplace_dtype function

---
 dpctl/tensor/_elementwise_common.py           | 249 +++++++-----------
 dpctl/tensor/_type_utils.py                   |  18 --
 dpctl/tensor/_usmarray.pyx                    |   9 +-
 .../source/elementwise_functions.hpp          |  11 +-
 4 files changed, 110 insertions(+), 177 deletions(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index f924ee31cd..8ecd5f585e 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -31,7 +31,6 @@
     _acceptance_fn_default,
     _find_buf_dtype,
     _find_buf_dtype2,
-    _find_inplace_dtype,
     _to_device_supported_dtype,
 )
 
@@ -383,14 +382,6 @@ def __repr__(self):
         return f"<{self.__name__} '{self.name_}'>"
 
     def __call__(self, o1, o2, out=None, order="K"):
-        # FIXME: replace with check against base array
-        # when views can be identified
-        if self.binary_inplace_fn_:
-            if o1 is out:
-                return self._inplace(o1, o2)
-            elif o2 is out:
-                return self._inplace(o2, o1)
-
         if order not in ["K", "C", "F", "A"]:
             order = "K"
         q1, o1_usm_type = _get_queue_usm_type(o1)
@@ -472,6 +463,7 @@ def __call__(self, o1, o2, out=None, order="K"):
                 "supported types according to the casting rule ''safe''."
             )
 
+        orig_out = out
         if out is not None:
             if not isinstance(out, dpt.usm_ndarray):
                 raise TypeError(
@@ -484,19 +476,76 @@ def __call__(self, o1, o2, out=None, order="K"):
                     f"Expected output shape is {o1_shape}, got {out.shape}"
                 )
 
-            if ti._array_overlap(o1, out) or ti._array_overlap(o2, out):
-                raise TypeError("Input and output arrays have memory overlap")
+            if res_dt != out.dtype:
+                raise TypeError(
+                    f"Output array of type {res_dt} is needed,"
+                    f"got {out.dtype}"
+                )
 
             if (
-                dpctl.utils.get_execution_queue(
-                    (o1.sycl_queue, o2.sycl_queue, out.sycl_queue)
-                )
+                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
                 is None
             ):
                 raise TypeError(
                     "Input and output allocation queues are not compatible"
                 )
 
+            if isinstance(o1, dpt.usm_ndarray):
+                if ti._array_overlap(o1, out) and buf1_dt is None:
+                    if not ti._same_logical_tensors(o1, out):
+                        out = dpt.empty_like(out)
+                    elif self.binary_inplace_fn_ is not None:
+                        # if there is a dedicated in-place kernel
+                        # it can be called here, otherwise continues
+                        if isinstance(o2, dpt.usm_ndarray):
+                            src2 = o2
+                            if (
+                                ti._array_overlap(o2, out)
+                                and not ti._same_logical_tensors(o2, out)
+                                and buf2_dt is None
+                            ):
+                                buf2_dt = o2_dtype
+                        else:
+                            src2 = dpt.asarray(
+                                o2, dtype=o2_dtype, sycl_queue=exec_q
+                            )
+                        if buf2_dt is None:
+                            src2 = dpt.broadcast_to(src2, res_shape)
+                            ht_, _ = self.binary_inplace_fn_(
+                                lhs=o1, rhs=src2, sycl_queue=exec_q
+                            )
+                            ht_.wait()
+                        else:
+                            buf2 = dpt.empty_like(src2, dtype=buf2_dt)
+                            (
+                                ht_copy_ev,
+                                copy_ev,
+                            ) = ti._copy_usm_ndarray_into_usm_ndarray(
+                                src=src2, dst=buf2, sycl_queue=exec_q
+                            )
+
+                            buf2 = dpt.broadcast_to(buf2, res_shape)
+                            ht_, _ = self.binary_inplace_fn_(
+                                lhs=o1,
+                                rhs=buf2,
+                                sycl_queue=exec_q,
+                                depends=[copy_ev],
+                            )
+                            ht_copy_ev.wait()
+                            ht_.wait()
+
+                        return out
+
+            if isinstance(o2, dpt.usm_ndarray):
+                if (
+                    ti._array_overlap(o2, out)
+                    and not ti._same_logical_tensors(o2, out)
+                    and buf2_dt is None
+                ):
+                    # should not reach if out is reallocated
+                    # after being checked against o1
+                    out = dpt.empty_like(out)
+
         if isinstance(o1, dpt.usm_ndarray):
             src1 = o1
         else:
@@ -532,19 +581,23 @@ def __call__(self, o1, o2, out=None, order="K"):
                         sycl_queue=exec_q,
                         order=order,
                     )
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f"got {out.dtype}"
-                    )
 
             src1 = dpt.broadcast_to(src1, res_shape)
             src2 = dpt.broadcast_to(src2, res_shape)
-            ht_, _ = self.binary_fn_(
+            ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=src1, src2=src2, dst=out, sycl_queue=exec_q
             )
-            ht_.wait()
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                ht_copy_out_ev.wait()
+                out = orig_out
+            ht_binary_ev.wait()
             return out
         elif buf1_dt is None:
             if order == "K":
@@ -578,15 +631,25 @@ def __call__(self, o1, o2, out=None, order="K"):
 
             src1 = dpt.broadcast_to(src1, res_shape)
             buf2 = dpt.broadcast_to(buf2, res_shape)
-            ht_, _ = self.binary_fn_(
+            ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=src1,
                 src2=buf2,
                 dst=out,
                 sycl_queue=exec_q,
                 depends=[copy_ev],
             )
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                ht_copy_out_ev.wait()
+                out = orig_out
             ht_copy_ev.wait()
-            ht_.wait()
+            ht_binary_ev.wait()
             return out
         elif buf2_dt is None:
             if order == "K":
@@ -611,24 +674,28 @@ def __call__(self, o1, o2, out=None, order="K"):
                         sycl_queue=exec_q,
                         order=order,
                     )
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f"got {out.dtype}"
-                    )
 
             buf1 = dpt.broadcast_to(buf1, res_shape)
             src2 = dpt.broadcast_to(src2, res_shape)
-            ht_, _ = self.binary_fn_(
+            ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=buf1,
                 src2=src2,
                 dst=out,
                 sycl_queue=exec_q,
                 depends=[copy_ev],
             )
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                ht_copy_out_ev.wait()
+                out = orig_out
             ht_copy_ev.wait()
-            ht_.wait()
+            ht_binary_ev.wait()
             return out
 
         if order in ["K", "A"]:
@@ -665,11 +732,6 @@ def __call__(self, o1, o2, out=None, order="K"):
                     sycl_queue=exec_q,
                     order=order,
                 )
-        else:
-            if res_dt != out.dtype:
-                raise TypeError(
-                    f"Output array of type {res_dt} is needed, got {out.dtype}"
-                )
 
         buf1 = dpt.broadcast_to(buf1, res_shape)
         buf2 = dpt.broadcast_to(buf2, res_shape)
@@ -682,116 +744,3 @@ def __call__(self, o1, o2, out=None, order="K"):
         )
         dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_])
         return out
-
-    def _inplace(self, lhs, val):
-        if self.binary_inplace_fn_ is None:
-            raise ValueError(
-                f"In-place operation not supported for ufunc '{self.name_}'"
-            )
-        if not isinstance(lhs, dpt.usm_ndarray):
-            raise TypeError(
-                f"Expected dpctl.tensor.usm_ndarray, got {type(lhs)}"
-            )
-        q1, lhs_usm_type = _get_queue_usm_type(lhs)
-        q2, val_usm_type = _get_queue_usm_type(val)
-        if q2 is None:
-            exec_q = q1
-            usm_type = lhs_usm_type
-        else:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2))
-            if exec_q is None:
-                raise ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    lhs_usm_type,
-                    val_usm_type,
-                )
-            )
-        dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-        lhs_shape = _get_shape(lhs)
-        val_shape = _get_shape(val)
-        if not all(
-            isinstance(s, (tuple, list))
-            for s in (
-                lhs_shape,
-                val_shape,
-            )
-        ):
-            raise TypeError(
-                "Shape of arguments can not be inferred. "
-                "Arguments are expected to be "
-                "lists, tuples, or both"
-            )
-        try:
-            res_shape = _broadcast_shape_impl(
-                [
-                    lhs_shape,
-                    val_shape,
-                ]
-            )
-        except ValueError:
-            raise ValueError(
-                "operands could not be broadcast together with shapes "
-                f"{lhs_shape} and {val_shape}"
-            )
-        if res_shape != lhs_shape:
-            raise ValueError(
-                f"output shape {lhs_shape} does not match "
-                f"broadcast shape {res_shape}"
-            )
-        sycl_dev = exec_q.sycl_device
-        lhs_dtype = lhs.dtype
-        val_dtype = _get_dtype(val, sycl_dev)
-        if not _validate_dtype(val_dtype):
-            raise ValueError("Input operand of unsupported type")
-
-        lhs_dtype, val_dtype = _resolve_weak_types(
-            lhs_dtype, val_dtype, sycl_dev
-        )
-
-        buf_dt = _find_inplace_dtype(
-            lhs_dtype, val_dtype, self.result_type_resolver_fn_, sycl_dev
-        )
-
-        if buf_dt is None:
-            raise TypeError(
-                f"In-place '{self.name_}' does not support input types "
-                f"({lhs_dtype}, {val_dtype}), "
-                "and the inputs could not be safely coerced to any "
-                "supported types according to the casting rule ''safe''."
-            )
-
-        if isinstance(val, dpt.usm_ndarray):
-            rhs = val
-            overlap = ti._array_overlap(lhs, rhs)
-        else:
-            rhs = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
-            overlap = False
-
-        if buf_dt == val_dtype and overlap is False:
-            rhs = dpt.broadcast_to(rhs, res_shape)
-            ht_, _ = self.binary_inplace_fn_(
-                lhs=lhs, rhs=rhs, sycl_queue=exec_q
-            )
-            ht_.wait()
-
-        else:
-            buf = dpt.empty_like(rhs, dtype=buf_dt)
-            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=rhs, dst=buf, sycl_queue=exec_q
-            )
-
-            buf = dpt.broadcast_to(buf, res_shape)
-            ht_, _ = self.binary_inplace_fn_(
-                lhs=lhs,
-                rhs=buf,
-                sycl_queue=exec_q,
-                depends=[copy_ev],
-            )
-            ht_copy_ev.wait()
-            ht_.wait()
-
-        return lhs
diff --git a/dpctl/tensor/_type_utils.py b/dpctl/tensor/_type_utils.py
index b576764689..7f496b02fa 100644
--- a/dpctl/tensor/_type_utils.py
+++ b/dpctl/tensor/_type_utils.py
@@ -226,27 +226,9 @@ def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn):
     return None, None, None
 
 
-def _find_inplace_dtype(lhs_dtype, rhs_dtype, query_fn, sycl_dev):
-    res_dt = query_fn(lhs_dtype, rhs_dtype)
-    if res_dt and res_dt == lhs_dtype:
-        return rhs_dtype
-
-    _fp16 = sycl_dev.has_aspect_fp16
-    _fp64 = sycl_dev.has_aspect_fp64
-    all_dts = _all_data_types(_fp16, _fp64)
-    for buf_dt in all_dts:
-        if _can_cast(rhs_dtype, buf_dt, _fp16, _fp64):
-            res_dt = query_fn(lhs_dtype, buf_dt)
-            if res_dt and res_dt == lhs_dtype:
-                return buf_dt
-
-    return None
-
-
 __all__ = [
     "_find_buf_dtype",
     "_find_buf_dtype2",
-    "_find_inplace_dtype",
     "_to_device_supported_dtype",
     "_acceptance_fn_default",
     "_acceptance_fn_divide",
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index 1336063323..ebee97b5f2 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -1286,8 +1286,7 @@ cdef class usm_ndarray:
         return _dispatch_binary_elementwise2(other, "bitwise_xor", self)
 
     def __iadd__(self, other):
-        from ._elementwise_funcs import add
-        return add._inplace(self, other)
+        return dpctl.tensor.add(self, other, out=self)
 
     def __iand__(self, other):
         res = self.__and__(other)
@@ -1325,8 +1324,7 @@ cdef class usm_ndarray:
         return self
 
     def __imul__(self, other):
-        from ._elementwise_funcs import multiply
-        return multiply._inplace(self, other)
+        return dpctl.tensor.multiply(self, other, out=self)
 
     def __ior__(self, other):
         res = self.__or__(other)
@@ -1350,8 +1348,7 @@ cdef class usm_ndarray:
         return self
 
     def __isub__(self, other):
-        from ._elementwise_funcs import subtract
-        return subtract._inplace(self, other)
+        return dpctl.tensor.subtract(self, other, out=self)
 
     def __itruediv__(self, other):
         res = self.__truediv__(other)
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
index 6c3543fb76..5c4b50bbc0 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
@@ -379,9 +379,12 @@ std::pair<sycl::event, sycl::event> py_binary_ufunc(
         }
     }
 
-    // check memory overlap
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src1, dst) || overlap(src2, dst)) {
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) ||
+        (overlap(src2, dst) && !same_logical_tensors(src2, dst)))
+    {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
     // check memory overlap
@@ -678,8 +681,10 @@ py_binary_inplace_ufunc(dpctl::tensor::usm_ndarray lhs,
     }
 
     // check memory overlap
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(rhs, lhs)) {
+    if (overlap(rhs, lhs) && !same_logical_tensors(rhs, lhs)) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
     // check memory overlap

From fc9a8da63dc1c526298c598c476a6d10c49e65c7 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 16 Aug 2023 16:17:05 -0700
Subject: [PATCH 2/8] Tests for new out parameter behavior for add

---
 dpctl/tests/elementwise/test_add.py | 42 ++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py
index 8ba100d219..cd4e099064 100644
--- a/dpctl/tests/elementwise/test_add.py
+++ b/dpctl/tests/elementwise/test_add.py
@@ -381,17 +381,35 @@ def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype):
             dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype)
         ).all()
 
-        ar3 = dpt.ones(sz, dtype=op1_dtype)
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-        ar3[::-1] += ar4[::2]
+        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
+        ar3 += ar4
         assert (
             dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype)
         ).all()
-
     else:
         with pytest.raises(TypeError):
             ar1 += ar2
+            dpt.add(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.add(ar1, ar2, out=ar2)
+        assert (
+            dpt.asnumpy(ar2) == np.full(ar2.shape, 2, dtype=ar2.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
+        dpt.add(ar3, ar4, out=ar4)
+        assert (
+            dpt.asnumpy(ar4) == np.full(ar4.shape, 2, dtype=ar4.dtype)
+        ).all()
+    else:
+        with pytest.raises(TypeError):
+            dpt.add(ar1, ar2, out=ar2)
 
 
 def test_add_inplace_broadcasting():
@@ -403,6 +421,12 @@ def test_add_inplace_broadcasting():
     m += v
     assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
 
+    # check case where second arg is out
+    dpt.add(v, m, out=m)
+    assert (
+        dpt.asnumpy(m) == np.arange(10, dtype="i4")[np.newaxis, 1:10:2]
+    ).all()
+
 
 def test_add_inplace_errors():
     get_queue_or_skip()
@@ -441,7 +465,7 @@ def test_add_inplace_errors():
         ar1 += ar2
 
 
-def test_add_inplace_overlap():
+def test_add_inplace_same_tensors():
     get_queue_or_skip()
 
     ar1 = dpt.ones(10, dtype="i4")
@@ -451,7 +475,13 @@ def test_add_inplace_overlap():
     ar1 = dpt.ones(10, dtype="i4")
     ar2 = dpt.ones(10, dtype="i4")
     dpt.add(ar1, ar2, out=ar1)
+    # all ar1 vals should be 2
     assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all()
 
     dpt.add(ar2, ar1, out=ar2)
+    # all ar2 vals should be 3
     assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 3, dtype="i4")).all()
+
+    dpt.add(ar1, ar2, out=ar2)
+    # all ar2 vals should be 5
+    assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 5, dtype="i4")).all()

From 7eaab6dfc8f87bfff7809b71cf617c3f5862ad05 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 16 Aug 2023 21:49:12 -0700
Subject: [PATCH 3/8] Broadcasting made conditional in binary functions where
 memory overlap is possible - Broadcasting can change the values of strides
 without changing array shape

---
 dpctl/tensor/_elementwise_common.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 8ecd5f585e..d5485b2caf 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -510,7 +510,8 @@ def __call__(self, o1, o2, out=None, order="K"):
                                 o2, dtype=o2_dtype, sycl_queue=exec_q
                             )
                         if buf2_dt is None:
-                            src2 = dpt.broadcast_to(src2, res_shape)
+                            if src2.shape != res_shape:
+                                src2 = dpt.broadcast_to(src2, res_shape)
                             ht_, _ = self.binary_inplace_fn_(
                                 lhs=o1, rhs=src2, sycl_queue=exec_q
                             )
@@ -581,9 +582,10 @@ def __call__(self, o1, o2, out=None, order="K"):
                         sycl_queue=exec_q,
                         order=order,
                     )
-
-            src1 = dpt.broadcast_to(src1, res_shape)
-            src2 = dpt.broadcast_to(src2, res_shape)
+            if src1.shape != res_shape:
+                src1 = dpt.broadcast_to(src1, res_shape)
+            if src2.shape != res_shape:
+                src2 = dpt.broadcast_to(src2, res_shape)
             ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=src1, src2=src2, dst=out, sycl_queue=exec_q
             )
@@ -628,8 +630,8 @@ def __call__(self, o1, o2, out=None, order="K"):
                         f"Output array of type {res_dt} is needed,"
                         f"got {out.dtype}"
                     )
-
-            src1 = dpt.broadcast_to(src1, res_shape)
+            if src1.shape != res_shape:
+                src1 = dpt.broadcast_to(src1, res_shape)
             buf2 = dpt.broadcast_to(buf2, res_shape)
             ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=src1,
@@ -676,7 +678,8 @@ def __call__(self, o1, o2, out=None, order="K"):
                     )
 
             buf1 = dpt.broadcast_to(buf1, res_shape)
-            src2 = dpt.broadcast_to(src2, res_shape)
+            if src2.shape != res_shape:
+                src2 = dpt.broadcast_to(src2, res_shape)
             ht_binary_ev, binary_ev = self.binary_fn_(
                 src1=buf1,
                 src2=src2,

From 28d699c4856a9b7606c01def9d092ee4864af0c4 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 17 Aug 2023 16:29:25 -0500
Subject: [PATCH 4/8] Changed exception types raised

Use ExecutionPlacementError for CFD violations.
Use ValueError if types of input are as expected, but values are
not as expected.
---
 dpctl/tensor/_elementwise_common.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index d5485b2caf..acc216a696 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -78,8 +78,8 @@ def __call__(self, x, out=None, order="K"):
                 )
 
             if out.shape != x.shape:
-                raise TypeError(
-                    "The shape of input and output arrays are inconsistent."
+                raise ValueError(
+                    "The shape of input and output arrays are inconsistent. "
                     f"Expected output shape is {x.shape}, got {out.shape}"
                 )
 
@@ -103,7 +103,7 @@ def __call__(self, x, out=None, order="K"):
                 dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
                 is None
             ):
-                raise TypeError(
+                raise ExecutionPlacementError(
                     "Input and output allocation queues are not compatible"
                 )
 
@@ -471,8 +471,8 @@ def __call__(self, o1, o2, out=None, order="K"):
                 )
 
             if out.shape != res_shape:
-                raise TypeError(
-                    "The shape of input and output arrays are inconsistent."
+                raise ValueError(
+                    "The shape of input and output arrays are inconsistent. "
                     f"Expected output shape is {o1_shape}, got {out.shape}"
                 )
 
@@ -486,7 +486,7 @@ def __call__(self, o1, o2, out=None, order="K"):
                 dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
                 is None
             ):
-                raise TypeError(
+                raise ExecutionPlacementError(
                     "Input and output allocation queues are not compatible"
                 )
 

From 1d10f86ab3f8ce94b99c502c17a3ba088dd8da82 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 17 Aug 2023 16:30:26 -0500
Subject: [PATCH 5/8] Adding tests to improve coverage

Removed the test expecting an error to be raised when input and
output arrays overlap.
Added tests guided by coverage report.
---
 dpctl/tests/elementwise/test_add.py | 64 +++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 16 deletions(-)

diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py
index cd4e099064..891dda5252 100644
--- a/dpctl/tests/elementwise/test_add.py
+++ b/dpctl/tests/elementwise/test_add.py
@@ -175,7 +175,7 @@ def test_add_broadcasting_new_shape():
     ).all()
 
     r3 = dpt.empty_like(ar1)
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError):
         dpt.add(ar1, ar2, out=r3)
 
     ar3 = dpt.ones((6, 1), dtype="i4")
@@ -273,7 +273,7 @@ def test_add_errors():
     ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
     y = dpt.empty_like(ar1, sycl_queue=cpu_queue)
     assert_raises_regex(
-        TypeError,
+        ExecutionPlacementError,
         "Input and output allocation queues are not compatible",
         dpt.add,
         ar1,
@@ -285,7 +285,7 @@ def test_add_errors():
     ar2 = dpt.ones_like(ar1, dtype="int32")
     y = dpt.empty(3)
     assert_raises_regex(
-        TypeError,
+        ValueError,
         "The shape of input and output arrays are inconsistent",
         dpt.add,
         ar1,
@@ -293,19 +293,6 @@ def test_add_errors():
         y,
     )
 
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dpt.ones_like(ar1, dtype="int32")
-    # identical view but a different object
-    y = ar1[:]
-    assert_raises_regex(
-        TypeError,
-        "Input and output arrays have memory overlap",
-        dpt.add,
-        ar1,
-        ar2,
-        y,
-    )
-
     ar1 = np.ones(2, dtype="float32")
     ar2 = np.ones_like(ar1, dtype="int32")
     assert_raises_regex(
@@ -485,3 +472,48 @@ def test_add_inplace_same_tensors():
     dpt.add(ar1, ar2, out=ar2)
     # all ar2 vals should be 5
     assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 5, dtype="i4")).all()
+
+
+def test_add_str_repr():
+    add_s = str(dpt.add)
+    assert isinstance(add_s, str)
+    assert "add" in add_s
+
+    add_r = repr(dpt.add)
+    assert isinstance(add_r, str)
+    assert "add" in add_r
+
+
+def test_add_cfd():
+    q1 = get_queue_or_skip()
+    q2 = dpctl.SyclQueue(q1.sycl_device)
+
+    x1 = dpt.ones(10, sycl_queue=q1)
+    x2 = dpt.ones(10, sycl_queue=q2)
+    with pytest.raises(ExecutionPlacementError):
+        dpt.add(x1, x2)
+
+    with pytest.raises(ExecutionPlacementError):
+        dpt.add(x1, x1, out=x2)
+
+
+def test_add_out_type_check():
+    get_queue_or_skip()
+
+    x1 = dpt.ones(10)
+    x2 = dpt.ones(10)
+
+    out = range(10)
+
+    with pytest.raises(TypeError):
+        dpt.add(x1, x2, out=out)
+
+
+def test_add_out_need_temporary():
+    get_queue_or_skip()
+
+    x = dpt.ones(10, dtype="u4")
+
+    dpt.add(x[:6], 1, out=x[-6:])
+
+    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)

From 0017a7d248c010fc0cd606e2c377fe1c87196e63 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Thu, 17 Aug 2023 19:43:23 -0500
Subject: [PATCH 6/8] Removed provably unreachable branches in
 _resolve_weak_types

Since o1_kind_num > o2_kind_num, o1 cannot be a weak boolean
type, since that type has the lowest kind number in the hierarchy.
The same reasoning applies to o2 in the mirrored branch.
---
 dpctl/tensor/_elementwise_common.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index acc216a696..002b0ef5ec 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -301,8 +301,6 @@ def _resolve_weak_types(o1_dtype, o2_dtype, dev):
         o1_kind_num = _weak_type_num_kind(o1_dtype)
         o2_kind_num = _strong_dtype_num_kind(o2_dtype)
         if o1_kind_num > o2_kind_num:
-            if isinstance(o1_dtype, WeakBooleanType):
-                return dpt.bool, o2_dtype
             if isinstance(o1_dtype, WeakIntegralType):
                 return dpt.int64, o2_dtype
             if isinstance(o1_dtype, WeakComplexType):
@@ -322,8 +320,6 @@ def _resolve_weak_types(o1_dtype, o2_dtype, dev):
         o1_kind_num = _strong_dtype_num_kind(o1_dtype)
         o2_kind_num = _weak_type_num_kind(o2_dtype)
         if o2_kind_num > o1_kind_num:
-            if isinstance(o2_dtype, WeakBooleanType):
-                return o1_dtype, dpt.bool
             if isinstance(o2_dtype, WeakIntegralType):
                 return o1_dtype, dpt.int64
             if isinstance(o2_dtype, WeakComplexType):

From 3371abd4b0184064433f1d82ce47f05097a8c66c Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Thu, 17 Aug 2023 18:15:39 -0700
Subject: [PATCH 7/8] All in-place operators now use call operator of
 BinaryElementwiseFunc

---
 dpctl/tensor/_usmarray.pyx | 54 +++++++-------------------------------
 1 file changed, 9 insertions(+), 45 deletions(-)

diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
index ebee97b5f2..a886e0eae7 100644
--- a/dpctl/tensor/_usmarray.pyx
+++ b/dpctl/tensor/_usmarray.pyx
@@ -1289,25 +1289,13 @@ cdef class usm_ndarray:
         return dpctl.tensor.add(self, other, out=self)
 
     def __iand__(self, other):
-        res = self.__and__(other)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.bitwise_and(self, other, out=self)
 
     def __ifloordiv__(self, other):
-        res = self.__floordiv__(other)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.floor_divide(self, other, out=self)
 
     def __ilshift__(self, other):
-        res = self.__lshift__(other)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.bitwise_left_shift(self, other, out=self)
 
     def __imatmul__(self, other):
         res = self.__matmul__(other)
@@ -1317,52 +1305,28 @@ cdef class usm_ndarray:
         return self
 
     def __imod__(self, other):
-        res = self.__mod__(other)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.remainder(self, other, out=self)
 
     def __imul__(self, other):
         return dpctl.tensor.multiply(self, other, out=self)
 
     def __ior__(self, other):
-        res = self.__or__(other)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.bitwise_or(self, other, out=self)
 
     def __ipow__(self, other):
-        res = self.__pow__(other, None)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.pow(self, other, out=self)
 
     def __irshift__(self, other):
-        res = self.__rshift__(other)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.bitwise_right_shift(self, other, out=self)
 
     def __isub__(self, other):
         return dpctl.tensor.subtract(self, other, out=self)
 
     def __itruediv__(self, other):
-        res = self.__truediv__(other)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.divide(self, other, out=self)
 
     def __ixor__(self, other):
-        res = self.__xor__(other)
-        if res is NotImplemented:
-            return res
-        self.__setitem__(Ellipsis, res)
-        return self
+        return dpctl.tensor.bitwise_xor(self, other, out=self)
 
     def __str__(self):
         return usm_ndarray_str(self)

From 2a749f6e508917f1c6b0160a87648b0f4891f8de Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Fri, 18 Aug 2023 11:14:14 -0700
Subject: [PATCH 8/8] Removed some redundant and obsolete tests - Removed from
 test_floor_ceil_trunc, test_hyperbolic, test_trigonometric, and
 test_logaddexp - These tests would fail on GPU but were never run on CPU, and
 therefore did not affect coverage - These tests focused on aspects of
 the BinaryElementwiseFunc class rather than the behavior of the operator

---
 .../elementwise/test_floor_ceil_trunc.py      |  40 -------
 dpctl/tests/elementwise/test_hyperbolic.py    |  40 -------
 dpctl/tests/elementwise/test_logaddexp.py     | 106 ------------------
 dpctl/tests/elementwise/test_trigonometric.py |  40 -------
 4 files changed, 226 deletions(-)

diff --git a/dpctl/tests/elementwise/test_floor_ceil_trunc.py b/dpctl/tests/elementwise/test_floor_ceil_trunc.py
index e79e049893..211c423ee6 100644
--- a/dpctl/tests/elementwise/test_floor_ceil_trunc.py
+++ b/dpctl/tests/elementwise/test_floor_ceil_trunc.py
@@ -24,7 +24,6 @@
     assert_raises_regex,
 )
 
-import dpctl
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
@@ -92,45 +91,6 @@ def test_floor_ceil_trunc_order(np_call, dpt_call, dtype):
             assert_allclose(dpt.asnumpy(Y), expected_Y)
 
 
-@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc])
-def test_floor_ceil_trunc_errors(dpt_call):
-    get_queue_or_skip()
-    try:
-        gpu_queue = dpctl.SyclQueue("gpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('gpu') failed, skipping")
-    try:
-        cpu_queue = dpctl.SyclQueue("cpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('cpu') failed, skipping")
-
-    x = dpt.zeros(2, sycl_queue=gpu_queue)
-    y = dpt.empty_like(x, sycl_queue=cpu_queue)
-    assert_raises_regex(
-        TypeError,
-        "Input and output allocation queues are not compatible",
-        dpt_call,
-        x,
-        y,
-    )
-
-    x = dpt.zeros(2)
-    y = dpt.empty(3)
-    assert_raises_regex(
-        TypeError,
-        "The shape of input and output arrays are inconsistent",
-        dpt_call,
-        x,
-        y,
-    )
-
-    x = dpt.zeros(2, dtype="float32")
-    y = np.empty_like(x)
-    assert_raises_regex(
-        TypeError, "output array must be of usm_ndarray type", dpt_call, x, y
-    )
-
-
 @pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc])
 @pytest.mark.parametrize("dtype", _real_value_dtypes)
 def test_floor_ceil_trunc_error_dtype(dpt_call, dtype):
diff --git a/dpctl/tests/elementwise/test_hyperbolic.py b/dpctl/tests/elementwise/test_hyperbolic.py
index 2a7c3a6a53..401249443e 100644
--- a/dpctl/tests/elementwise/test_hyperbolic.py
+++ b/dpctl/tests/elementwise/test_hyperbolic.py
@@ -20,7 +20,6 @@
 import pytest
 from numpy.testing import assert_allclose, assert_raises_regex
 
-import dpctl
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
@@ -178,45 +177,6 @@ def test_hyper_order(np_call, dpt_call, dtype):
             assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
-@pytest.mark.parametrize("callable", _dpt_funcs)
-def test_hyper_errors(callable):
-    get_queue_or_skip()
-    try:
-        gpu_queue = dpctl.SyclQueue("gpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('gpu') failed, skipping")
-    try:
-        cpu_queue = dpctl.SyclQueue("cpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('cpu') failed, skipping")
-
-    x = dpt.ones(2, sycl_queue=gpu_queue)
-    y = dpt.empty_like(x, sycl_queue=cpu_queue)
-    assert_raises_regex(
-        TypeError,
-        "Input and output allocation queues are not compatible",
-        callable,
-        x,
-        y,
-    )
-
-    x = dpt.ones(2)
-    y = dpt.empty(3)
-    assert_raises_regex(
-        TypeError,
-        "The shape of input and output arrays are inconsistent",
-        callable,
-        x,
-        y,
-    )
-
-    x = dpt.ones(2, dtype="float32")
-    y = np.empty_like(x)
-    assert_raises_regex(
-        TypeError, "output array must be of usm_ndarray type", callable, x, y
-    )
-
-
 @pytest.mark.parametrize("callable", _dpt_funcs)
 @pytest.mark.parametrize("dtype", _all_dtypes)
 def test_hyper_error_dtype(callable, dtype):
diff --git a/dpctl/tests/elementwise/test_logaddexp.py b/dpctl/tests/elementwise/test_logaddexp.py
index a693389354..6e894e0843 100644
--- a/dpctl/tests/elementwise/test_logaddexp.py
+++ b/dpctl/tests/elementwise/test_logaddexp.py
@@ -23,7 +23,6 @@
 import dpctl
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-from dpctl.utils import ExecutionPlacementError
 
 from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types
 
@@ -178,111 +177,6 @@ def test_logaddexp_python_scalar(arr_dt):
         assert isinstance(R, dpt.usm_ndarray)
 
 
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_logaddexp_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.logaddexp(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_logaddexp_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.logaddexp(a, c)
-
-
-def test_logaddexp_errors():
-    get_queue_or_skip()
-    try:
-        gpu_queue = dpctl.SyclQueue("gpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('gpu') failed, skipping")
-    try:
-        cpu_queue = dpctl.SyclQueue("cpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('cpu') failed, skipping")
-
-    ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
-    ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
-    y = dpt.empty_like(ar1, sycl_queue=cpu_queue)
-    assert_raises_regex(
-        TypeError,
-        "Input and output allocation queues are not compatible",
-        dpt.logaddexp,
-        ar1,
-        ar2,
-        y,
-    )
-
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dpt.ones_like(ar1, dtype="int32")
-    y = dpt.empty(3)
-    assert_raises_regex(
-        TypeError,
-        "The shape of input and output arrays are inconsistent",
-        dpt.logaddexp,
-        ar1,
-        ar2,
-        y,
-    )
-
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dpt.ones_like(ar1, dtype="int32")
-    y = ar1
-    assert_raises_regex(
-        TypeError,
-        "Input and output arrays have memory overlap",
-        dpt.logaddexp,
-        ar1,
-        ar2,
-        y,
-    )
-
-    ar1 = np.ones(2, dtype="float32")
-    ar2 = np.ones_like(ar1, dtype="int32")
-    assert_raises_regex(
-        ExecutionPlacementError,
-        "Execution placement can not be unambiguously inferred.*",
-        dpt.logaddexp,
-        ar1,
-        ar2,
-    )
-
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dpt.ones_like(ar1, dtype="int32")
-    y = np.empty_like(ar1)
-    assert_raises_regex(
-        TypeError,
-        "output array must be of usm_ndarray type",
-        dpt.logaddexp,
-        ar1,
-        ar2,
-        y,
-    )
-
-
 @pytest.mark.parametrize("dtype", _no_complex_dtypes)
 def test_logaddexp_dtype_error(
     dtype,
diff --git a/dpctl/tests/elementwise/test_trigonometric.py b/dpctl/tests/elementwise/test_trigonometric.py
index 42c8453968..e947d9e469 100644
--- a/dpctl/tests/elementwise/test_trigonometric.py
+++ b/dpctl/tests/elementwise/test_trigonometric.py
@@ -20,7 +20,6 @@
 import pytest
 from numpy.testing import assert_allclose, assert_raises_regex
 
-import dpctl
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
@@ -175,45 +174,6 @@ def test_trig_order(np_call, dpt_call, dtype):
             assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
-@pytest.mark.parametrize("callable", _dpt_funcs)
-def test_trig_errors(callable):
-    get_queue_or_skip()
-    try:
-        gpu_queue = dpctl.SyclQueue("gpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('gpu') failed, skipping")
-    try:
-        cpu_queue = dpctl.SyclQueue("cpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('cpu') failed, skipping")
-
-    x = dpt.zeros(2, sycl_queue=gpu_queue)
-    y = dpt.empty_like(x, sycl_queue=cpu_queue)
-    assert_raises_regex(
-        TypeError,
-        "Input and output allocation queues are not compatible",
-        callable,
-        x,
-        y,
-    )
-
-    x = dpt.zeros(2)
-    y = dpt.empty(3)
-    assert_raises_regex(
-        TypeError,
-        "The shape of input and output arrays are inconsistent",
-        callable,
-        x,
-        y,
-    )
-
-    x = dpt.zeros(2, dtype="float32")
-    y = np.empty_like(x)
-    assert_raises_regex(
-        TypeError, "output array must be of usm_ndarray type", callable, x, y
-    )
-
-
 @pytest.mark.parametrize("callable", _dpt_funcs)
 @pytest.mark.parametrize("dtype", _all_dtypes)
 def test_trig_error_dtype(callable, dtype):