From 80eae6ee12152ceedf9538a80c3312d205cd200f Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 7 Aug 2023 07:39:07 -0700
Subject: [PATCH 1/5] Corrected order="K" support in copy

---
 dpctl/tensor/_copy_utils.py | 131 ++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 72 deletions(-)

diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index 6ad4f6a154..565a11dec7 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -290,78 +290,6 @@ def _copy_from_usm_ndarray_to_usm_ndarray(dst, src):
     _copy_same_shape(dst, src_same_shape)
 
 
-def copy(usm_ary, order="K"):
-    """copy(ary, order="K")
-
-    Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`.
-
-    Args:
-        ary (usm_ndarray):
-            Input array.
-        order ({"C", "F", "A", "K"}, optional):
-            Controls the memory layout of the output array.
-    Returns:
-        usm_ndarray:
-            A copy of the input array.
-
-    Memory layout of the copy is controlled by `order` keyword,
-    following NumPy's conventions. The `order` keywords can be
-    one of the following:
-
-       - "C": C-contiguous memory layout
-       - "F": Fortran-contiguous memory layout
-       - "A": Fortran-contiguous if the input array is also Fortran-contiguous,
-         otherwise C-contiguous
-       - "K": match the layout of `usm_ary` as closely as possible.
-
-    """
-    if not isinstance(usm_ary, dpt.usm_ndarray):
-        return TypeError(
-            f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
-        )
-    copy_order = "C"
-    if order == "C":
-        pass
-    elif order == "F":
-        copy_order = order
-    elif order == "A":
-        if usm_ary.flags.f_contiguous:
-            copy_order = "F"
-    elif order == "K":
-        if usm_ary.flags.f_contiguous:
-            copy_order = "F"
-    else:
-        raise ValueError(
-            "Unrecognized value of the order keyword. "
-            "Recognized values are 'A', 'C', 'F', or 'K'"
-        )
-    c_contig = usm_ary.flags.c_contiguous
-    f_contig = usm_ary.flags.f_contiguous
-    R = dpt.usm_ndarray(
-        usm_ary.shape,
-        dtype=usm_ary.dtype,
-        buffer=usm_ary.usm_type,
-        order=copy_order,
-        buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
-    )
-    if order == "K" and (not c_contig and not f_contig):
-        original_strides = usm_ary.strides
-        ind = sorted(
-            range(usm_ary.ndim),
-            key=lambda i: abs(original_strides[i]),
-            reverse=True,
-        )
-        new_strides = tuple(R.strides[ind[i]] for i in ind)
-        R = dpt.usm_ndarray(
-            usm_ary.shape,
-            dtype=usm_ary.dtype,
-            buffer=R.usm_data,
-            strides=new_strides,
-        )
-    _copy_same_shape(R, usm_ary)
-    return R
-
-
 def _empty_like_orderK(X, dt, usm_type=None, dev=None):
     """Returns empty array like `x`, using order='K'
 
@@ -452,6 +380,65 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
     return dpt.permute_dims(R, inv_perm)
 
 
+def copy(usm_ary, order="K"):
+    """copy(ary, order="K")
+
+    Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`.
+
+    Args:
+        ary (usm_ndarray):
+            Input array.
+        order ({"C", "F", "A", "K"}, optional):
+            Controls the memory layout of the output array.
+    Returns:
+        usm_ndarray:
+            A copy of the input array.
+
+    Memory layout of the copy is controlled by `order` keyword,
+    following NumPy's conventions. The `order` keywords can be
+    one of the following:
+
+       - "C": C-contiguous memory layout
+       - "F": Fortran-contiguous memory layout
+       - "A": Fortran-contiguous if the input array is also Fortran-contiguous,
+         otherwise C-contiguous
+       - "K": match the layout of `usm_ary` as closely as possible.
+
+    """
+    if not isinstance(usm_ary, dpt.usm_ndarray):
+        return TypeError(
+            f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
+        )
+    copy_order = "C"
+    if order == "C":
+        pass
+    elif order == "F":
+        copy_order = order
+    elif order == "A":
+        if usm_ary.flags.f_contiguous:
+            copy_order = "F"
+    elif order == "K":
+        if usm_ary.flags.f_contiguous:
+            copy_order = "F"
+    else:
+        raise ValueError(
+            "Unrecognized value of the order keyword. "
+            "Recognized values are 'A', 'C', 'F', or 'K'"
+        )
+    if order == "K":
+        R = _empty_like_orderK(usm_ary, usm_ary.dtype)
+    else:
+        R = dpt.usm_ndarray(
+            usm_ary.shape,
+            dtype=usm_ary.dtype,
+            buffer=usm_ary.usm_type,
+            order=copy_order,
+            buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
+        )
+    _copy_same_shape(R, usm_ary)
+    return R
+
+
 def astype(usm_ary, newdtype, order="K", casting="unsafe", copy=True):
     """ astype(array, new_dtype, order="K", casting="unsafe", \
             copy=True)

From ff1081aeb0c9555ddbd7761ef1175b562dc60272 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 7 Aug 2023 07:39:13 -0700
Subject: [PATCH 2/5] Fixed logaddexp for mixed nan and number operands

---
 .../elementwise_functions/logaddexp.hpp       | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index 39ba5c3fcf..f2d2505f2c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -28,6 +28,7 @@
 #include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -55,13 +56,14 @@ using dpctl::tensor::type_utils::vec_cast;
 
 template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 {
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = typename std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
 
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
+        if (std::isnan(in1) || std::isnan(in2)) {
+            return std::numeric_limits<resT>::quiet_NaN();
+        }
         resT max = std::max<resT>(in1, in2);
         resT min = std::min<resT>(in1, in2);
         return max + std::log1p(std::exp(min - max));
@@ -76,8 +78,13 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            resT max = std::max<resT>(in1[i], in2[i]);
-            res[i] = max + std::log1p(std::exp(std::abs(diff[i])));
+            if (std::isnan(in1[i]) || std::isnan(in2[i])) {
+                res[i] = std::numeric_limits<resT>::quiet_NaN();
+            }
+            else {
+                resT max = std::max<resT>(in1[i], in2[i]);
+                res[i] = max + std::log1p(std::exp(std::abs(diff[i])));
+            }
         }
 
         return res;

From 1b5419f19a0be887743a316170ddb7a101d8ae37 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 7 Aug 2023 10:32:29 -0700
Subject: [PATCH 3/5] logaddexp now handles both NaNs and infinities correctly
 per array API

---
 .../kernels/elementwise_functions/logaddexp.hpp      | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index f2d2505f2c..b5f48dfffb 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -61,10 +61,10 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
-        if (std::isnan(in1) || std::isnan(in2)) {
-            return std::numeric_limits<resT>::quiet_NaN();
-        }
         resT max = std::max<resT>(in1, in2);
+        if (std::isnan(max) || std::isinf(max)) {
+            return max;
+        }
         resT min = std::min<resT>(in1, in2);
         return max + std::log1p(std::exp(min - max));
     }
@@ -78,11 +78,11 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            if (std::isnan(in1[i]) || std::isnan(in2[i])) {
-                res[i] = std::numeric_limits<resT>::quiet_NaN();
+            resT max = std::max<resT>(in1[i], in2[i]);
+            if (std::isnan(max) || std::isinf(max)) {
+                res[i] = max;
             }
             else {
-                resT max = std::max<resT>(in1[i], in2[i]);
                 res[i] = max + std::log1p(std::exp(std::abs(diff[i])));
             }
         }

From 3c874338321f18f5e554074c37ef4d1a30a3539a Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 7 Aug 2023 14:56:23 -0700
Subject: [PATCH 4/5] Broke up 'or' conditional in logaddexp logic for inf and
 NaN - 'or' conditions can sometimes cause wrong results when using the OS
 compiler

---
 .../kernels/elementwise_functions/logaddexp.hpp  | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index b5f48dfffb..91fca7fcb4 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -62,8 +62,13 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
         resT max = std::max<resT>(in1, in2);
-        if (std::isnan(max) || std::isinf(max)) {
-            return max;
+        if (std::isnan(max)) {
+            return std::numeric_limits<resT>::quiet_NaN();
+        }
+        else {
+            if (std::isinf(max)) {
+                return std::numeric_limits<resT>::infinity();
+            }
         }
         resT min = std::min<resT>(in1, in2);
         return max + std::log1p(std::exp(min - max));
@@ -79,8 +84,11 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
             resT max = std::max<resT>(in1[i], in2[i]);
-            if (std::isnan(max) || std::isinf(max)) {
-                res[i] = max;
+            if (std::isnan(max)) {
+                res[i] = std::numeric_limits<resT>::quiet_NaN();
+            }
+            else if (std::isinf(max)) {
+                res[i] = std::numeric_limits<resT>::infinity();
             }
             else {
                 res[i] = max + std::log1p(std::exp(std::abs(diff[i])));

From 3c0aeedfd824d6e8a4d665dd70d51e6cc65676d4 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 7 Aug 2023 11:42:10 -0500
Subject: [PATCH 5/5] Modularized logic implementing logaddexp

If both arguments are -inf, the result is also -inf.
---
 .../elementwise_functions/logaddexp.hpp       | 41 +++++++++----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index 91fca7fcb4..02375d5313 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -61,17 +61,7 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
-        resT max = std::max<resT>(in1, in2);
-        if (std::isnan(max)) {
-            return std::numeric_limits<resT>::quiet_NaN();
-        }
-        else {
-            if (std::isinf(max)) {
-                return std::numeric_limits<resT>::infinity();
-            }
-        }
-        resT min = std::min<resT>(in1, in2);
-        return max + std::log1p(std::exp(min - max));
+        return impl<resT>(in1, in2);
     }
 
     template <int vec_sz>
@@ -83,20 +73,29 @@ template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
 
 #pragma unroll
         for (int i = 0; i < vec_sz; ++i) {
-            resT max = std::max<resT>(in1[i], in2[i]);
-            if (std::isnan(max)) {
-                res[i] = std::numeric_limits<resT>::quiet_NaN();
-            }
-            else if (std::isinf(max)) {
-                res[i] = std::numeric_limits<resT>::infinity();
-            }
-            else {
-                res[i] = max + std::log1p(std::exp(std::abs(diff[i])));
-            }
+            res[i] = impl<resT>(in1[i], in2[i]);
         }
 
         return res;
     }
+
+private:
+    template <typename T> T impl(T const &in1, T const &in2)
+    {
+        T max = std::max<T>(in1, in2);
+        if (std::isnan(max)) {
+            return std::numeric_limits<T>::quiet_NaN();
+        }
+        else {
+            if (std::isinf(max)) {
+                // if both args are -inf, and hence max is -inf
+                // the result is -inf as well
+                return max;
+            }
+        }
+        T min = std::min<T>(in1, in2);
+        return max + std::log1p(std::exp(min - max));
+    }
 };
 
 template <typename argT1,