From 224455b636fc7fd1603d34471c3e86eaf56a72f9 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 4 Jul 2023 01:55:11 -0700 Subject: [PATCH 1/8] Implements logaddexp and hypot --- dpctl/tensor/__init__.py | 4 + dpctl/tensor/_elementwise_funcs.py | 59 +++- .../kernels/elementwise_functions/hypot.hpp | 215 +++++++++++++ .../elementwise_functions/logaddexp.hpp | 221 +++++++++++++ .../source/elementwise_functions.cpp | 155 ++++++++- dpctl/tests/elementwise/test_hypot.py | 193 ++++++++++++ dpctl/tests/elementwise/test_logaddexp.py | 298 ++++++++++++++++++ 7 files changed, 1140 insertions(+), 5 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp create mode 100644 dpctl/tests/elementwise/test_hypot.py create mode 100644 dpctl/tests/elementwise/test_logaddexp.py

diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index a0a1a0b4df..fe77645389 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -105,6 +105,7 @@ floor_divide, greater, greater_equal, + hypot, imag, isfinite, isinf, @@ -115,6 +116,7 @@ log1p, log2, log10, + logaddexp, logical_and, logical_not, logical_or, @@ -222,6 +224,7 @@ "floor_divide", "greater", "greater_equal", + "hypot", "imag", "isfinite", "isinf", @@ -241,6 +244,7 @@ "not_equal", "positive", "pow", + "logaddexp", "proj", "real", "sin", diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 06a5080a5e..e15add335c 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -661,7 +661,32 @@ ) # B15: ==== LOGADDEXP (x1, x2) -# FIXME: implement B15 +_logaddexp_docstring_ = """
+logaddexp(x1, x2, out=None, order='K')
+
+Calculates the natural logarithm of the sum of exponentials, `log(exp(x1_i) + exp(x2_i))`, for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`.
+
+Args:
+ x1 (usm_ndarray):
+ First input array, expected to have numeric data type.
+ x2 (usm_ndarray):
+ Second input array, also expected to have numeric data type.
+ out ({None, usm_ndarray}, optional):
+ Output array to populate.
+ Array must have the correct shape and the expected data type.
+ order ("C","F","A","K", optional):
+ Memory layout of the newly output array, if parameter `out` is `None`.
+ Default: "K".
+Returns:
+ usm_ndarray:
+ An array containing the result of element-wise `logaddexp`. The data type
+ of the returned array is determined by the Type Promotion Rules.
+"""
+
+logaddexp = BinaryElementwiseFunc(
+ "logaddexp", ti._logaddexp_result_type, ti._logaddexp, _logaddexp_docstring_
+) # B16: ==== LOGICAL_AND (x1, x2) _logical_and_docstring_ = """ @@ -1094,12 +1119,40 @@ order ("C","F","A","K", optional): Memory layout of the newly output array, if parameter `out` is `None`. Default: "K".
+Returns:
+ usm_ndarray:
+ An array containing the element-wise truncated value of the input array.
+ The returned array has the same data type as `x`.
+"""
+trunc = UnaryElementwiseFunc(
+ "trunc", ti._trunc_result_type, ti._trunc, _trunc_docstring
+)
+
+
+# B24: ==== HYPOT (x1, x2)
+_hypot_docstring_ = """
+hypot(x1, x2, out=None, order='K')
+
+Calculates the square root of the sum of squares for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`.
+
+Args:
+ x1 (usm_ndarray):
+ First input array, expected to have numeric data type.
+ x2 (usm_ndarray):
+ Second input array, also expected to have numeric data type.
+ out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". Returns: usm_narray: An array containing the element-wise truncated value of input array. The returned array has the same data type as `x`. """ -trunc = UnaryElementwiseFunc( - "trunc", ti._trunc_result_type, ti._trunc, _trunc_docstring +hypot = BinaryElementwiseFunc( + "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_ ) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp new file mode 100644 index 0000000000..00e8327559 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -0,0 +1,215 @@ +//=== HYPOT.hpp - Binary function HYPOT ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of HYPOT(x1, x2) +/// function. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace hypot +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct HypotFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return std::hypot(in1, in2); + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto res = sycl::hypot(in1, in2); + if constexpr (std::is_same_v) { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + res); + } + } +}; + +template +using HypotContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using HypotStridedFunctor = + elementwise_common::BinaryStridedFunctor>; + +template struct HypotOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class hypot_contig_kernel; + +template +sycl::event hypot_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, HypotOutputType, HypotContigFunctor, + hypot_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct HypotContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename HypotOutputType::value_type, void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = hypot_contig_impl; + return fn; + } + } +}; + +template struct HypotTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::hypot(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename HypotOutputType::value_type; + ; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class hypot_strided_strided_kernel; + +template +sycl::event +hypot_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, HypotOutputType, HypotStridedFunctor, + hypot_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, additional_depends); +} + +template struct HypotStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename HypotOutputType::value_type, void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = hypot_strided_impl; + return fn; + } + } +}; + +} // namespace hypot +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp new file mode 100644 index 0000000000..c39aec1877 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -0,0 +1,221 @@ +//=== logaddexp.hpp - Binary function LOGADDEXP ------ +//*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of LOGADDEXP(x1, x2) +/// function. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace logaddexp +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +using dpctl::tensor::type_utils::is_complex; +using dpctl::tensor::type_utils::vec_cast; + +template struct LogAddExpFunctor +{ + using supports_sg_loadstore = typename std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = typename std::negation< + std::disjunction, tu_ns::is_complex>>; + + resT operator()(const argT1 &in1, const argT2 &in2) + { + return std::log(std::exp(in1) + std::exp(in2)); + } + + template + sycl::vec operator()(const sycl::vec &in1, + const sycl::vec &in2) + { + auto res = sycl::log(sycl::exp(in1) + sycl::exp(in2)); + if constexpr (std::is_same_v) { + return res; + } + else { + return vec_cast( + res); + } + } +}; + +template +using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor< + argT1, + argT2, + resT, + LogAddExpFunctor, + vec_sz, + n_vecs>; + +template +using LogAddExpStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + LogAddExpFunctor>; + +template struct LogAddExpOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class logaddexp_contig_kernel; + +template +sycl::event logaddexp_contig_impl(sycl::queue exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor, + logaddexp_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct LogAddExpContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename LogAddExpOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logaddexp_contig_impl; + return fn; + } + } +}; + +template struct LogAddExpTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::logaddexp(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename LogAddExpOutputType::value_type; + ; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class logaddexp_strided_strided_kernel; + +template +sycl::event +logaddexp_strided_impl(sycl::queue exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &logaddexpitional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, LogAddExpOutputType, LogAddExpStridedFunctor, + logaddexp_strided_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends, logaddexpitional_depends); +} + +template struct LogAddExpStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename LogAddExpOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = logaddexp_strided_impl; + return fn; + } + } +}; + +template +class logaddexp_matrix_row_broadcast_sg_krn; + +} // namespace logaddexp +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp index 7246236d37..1eab41e3af 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp @@ -44,6 +44,7 @@ #include "kernels/elementwise_functions/floor_divide.hpp" #include "kernels/elementwise_functions/greater.hpp" #include "kernels/elementwise_functions/greater_equal.hpp" +#include "kernels/elementwise_functions/hypot.hpp" #include "kernels/elementwise_functions/imag.hpp" #include "kernels/elementwise_functions/isfinite.hpp" #include "kernels/elementwise_functions/isinf.hpp" @@ -54,6 +55,7 @@ #include "kernels/elementwise_functions/log10.hpp" #include "kernels/elementwise_functions/log1p.hpp" #include "kernels/elementwise_functions/log2.hpp" +#include "kernels/elementwise_functions/logaddexp.hpp" #include "kernels/elementwise_functions/logical_and.hpp" #include "kernels/elementwise_functions/logical_not.hpp" #include "kernels/elementwise_functions/logical_or.hpp" @@ -1149,7 +1151,39 @@ void populate_log10_dispatch_vectors(void) // B15: ==== LOGADDEXP (x1, x2) namespace impl { -// FIXME: add code for B15 +namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp; + +static binary_contig_impl_fn_ptr_t + logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_logaddexp_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = logaddexp_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LogAddExpTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(logaddexp_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LogAddExpStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LogAddExpContigFactory; + 
DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table); +}; } // namespace impl // B16: ==== LOGICAL_AND (x1, x2) @@ -1895,6 +1929,46 @@ void populate_trunc_dispatch_vectors(void) } // namespace impl +// B24: ==== HYPOT (x1, x2) + +namespace impl +{ +namespace hypot_fn_ns = dpctl::tensor::kernels::hypot; + +static binary_contig_impl_fn_ptr_t + hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int hypot_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_hypot_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = hypot_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::HypotTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(hypot_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::HypotStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(hypot_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::HypotContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(hypot_contig_dispatch_table); +}; + +} // namespace impl + // ========================================================================================== // // @@ -2638,7 +2712,45 @@ void init_elementwise_functions(py::module_ m) } // B15: ==== LOGADDEXP (x1, x2) - // FIXME: + { + impl::populate_logaddexp_dispatch_tables(); + using impl::logaddexp_contig_dispatch_table; + using impl::logaddexp_output_id_table; + using impl::logaddexp_strided_dispatch_table; + + auto logaddexp_pyapi = [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = + {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, logaddexp_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + logaddexp_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + logaddexp_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto logaddexp_result_type_pyapi = [&](py::dtype dtype1, + py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + logaddexp_output_id_table); + }; + m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, ""); + } // B16: ==== LOGICAL_AND (x1, x2) { @@ -3196,6 +3308,45 @@ void init_elementwise_functions(py::module_ m) }; m.def("_trunc_result_type", trunc_result_type_pyapi); } + + // B24: ==== HYPOT (x) + { + impl::populate_hypot_dispatch_tables(); + using impl::hypot_contig_dispatch_table; + using impl::hypot_output_id_table; + using impl::hypot_strided_dispatch_table; + + auto hypot_pyapi = [&](dpctl::tensor::usm_ndarray src1, + dpctl::tensor::usm_ndarray src2, + 
dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, hypot_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + hypot_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + hypot_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto hypot_result_type_pyapi = [&](py::dtype dtype1, py::dtype dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + hypot_output_id_table); + }; + m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_hypot_result_type", hypot_result_type_pyapi, ""); + } } } // namespace py_internal diff --git a/dpctl/tests/elementwise/test_hypot.py b/dpctl/tests/elementwise/test_hypot.py new file mode 100644 index 0000000000..a16bcc5d0c --- /dev/null +++ b/dpctl/tests/elementwise/test_hypot.py @@ -0,0 +1,193 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import ctypes + +import numpy as np +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_hypot_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.zeros_like(ar1, dtype=op2_dtype, sycl_queue=q) + + r = dpt.hypot(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.hypot( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q) + ar4 = dpt.zeros(2 * sz, dtype=op2_dtype, sycl_queue=q) + + r = dpt.hypot(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.hypot( + np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_hypot_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.hypot(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_hypot_order(): + get_queue_or_skip() + + ar1 = dpt.ones((20, 20), dtype="i4", order="C") + ar2 = dpt.ones((20, 20), dtype="i4", order="C") + r1 = dpt.hypot(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.hypot(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.hypot(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones((20, 20), dtype="i4", order="F") + ar2 = dpt.ones((20, 20), dtype="i4", order="F") + r1 = dpt.hypot(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.hypot(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.hypot(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2] + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.strides == (20, -1) + + ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT + r4 = dpt.hypot(ar1, ar2, order="K") + assert r4.strides == (-1, 20) + + +def test_hypot_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.hypot(m, v) + + expected = np.hypot( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + tol = 8 * np.finfo(r.dtype).resolution + assert np.allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) 
+ + r2 = dpt.hypot(v, m) + expected2 = np.hypot( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert np.allclose( + dpt.asnumpy(r2), expected2.astype(r2.dtype), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) +def test_hypot_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.hypot(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.hypot(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_hypot_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.hypot(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_hypot_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.hypot(a, c) diff --git a/dpctl/tests/elementwise/test_logaddexp.py b/dpctl/tests/elementwise/test_logaddexp.py new file mode 100644 index 0000000000..bf334eca9d --- /dev/null +++ b/dpctl/tests/elementwise/test_logaddexp.py @@ -0,0 +1,298 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import ctypes + +import numpy as np +import pytest +from numpy.testing import assert_raises_regex + +import dpctl +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported +from dpctl.utils import ExecutionPlacementError + +from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_logaddexp_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.logaddexp(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logaddexp(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + tol = 8 * np.finfo(r.dtype).resolution + assert np.allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.logaddexp(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.logaddexp(dpt.asnumpy(ar3)[::-1], dpt.asnumpy(ar4)[::2]) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert np.allclose( + dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol + ) + + +@pytest.mark.parametrize("op1_usm_type", _usm_types) +@pytest.mark.parametrize("op2_usm_type", _usm_types) +def test_logaddexp_usm_type_matrix(op1_usm_type, op2_usm_type): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type) + + r = dpt.logaddexp(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (op1_usm_type, op2_usm_type) + ) + assert r.usm_type == expected_usm_type + + +def test_logaddexp_order(): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]): + ar1 = dpt.ones(test_shape, dtype=dt1, order="C") + ar2 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.logaddexp(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logaddexp(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logaddexp(ar1, ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype=dt1, order="F") + ar2 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.logaddexp(ar1, ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.logaddexp(ar1, ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.logaddexp(ar1, ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.logaddexp(ar1, ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.logaddexp(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.logaddexp(ar1, ar2, order="K") + 
assert r4.strides == (-1, n) + r5 = dpt.logaddexp(ar1, ar2, order="C") + assert r5.strides == (n, 1) + + +def test_logaddexp_broadcasting(): + get_queue_or_skip() + + m = dpt.ones((100, 5), dtype="i4") + v = dpt.arange(1, 6, dtype="i4") + + r = dpt.logaddexp(m, v) + + expected = np.logaddexp( + np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4") + ) + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + r2 = dpt.logaddexp(v, m) + expected2 = np.logaddexp( + np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4") + ) + assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() + + +def test_logaddexp_broadcasting_error(): + get_queue_or_skip() + m = dpt.ones((10, 10), dtype="i4") + v = dpt.ones((3,), dtype="i4") + with pytest.raises(ValueError): + dpt.logaddexp(m, v) + + +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes) +def test_logaddexp_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q) + py_zeros = ( + bool(0), + int(0), + float(0), + complex(0), + np.float32(0), + ctypes.c_int(0), + ) + for sc in py_zeros: + R = dpt.logaddexp(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.logaddexp(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +class MockArray: + def __init__(self, arr): + self.data_ = arr + + @property + def __sycl_usm_array_interface__(self): + return self.data_.__sycl_usm_array_interface__ + + +def test_logaddexp_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + b = dpt.ones(10) + c = MockArray(b) + r = dpt.logaddexp(a, c) + assert isinstance(r, dpt.usm_ndarray) + + +def test_logaddexp_canary_mock_array(): + get_queue_or_skip() + a = dpt.arange(10) + + class Canary: + def __init__(self): + pass + + @property + def __sycl_usm_array_interface__(self): + return None + + c = Canary() + with pytest.raises(ValueError): + dpt.logaddexp(a, c) + + +def test_logaddexp_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + y = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + TypeError, + "Input and output allocation queues are not compatible", + dpt.logaddexp, + ar1, + ar2, + y, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = dpt.empty(3) + assert_raises_regex( + TypeError, + "The shape of input and output arrays are inconsistent", + dpt.logaddexp, + ar1, + ar2, + y, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = ar1 + assert_raises_regex( + TypeError, + "Input and output arrays have memory overlap", + dpt.logaddexp, + ar1, + ar2, + y, + ) + + ar1 = np.ones(2, dtype="float32") + ar2 = np.ones_like(ar1, dtype="int32") + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred.*", + dpt.logaddexp, + ar1, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="int32") + y = np.empty_like(ar1) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.logaddexp, + ar1, + ar2, + y, + ) + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_logaddexp_dtype_error( + dtype, +): + q = 
get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + ar1 = dpt.ones(5, dtype=dtype) + ar2 = dpt.ones_like(ar1, dtype="f4") + + y = dpt.zeros_like(ar1, dtype="int8") + assert_raises_regex( + TypeError, "Output array of type.*is needed", dpt.logaddexp, ar1, ar2, y + ) From a173b9cd8415b1e2d5adc42f52703fc546687dff Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 23 Jul 2023 13:19:50 -0500 Subject: [PATCH 2/8] BinaryElementwiseFunc get acceptance_fn parameter, set individually for tensor.divide The _find_buf_dtype2 must stay consistent with _find_buf_dtype in that for binary_fn that can be expressed via unary_fn(other_binary_fn(unary_fn(in1), unary_fn(in2))), e.g. logaddexp(x1, x2) == log(exp(x1) + exp(x2)) the promotion of integral types to fp type must be consistence with both evaluation. The special-casing, necessary for proper working of dpt.divide, where dpt.divide(dpt.asarray(1, dtype="i1"), dpt.asarray(1, dtype="u1")) should give default fp type resulted in logaddexp(dpt.asarray(1, dtype="i1"), dpt.asarray(1, dtype="u1")) returning array of different data-type than if evaluated alternatively, since dpt.exp(dpt.asarray(1, dtype="i1")) and dpt.exp(dpt.asarray(1, dtype="u1")) both gave float16 arrays, and the result was of float16 data type. Now, both behaviors are fixed: ``` In [5]: dpt.log(dpt.add(dpt.exp(dpt.asarray(1, dtype="u1")), dpt.exp(dpt.asarray(1, dtype="i1")))) Out[5]: usm_ndarray(1.693, dtype=float16) In [6]: dpt.logaddexp(dpt.asarray(1, dtype="u1"), dpt.asarray(1, dtype="i1")) Out[6]: usm_ndarray(1.693, dtype=float16) In [7]: dpt.divide(dpt.asarray(1, dtype="u1"), dpt.asarray(1, dtype="i1")) Out[7]: usm_ndarray(1., dtype=float32) ``` --- dpctl/tensor/_elementwise_common.py | 22 ++++++++++-- dpctl/tensor/_elementwise_funcs.py | 7 +++- dpctl/tensor/_type_utils.py | 56 +++++++++++++++++++++-------- 3 files changed, 66 insertions(+), 19 deletions(-) diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 221c482969..409f97e5aa 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -27,6 +27,7 @@ from dpctl.utils import ExecutionPlacementError from ._type_utils import ( + _acceptance_fn_default, _empty_like_orderK, _empty_like_pair_orderK, _find_buf_dtype, @@ -48,6 +49,12 @@ def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs): self.unary_fn_ = unary_dp_impl_fn self.__doc__ = docs + def __str__(self): + return f"<{self.__name__} '{self.name_}'>" + + def __repr__(self): + return f"<{self.__name__} '{self.name_}'>" + def __call__(self, x, out=None, order="K"): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -357,6 +364,7 @@ def __init__( binary_dp_impl_fn, docs, binary_inplace_fn=None, + acceptance_fn=None, ): self.__name__ = "BinaryElementwiseFunc" self.name_ = name @@ -364,12 +372,16 @@ def __init__( self.binary_fn_ = binary_dp_impl_fn self.binary_inplace_fn_ = binary_inplace_fn self.__doc__ = docs + if callable(acceptance_fn): + self.acceptance_fn_ = acceptance_fn + else: + self.acceptance_fn_ = _acceptance_fn_default def __str__(self): - return f"" + return f"<{self.__name__} '{self.name_}'>" def __repr__(self): - return f"" + return f"<{self.__name__} '{self.name_}'>" def __call__(self, o1, o2, out=None, order="K"): # FIXME: replace with check against base array @@ -445,7 +457,11 @@ def __call__(self, o1, o2, out=None, order="K"): o1_dtype, o2_dtype = _resolve_weak_types(o1_dtype, o2_dtype, sycl_dev) 
buf1_dt, buf2_dt, res_dt = _find_buf_dtype2( - o1_dtype, o2_dtype, self.result_type_resolver_fn_, sycl_dev + o1_dtype, + o2_dtype, + self.result_type_resolver_fn_, + sycl_dev, + acceptance_fn=self.acceptance_fn_, ) if res_dt is None: diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index e15add335c..d843041431 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -17,6 +17,7 @@ import dpctl.tensor._tensor_impl as ti from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc +from ._type_utils import _acceptance_fn_divide # U01: ==== ABS (x) _abs_docstring_ = """ @@ -215,7 +216,11 @@ """ divide = BinaryElementwiseFunc( - "divide", ti._divide_result_type, ti._divide, _divide_docstring_ + "divide", + ti._divide_result_type, + ti._divide, + _divide_docstring_, + acceptance_fn=_acceptance_fn_divide, ) # B09: ==== EQUAL (x1, x2) diff --git a/dpctl/tensor/_type_utils.py b/dpctl/tensor/_type_utils.py index 3ba6c316fb..fb2223f292 100644 --- a/dpctl/tensor/_type_utils.py +++ b/dpctl/tensor/_type_utils.py @@ -255,7 +255,34 @@ def _get_device_default_dtype(dt_kind, sycl_dev): raise RuntimeError -def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev): +def _acceptance_fn_default( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + return True + + +def _acceptance_fn_divide( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + # both are being promoted, if the kind of result is + # different than the kind of original input dtypes, + # we use default dtype for the resulting kind. + # This covers, e.g. (array_dtype_i1 / array_dtype_u1) + # result of which in divide is double (in NumPy), but + # regular type promotion rules peg at float16 + if (ret_buf1_dt.kind != arg1_dtype.kind) and ( + ret_buf2_dt.kind != arg2_dtype.kind + ): + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn): res_dt = query_fn(arg1_dtype, arg2_dtype) if res_dt: return None, None, res_dt @@ -275,21 +302,18 @@ def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev): if ret_buf1_dt is None or ret_buf2_dt is None: return ret_buf1_dt, ret_buf2_dt, res_dt else: - # both are being promoted, if the kind of result is - # different than the kind of original input dtypes, - # we must use default dtype for the resulting kind. 
- if (res_dt.kind != arg1_dtype.kind) and ( - res_dt.kind != arg2_dtype.kind - ): - default_dt = _get_device_default_dtype( - res_dt.kind, sycl_dev - ) - if res_dt == default_dt: - return ret_buf1_dt, ret_buf2_dt, res_dt - else: - continue - else: + acceptable = acceptance_fn( + arg1_dtype, + arg2_dtype, + ret_buf1_dt, + ret_buf2_dt, + res_dt, + sycl_dev, + ) + if acceptable: return ret_buf1_dt, ret_buf2_dt, res_dt + else: + continue return None, None, None @@ -318,4 +342,6 @@ def _find_inplace_dtype(lhs_dtype, rhs_dtype, query_fn, sycl_dev): "_empty_like_orderK", "_empty_like_pair_orderK", "_to_device_supported_dtype", + "_acceptance_fn_default", + "_acceptance_fn_divide", ] From 626dd4a972373ddaf5012e2283746044fa6087df Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 23 Jul 2023 13:28:43 -0500 Subject: [PATCH 3/8] Only try calling _inplace if inplace function is defined --- dpctl/tensor/_elementwise_common.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index 409f97e5aa..78c79fb2ad 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -386,10 +386,11 @@ def __repr__(self): def __call__(self, o1, o2, out=None, order="K"): # FIXME: replace with check against base array # when views can be identified - if o1 is out: - return self._inplace(o1, o2) - elif o2 is out: - return self._inplace(o2, o1) + if self.binary_inplace_fn_: + if o1 is out: + return self._inplace(o1, o2) + elif o2 is out: + return self._inplace(o2, o1) if order not in ["K", "C", "F", "A"]: order = "K" From 22081fb423fe16abd4eca8412dcc3835e0ba5ec3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 23 Jul 2023 13:29:15 -0500 Subject: [PATCH 4/8] Since logaddexp is not defined for complex types, do not try complex Python scalar --- dpctl/tests/elementwise/test_logaddexp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dpctl/tests/elementwise/test_logaddexp.py b/dpctl/tests/elementwise/test_logaddexp.py index bf334eca9d..355f0d0dc7 100644 --- a/dpctl/tests/elementwise/test_logaddexp.py +++ b/dpctl/tests/elementwise/test_logaddexp.py @@ -166,7 +166,6 @@ def test_logaddexp_python_scalar(arr_dt): bool(0), int(0), float(0), - complex(0), np.float32(0), ctypes.c_int(0), ) From d8437e649ca296beb24d6eee673803a82824d067 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 23 Jul 2023 13:30:06 -0500 Subject: [PATCH 5/8] Implement logaddexp in numerically stable way --- .../elementwise_functions/logaddexp.hpp | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index c39aec1877..f785e566de 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -62,22 +62,25 @@ template struct LogAddExpFunctor resT operator()(const argT1 &in1, const argT2 &in2) { - return std::log(std::exp(in1) + std::exp(in2)); + resT max = std::max(in1, in2); + resT min = std::min(in1, in2); + return max + std::log(resT(1) + std::exp(min - max)); } template sycl::vec operator()(const sycl::vec &in1, const sycl::vec &in2) { - auto res = sycl::log(sycl::exp(in1) + sycl::exp(in2)); - if constexpr (std::is_same_v) { - return res; - } - else { - return vec_cast( - res); + sycl::vec res; + auto diff = in1 - in2; 
+ +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + resT max = std::max(in1[i], in2[i]); + res[i] = max + std::log(resT(1) + std::exp(std::abs(diff[i]))); } + + return res; } }; From c43caee67ba748e7b731f26991162821f2336d43 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 23 Jul 2023 18:42:14 -0500 Subject: [PATCH 6/8] Added missing argument --- dpctl/tests/elementwise/test_type_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/elementwise/test_type_utils.py b/dpctl/tests/elementwise/test_type_utils.py index 4c2fd3cb60..403e455c2e 100644 --- a/dpctl/tests/elementwise/test_type_utils.py +++ b/dpctl/tests/elementwise/test_type_utils.py @@ -155,7 +155,9 @@ def _denier_fn(dt1, dt2): dev = MockDevice(fp16, fp64) arg1_dt = dpt.float64 arg2_dt = dpt.complex64 - r = tu._find_buf_dtype2(arg1_dt, arg2_dt, _denier_fn, dev) + r = tu._find_buf_dtype2( + arg1_dt, arg2_dt, _denier_fn, dev, tu._acceptance_fn_default + ) assert r == ( None, None, From 3788569d3d8c2e19bcbd54305ad1fcc360579d2b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 24 Jul 2023 06:20:33 -0500 Subject: [PATCH 7/8] Fixed computing allclose testing tolerance in test_dtype_matrix test When NumPy's result has coarser dtype than that of DPCTL we must use the largest resolution from each dtype. --- dpctl/tests/elementwise/test_logaddexp.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dpctl/tests/elementwise/test_logaddexp.py b/dpctl/tests/elementwise/test_logaddexp.py index 355f0d0dc7..a693389354 100644 --- a/dpctl/tests/elementwise/test_logaddexp.py +++ b/dpctl/tests/elementwise/test_logaddexp.py @@ -18,7 +18,7 @@ import numpy as np import pytest -from numpy.testing import assert_raises_regex +from numpy.testing import assert_allclose, assert_raises_regex import dpctl import dpctl.tensor as dpt @@ -44,8 +44,10 @@ def test_logaddexp_dtype_matrix(op1_dtype, op2_dtype): expected = np.logaddexp(dpt.asnumpy(ar1), dpt.asnumpy(ar2)) assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) assert r.shape == ar1.shape - tol = 8 * np.finfo(r.dtype).resolution - assert np.allclose( + tol = 8 * max( + np.finfo(r.dtype).resolution, np.finfo(expected.dtype).resolution + ) + assert_allclose( dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol ) assert r.sycl_queue == ar1.sycl_queue @@ -58,7 +60,7 @@ def test_logaddexp_dtype_matrix(op1_dtype, op2_dtype): expected = np.logaddexp(dpt.asnumpy(ar3)[::-1], dpt.asnumpy(ar4)[::2]) assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) assert r.shape == ar3.shape - assert np.allclose( + assert_allclose( dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol ) From 67c8195f9d556e3948b2a308e29a66de3d3d7f40 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 24 Jul 2023 11:04:48 -0700 Subject: [PATCH 8/8] Calls of the type log(1 + X) changed to log1p(X) in logaddexp --- .../include/kernels/elementwise_functions/logaddexp.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index f785e566de..c8f549205e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -64,7 +64,7 @@ template struct LogAddExpFunctor { resT max = std::max(in1, in2); resT min = std::min(in1, in2); - return max + std::log(resT(1) + 
std::exp(min - max)); + return max + std::log1p(std::exp(min - max)); } template @@ -77,7 +77,7 @@ template struct LogAddExpFunctor #pragma unroll for (int i = 0; i < vec_sz; ++i) { resT max = std::max(in1[i], in2[i]); - res[i] = max + std::log(resT(1) + std::exp(std::abs(diff[i]))); + res[i] = max + std::log1p(std::exp(std::abs(diff[i]))); } return res;