From 8dd508879f3782c0fc64239090bdbf779ef72c83 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 3 Feb 2023 05:39:40 -0600
Subject: [PATCH 1/5] dpnp.add() doesn't work properly with a scalar

---
 .../include/dpnp_gen_2arg_3type_tbl.hpp       |  93 ++++-
 dpnp/backend/include/dpnp_iface.hpp           |   3 +-
 dpnp/backend/include/dpnp_iface_fptr.hpp      |   4 +-
 dpnp/backend/kernels/dpnp_krnl_elemwise.cpp   | 351 ++++++++----------
 dpnp/backend/src/dpnp_fptr.hpp                |  49 +++
 dpnp/dpnp_iface_mathematical.py               |  82 ++--
 tests/skipped_tests.tbl                       |   9 +-
 tests/skipped_tests_gpu.tbl                   |  37 +-
 tests/test_indexing.py                        |  55 +--
 tests/test_mathematical.py                    | 148 ++++----
 tests/test_strides.py                         |  67 ++--
 .../cupy/math_tests/test_arithmetic.py        |   2 +-
 12 files changed, 466 insertions(+), 434 deletions(-)
diff --git a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp
index 5d4ae22f796f..33f5e0d19a46 100644
--- a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp
+++ b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2016-2020, Intel Corporation
+// Copyright (c) 2016-2023, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -31,7 +31,10 @@
  * Parameters:
  * - public name of the function and kernel name
  * - operation used to calculate the result
+ * - vector operation over SYCL group used to calculate the result
+ * - list of types vector operation accepts
  * - mkl operation used to calculate the result
+ * - list of types mkl operation accepts
  *
  */
 
@@ -41,11 +44,12 @@
 
 #ifdef _SECTION_DOCUMENTATION_GENERATION_
 
-#define MACRO_2ARG_3TYPES_OP(__name__, __operation1__, __operation2__)                                                  \
+#define MACRO_2ARG_3TYPES_OP(                                                                                           \
+    __name__, __operation__, __vec_operation__, __vec_types__, __mkl_operation__, __mkl_types__)                        \
     /** @ingroup BACKEND_API                                                                                         */ \
     /** @brief Per element operation function __name__                                                               */ \
     /**                                                                                                              */ \
-    /** Function "__name__" executes operator "__operation1__" over corresponding elements of input arrays           */ \
+    /** Function "__name__" executes operator "__operation__" over corresponding elements of input arrays            */ \
     /**                                                                                                              */ \
     /** @param[in]  q_ref              Reference to SYCL queue.                                                      */ \
     /** @param[out] result_out         Output array.                                                                 */ \
@@ -105,23 +109,84 @@
 
 #endif
 
-MACRO_2ARG_3TYPES_OP(dpnp_add_c, input1_elem + input2_elem, oneapi::mkl::vm::add)
-MACRO_2ARG_3TYPES_OP(dpnp_arctan2_c, sycl::atan2((double)input1_elem, (double)input2_elem), oneapi::mkl::vm::atan2)
+MACRO_2ARG_3TYPES_OP(dpnp_add_c,
+                     input1_elem + input2_elem,
+                     x1 + x2, /* plain add, same as the scalar expression; add_sat would clamp on overflow */
+                     MACRO_UNPACK_TYPES(int, long),
+                     oneapi::mkl::vm::add,
+                     MACRO_UNPACK_TYPES(float, double, std::complex<float>, std::complex<double>))
+
+MACRO_2ARG_3TYPES_OP(dpnp_arctan2_c,
+                     sycl::atan2((double)input1_elem, (double)input2_elem),
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::atan2,
+                     MACRO_UNPACK_TYPES(float, double))
+
 MACRO_2ARG_3TYPES_OP(dpnp_copysign_c,
                      sycl::copysign((double)input1_elem, (double)input2_elem),
-                     oneapi::mkl::vm::copysign)
-MACRO_2ARG_3TYPES_OP(dpnp_divide_c, input1_elem / input2_elem, oneapi::mkl::vm::div)
-MACRO_2ARG_3TYPES_OP(dpnp_fmod_c, sycl::fmod((double)input1_elem, (double)input2_elem), oneapi::mkl::vm::fmod)
-MACRO_2ARG_3TYPES_OP(dpnp_hypot_c, sycl::hypot((double)input1_elem, (double)input2_elem), oneapi::mkl::vm::hypot)
-MACRO_2ARG_3TYPES_OP(dpnp_maximum_c, sycl::max(input1_elem, input2_elem), oneapi::mkl::vm::fmax)
-MACRO_2ARG_3TYPES_OP(dpnp_minimum_c, sycl::min(input1_elem, input2_elem), oneapi::mkl::vm::fmin)
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::copysign,
+                     MACRO_UNPACK_TYPES(float, double))
+
+MACRO_2ARG_3TYPES_OP(dpnp_divide_c,
+                     input1_elem / input2_elem,
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::div,
+                     MACRO_UNPACK_TYPES(float, double))
+
+MACRO_2ARG_3TYPES_OP(dpnp_fmod_c,
+                     sycl::fmod((double)input1_elem, (double)input2_elem),
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::fmod,
+                     MACRO_UNPACK_TYPES(float, double))
+
+MACRO_2ARG_3TYPES_OP(dpnp_hypot_c,
+                     sycl::hypot((double)input1_elem, (double)input2_elem),
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::hypot,
+                     MACRO_UNPACK_TYPES(float, double))
+
+MACRO_2ARG_3TYPES_OP(dpnp_maximum_c,
+                     sycl::max(input1_elem, input2_elem),
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::fmax,
+                     MACRO_UNPACK_TYPES(float, double))
+
+MACRO_2ARG_3TYPES_OP(dpnp_minimum_c,
+                     sycl::min(input1_elem, input2_elem),
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::fmin,
+                     MACRO_UNPACK_TYPES(float, double))
 
 // "multiply" needs to be standalone kernel (not autogenerated) due to complex algorithm. This is not an element wise.
 // pytest "tests/third_party/cupy/creation_tests/test_ranges.py::TestMgrid::test_mgrid3"
 // requires multiplication shape1[10] with shape2[10,1] and result expected as shape[10,10]
-MACRO_2ARG_3TYPES_OP(dpnp_multiply_c, input1_elem* input2_elem, oneapi::mkl::vm::mul)
+MACRO_2ARG_3TYPES_OP(dpnp_multiply_c,
+                     input1_elem* input2_elem,
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::mul,
+                     MACRO_UNPACK_TYPES(float, double, std::complex<float>, std::complex<double>))
+
+MACRO_2ARG_3TYPES_OP(dpnp_power_c,
+                     sycl::pow((double)input1_elem, (double)input2_elem),
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::pow,
+                     MACRO_UNPACK_TYPES(float, double))
 
-MACRO_2ARG_3TYPES_OP(dpnp_power_c, sycl::pow((double)input1_elem, (double)input2_elem), oneapi::mkl::vm::pow)
-MACRO_2ARG_3TYPES_OP(dpnp_subtract_c, input1_elem - input2_elem, oneapi::mkl::vm::sub)
+MACRO_2ARG_3TYPES_OP(dpnp_subtract_c,
+                     input1_elem - input2_elem,
+                     nullptr,
+                     std::false_type,
+                     oneapi::mkl::vm::sub,
+                     MACRO_UNPACK_TYPES(float, double))
 
 #undef MACRO_2ARG_3TYPES_OP
diff --git a/dpnp/backend/include/dpnp_iface.hpp b/dpnp/backend/include/dpnp_iface.hpp
index 713e3e821979..7a80b40a3d2e 100644
--- a/dpnp/backend/include/dpnp_iface.hpp
+++ b/dpnp/backend/include/dpnp_iface.hpp
@@ -1829,7 +1829,8 @@ INP_DLLEXPORT void dpnp_invert_c(void* array1_in, void* result, size_t size);
 
 #include <dpnp_gen_2arg_2type_tbl.hpp>
 
-#define MACRO_2ARG_3TYPES_OP(__name__, __operation1__, __operation2__)                                                 \
+#define MACRO_2ARG_3TYPES_OP(                                                                                          \
+    __name__, __operation__, __vec_operation__, __vec_types__, __mkl_operation__, __mkl_types__)                       \
     template <typename _DataType_output, typename _DataType_input1, typename _DataType_input2>                         \
     INP_DLLEXPORT DPCTLSyclEventRef __name__(DPCTLSyclQueueRef q_ref,                                                  \
                                              void* result_out,                                                         \
diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp
index f77a37aade89..61c1c9838ad6 100644
--- a/dpnp/backend/include/dpnp_iface_fptr.hpp
+++ b/dpnp/backend/include/dpnp_iface_fptr.hpp
@@ -394,13 +394,13 @@ enum class DPNPFuncName : size_t
 enum class DPNPFuncType : size_t
 {
     DPNP_FT_NONE,     /**< Very first element of the enumeration */
+    DPNP_FT_BOOL,     /**< analog of numpy.bool_ or bool */
     DPNP_FT_INT,      /**< analog of numpy.int32 or int */
     DPNP_FT_LONG,     /**< analog of numpy.int64 or long */
     DPNP_FT_FLOAT,    /**< analog of numpy.float32 or float */
-    DPNP_FT_DOUBLE,   /**< analog of numpy.float32 or double */
+    DPNP_FT_DOUBLE,   /**< analog of numpy.float64 or double */
     DPNP_FT_CMPLX64,  /**< analog of numpy.complex64 or std::complex<float> */
-    DPNP_FT_CMPLX128, /**< analog of numpy.complex128 or std::complex<double> */
-    DPNP_FT_BOOL      /**< analog of numpy.bool_ or bool */
+    DPNP_FT_CMPLX128  /**< analog of numpy.complex128 or std::complex<double> */
 };
 
 /**
diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
index eafa50d4cee2..1b90e4a6821e 100644
--- a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright (c) 2016-2022, Intel Corporation
+// Copyright (c) 2016-2023, Intel Corporation
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -825,7 +825,9 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
     return;
 }
 
-#define MACRO_2ARG_3TYPES_OP(__name__, __operation1__, __operation2__)                                                 \
+
+#define MACRO_2ARG_3TYPES_OP(                                                                                          \
+    __name__, __operation__, __vec_operation__, __vec_types__, __mkl_operation__, __mkl_types__)                       \
     template <typename _KernelNameSpecialization1,                                                                     \
               typename _KernelNameSpecialization2,                                                                     \
               typename _KernelNameSpecialization3>                                                                     \
@@ -834,6 +836,11 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
     template <typename _KernelNameSpecialization1,                                                                     \
               typename _KernelNameSpecialization2,                                                                     \
               typename _KernelNameSpecialization3>                                                                     \
+    class __name__##_sg_kernel;                                                                                        \
+                                                                                                                       \
+    template <typename _KernelNameSpecialization1,                                                                     \
+              typename _KernelNameSpecialization2,                                                                     \
+              typename _KernelNameSpecialization3>                                                                     \
     class __name__##_broadcast_kernel;                                                                                 \
                                                                                                                        \
     template <typename _KernelNameSpecialization1,                                                                     \
@@ -874,45 +881,24 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
                                                                                                                        \
         sycl::queue q = *(reinterpret_cast<sycl::queue*>(q_ref));                                                      \
                                                                                                                        \
-        DPNPC_ptr_adapter<_DataType_input1> input1_ptr(q_ref, input1_in, input1_size);                                 \
-        DPNPC_ptr_adapter<shape_elem_type> input1_shape_ptr(q_ref, input1_shape, input1_ndim, true);                   \
-        DPNPC_ptr_adapter<shape_elem_type> input1_strides_ptr(q_ref, input1_strides, input1_ndim, true);               \
-        DPNPC_ptr_adapter<_DataType_input2> input2_ptr(q_ref, input2_in, input2_size);                                 \
-        DPNPC_ptr_adapter<shape_elem_type> input2_shape_ptr(q_ref, input2_shape, input2_ndim, true);                   \
-        DPNPC_ptr_adapter<shape_elem_type> input2_strides_ptr(q_ref, input2_strides, input2_ndim, true);               \
-                                                                                                                       \
-        DPNPC_ptr_adapter<_DataType_output> result_ptr(q_ref, result_out, result_size, false, true);                   \
-        DPNPC_ptr_adapter<shape_elem_type> result_shape_ptr(q_ref, result_shape, result_ndim);                         \
-        DPNPC_ptr_adapter<shape_elem_type> result_strides_ptr(q_ref, result_strides, result_ndim);                     \
-                                                                                                                       \
-        _DataType_input1* input1_data = input1_ptr.get_ptr();                                                          \
-        shape_elem_type* input1_shape_data = input1_shape_ptr.get_ptr();                                               \
-        shape_elem_type* input1_strides_data = input1_strides_ptr.get_ptr();                                           \
+        _DataType_input1* input1_data = static_cast<_DataType_input1 *>(const_cast<void *>(input1_in));                \
+        _DataType_input2* input2_data = static_cast<_DataType_input2 *>(const_cast<void *>(input2_in));                \
+        _DataType_output* result = static_cast<_DataType_output *>(result_out);                                        \
                                                                                                                        \
-        _DataType_input2* input2_data = input2_ptr.get_ptr();                                                          \
-        shape_elem_type* input2_shape_data = input2_shape_ptr.get_ptr();                                               \
-        shape_elem_type* input2_strides_data = input2_strides_ptr.get_ptr();                                           \
+        bool use_broadcasting = !array_equal(input1_shape, input1_ndim, input2_shape, input2_ndim);                    \
                                                                                                                        \
-        _DataType_output* result = result_ptr.get_ptr();                                                               \
-        shape_elem_type* result_shape_data = result_shape_ptr.get_ptr();                                               \
-        shape_elem_type* result_strides_data = result_strides_ptr.get_ptr();                                           \
+        shape_elem_type* input1_shape_offsets = new shape_elem_type[input1_ndim];                                      \
                                                                                                                        \
-        bool use_broadcasting = !array_equal(input1_shape_data, input1_ndim, input2_shape_data, input2_ndim);          \
+        get_shape_offsets_inkernel(input1_shape, input1_ndim, input1_shape_offsets);                                   \
+        bool use_strides = !array_equal(input1_strides, input1_ndim, input1_shape_offsets, input1_ndim);               \
+        delete[] input1_shape_offsets;                                                                                 \
                                                                                                                        \
-        const size_t input1_shape_size_in_bytes = input1_ndim * sizeof(shape_elem_type);                               \
-        shape_elem_type* input1_shape_offsets =                                                                        \
-            reinterpret_cast<shape_elem_type*>(sycl::malloc_shared(input1_shape_size_in_bytes, q));                    \
-        get_shape_offsets_inkernel(input1_shape_data, input1_ndim, input1_shape_offsets);                              \
-        bool use_strides = !array_equal(input1_strides_data, input1_ndim, input1_shape_offsets, input1_ndim);          \
-        sycl::free(input1_shape_offsets, q);                                                                           \
+        shape_elem_type* input2_shape_offsets = new shape_elem_type[input2_ndim];                                      \
                                                                                                                        \
-        const size_t input2_shape_size_in_bytes = input2_ndim * sizeof(shape_elem_type);                               \
-        shape_elem_type* input2_shape_offsets =                                                                        \
-            reinterpret_cast<shape_elem_type*>(sycl::malloc_shared(input2_shape_size_in_bytes, q));                    \
-        get_shape_offsets_inkernel(input2_shape_data, input2_ndim, input2_shape_offsets);                              \
+        get_shape_offsets_inkernel(input2_shape, input2_ndim, input2_shape_offsets);                                   \
         use_strides =                                                                                                  \
-            use_strides || !array_equal(input2_strides_data, input2_ndim, input2_shape_offsets, input2_ndim);          \
-        sycl::free(input2_shape_offsets, q);                                                                           \
+            use_strides || !array_equal(input2_strides, input2_ndim, input2_shape_offsets, input2_ndim);               \
+        delete[] input2_shape_offsets;                                                                                 \
                                                                                                                        \
         sycl::event event;                                                                                             \
         sycl::range<1> gws(result_size);                                                                               \
@@ -924,25 +910,25 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
             input1_it = reinterpret_cast<DPNPC_id<_DataType_input1>*>(dpnp_memory_alloc_c(q_ref,                       \
                                                                                           input1_it_size_in_bytes));   \
             new (input1_it)                                                                                            \
-                DPNPC_id<_DataType_input1>(q_ref, input1_data, input1_shape_data, input1_strides_data, input1_ndim);   \
+                DPNPC_id<_DataType_input1>(q_ref, input1_data, input1_shape, input1_strides, input1_ndim);             \
                                                                                                                        \
-            input1_it->broadcast_to_shape(result_shape_data, result_ndim);                                             \
+            input1_it->broadcast_to_shape(result_shape, result_ndim);                                                  \
                                                                                                                        \
             DPNPC_id<_DataType_input2>* input2_it;                                                                     \
             const size_t input2_it_size_in_bytes = sizeof(DPNPC_id<_DataType_input2>);                                 \
             input2_it = reinterpret_cast<DPNPC_id<_DataType_input2>*>(dpnp_memory_alloc_c(q_ref,                       \
                                                                                           input2_it_size_in_bytes));   \
             new (input2_it)                                                                                            \
-                DPNPC_id<_DataType_input2>(q_ref, input2_data, input2_shape_data, input2_strides_data, input2_ndim);   \
+                DPNPC_id<_DataType_input2>(q_ref, input2_data, input2_shape, input2_strides, input2_ndim);             \
                                                                                                                        \
-            input2_it->broadcast_to_shape(result_shape_data, result_ndim);                                             \
+            input2_it->broadcast_to_shape(result_shape, result_ndim);                                                  \
                                                                                                                        \
             auto kernel_parallel_for_func = [=](sycl::id<1> global_id) {                                               \
-                const size_t i = global_id[0]; /*for (size_t i = 0; i < result_size; ++i)*/                            \
+                const size_t i = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                          \
                 {                                                                                                      \
                     const _DataType_output input1_elem = (*input1_it)[i];                                              \
                     const _DataType_output input2_elem = (*input2_it)[i];                                              \
-                    result[i] = __operation1__;                                                                        \
+                    result[i] = __operation__;                                                                         \
                 }                                                                                                      \
             };                                                                                                         \
             auto kernel_func = [&](sycl::handler& cgh) {                                                               \
@@ -951,8 +937,7 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
                     gws, kernel_parallel_for_func);                                                                    \
             };                                                                                                         \
                                                                                                                        \
-            event = q.submit(kernel_func);                                                                             \
-            event.wait();                                                                                              \
+            q.submit(kernel_func).wait();                                                                              \
                                                                                                                        \
             input1_it->~DPNPC_id();                                                                                    \
             input2_it->~DPNPC_id();                                                                                    \
@@ -961,11 +946,42 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
         }                                                                                                              \
         else if (use_strides)                                                                                          \
         {                                                                                                              \
+            if ((result_ndim != input1_ndim) || (result_ndim != input2_ndim))                                          \
+            {                                                                                                          \
+                throw std::runtime_error("Result ndim=" + std::to_string(result_ndim) +                                \
+                                         " mismatches with either input1 ndim=" + std::to_string(input1_ndim) +        \
+                                         " or input2 ndim=" + std::to_string(input2_ndim));                            \
+            }                                                                                                          \
+                                                                                                                       \
+            /* memory transfer optimization, use USM-host for temporary speeds up transfer to device */                \
+            using usm_host_allocatorT = sycl::usm_allocator<shape_elem_type, sycl::usm::alloc::host>;                  \
+                                                                                                                       \
+            size_t strides_size = 3 * result_ndim;                                                                     \
+            shape_elem_type *dev_strides_data = sycl::malloc_device<shape_elem_type>(strides_size, q);                 \
+                                                                                                                       \
+            /* create host temporary for packed strides, owned by a USM-host std::vector */                            \
+            auto strides_host_packed = std::vector<shape_elem_type, usm_host_allocatorT>(strides_size,                 \
+                                                                                         usm_host_allocatorT(q));      \
+                                                                                                                       \
+            /* packed vector is concatenation of result_strides, input1_strides and input2_strides */                  \
+            std::copy(result_strides, result_strides + result_ndim, strides_host_packed.begin());                      \
+            std::copy(input1_strides, input1_strides + result_ndim, strides_host_packed.begin() + result_ndim);        \
+            std::copy(input2_strides, input2_strides + result_ndim, strides_host_packed.begin() + 2 * result_ndim);    \
+                                                                                                                       \
+            auto copy_strides_ev = q.copy<shape_elem_type>(strides_host_packed.data(),                                 \
+                                                           dev_strides_data,                                           \
+                                                           strides_host_packed.size());                                \
+                                                                                                                       \
             auto kernel_parallel_for_func = [=](sycl::id<1> global_id) {                                               \
-                const size_t output_id = global_id[0]; /*for (size_t i = 0; i < result_size; ++i)*/                    \
+                const size_t output_id = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                  \
                 {                                                                                                      \
+                    const shape_elem_type *result_strides_data = &dev_strides_data[0]; /* result_ndim strides each */  \
+                    const shape_elem_type *input1_strides_data = &dev_strides_data[result_ndim];                       \
+                    const shape_elem_type *input2_strides_data = &dev_strides_data[2 * result_ndim];                   \
+                                                                                                                       \
                     size_t input1_id = 0;                                                                              \
                     size_t input2_id = 0;                                                                              \
+                                                                                                                       \
                     for (size_t i = 0; i < result_ndim; ++i)                                                           \
                     {                                                                                                  \
                         const size_t output_xyz_id =                                                                   \
@@ -976,33 +992,88 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
                                                                                                                        \
                     const _DataType_output input1_elem = input1_data[input1_id];                                       \
                     const _DataType_output input2_elem = input2_data[input2_id];                                       \
-                    result[output_id] = __operation1__;                                                                \
+                    result[output_id] = __operation__;                                                                 \
                 }                                                                                                      \
             };                                                                                                         \
             auto kernel_func = [&](sycl::handler& cgh) {                                                               \
+                cgh.depends_on(copy_strides_ev);                                                                       \
                 cgh.parallel_for<                                                                                      \
                     class __name__##_strides_kernel<_DataType_output, _DataType_input1, _DataType_input2>>(            \
                     gws, kernel_parallel_for_func);                                                                    \
             };                                                                                                         \
                                                                                                                        \
-            event = q.submit(kernel_func);                                                                             \
+            q.submit(kernel_func).wait();                                                                              \
+                                                                                                                       \
+            sycl::free(dev_strides_data, q);                                                                           \
+            return event_ref;                                                                                          \
         }                                                                                                              \
         else                                                                                                           \
         {                                                                                                              \
-            if constexpr ((std::is_same<_DataType_input1, double>::value ||                                            \
-                           std::is_same<_DataType_input1, float>::value) &&                                            \
-                          std::is_same<_DataType_input2, _DataType_input1>::value)                                     \
+            if constexpr (both_types_are_same<_DataType_input1, _DataType_input2, __mkl_types__>)                      \
             {                                                                                                          \
-                event = __operation2__(q, result_size, input1_data, input2_data, result);                              \
+                event = __mkl_operation__(q, result_size, input1_data, input2_data, result);                           \
             }                                                                                                          \
-            else                                                                                                       \
+            else if constexpr (none_of_both_types<_DataType_input1, _DataType_input2,                                  \
+                                                  std::complex<float>, std::complex<double>>)                          \
+            {                                                                                                          \
+                constexpr size_t lws = 64;                                                                             \
+                constexpr unsigned int vec_sz = 8;                                                                     \
+                constexpr sycl::access::address_space global_space = sycl::access::address_space::global_space;        \
+                                                                                                                       \
+                auto gws_range = sycl::range<1>(((result_size + lws * vec_sz - 1) / (lws * vec_sz)) * lws);            \
+                auto lws_range = sycl::range<1>(lws);                                                                  \
+                                                                                                                       \
+                auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) { /* one contiguous chunk per sub-group */ \
+                    auto sg = nd_it.get_sub_group();                                                                   \
+                    const size_t max_sg_size = sg.get_max_local_range()[0]; /* max work-items per sub-group */         \
+                    size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +                           \
+                                            sg.get_group_id()[0] * max_sg_size);                                       \
+                    size_t end = start + static_cast<size_t>(vec_sz) * max_sg_size; /* one past the sub-group chunk */ \
+                    if (end < result_size) {                                                                           \
+                        sycl::vec<_DataType_input1, vec_sz> x1 =                                                       \
+                            sg.load<vec_sz>(sycl::multi_ptr<_DataType_input1, global_space>(&input1_data[start]));     \
+                        sycl::vec<_DataType_input2, vec_sz> x2 =                                                       \
+                            sg.load<vec_sz>(sycl::multi_ptr<_DataType_input2, global_space>(&input2_data[start]));     \
+                        sycl::vec<_DataType_output, vec_sz> res_vec;                                                   \
+                                                                                                                       \
+                        if constexpr (both_types_are_same<_DataType_input1, _DataType_input2, __vec_types__>)          \
+                        {                                                                                              \
+                            res_vec = __vec_operation__;                                                               \
+                        }                                                                                              \
+                        else                                                                                           \
+                        {                                                                                              \
+                            for (size_t k = 0; k < vec_sz; ++k) {                                                      \
+                                const _DataType_output input1_elem = x1[k];                                            \
+                                const _DataType_output input2_elem = x2[k];                                            \
+                                res_vec[k] = __operation__;                                                            \
+                            }                                                                                          \
+                        }                                                                                              \
+                        sg.store<vec_sz>(sycl::multi_ptr<_DataType_output, global_space>(&result[start]), res_vec);    \
+                                                                                                                       \
+                    }                                                                                                  \
+                    else { /* tail: work-items of the sub-group share the remainder */                                 \
+                        for (size_t k = start + sg.get_local_id()[0]; k < result_size; k += max_sg_size) {             \
+                            const _DataType_output input1_elem = input1_data[k];                                       \
+                            const _DataType_output input2_elem = input2_data[k];                                       \
+                            result[k] = __operation__;                                                                 \
+                        }                                                                                              \
+                    }                                                                                                  \
+                };                                                                                                     \
+                                                                                                                       \
+                auto kernel_func = [&](sycl::handler& cgh) {                                                           \
+                    cgh.parallel_for<class __name__##_sg_kernel<_DataType_output, _DataType_input1, _DataType_input2>>(\
+                        sycl::nd_range<1>(gws_range, lws_range), kernel_parallel_for_func);                            \
+                };                                                                                                     \
+                event = q.submit(kernel_func);                                                                         \
+            }                                                                                                          \
+            else /* either input1 or input2 has complex type */                                                        \
             {                                                                                                          \
                 auto kernel_parallel_for_func = [=](sycl::id<1> global_id) {                                           \
-                    const size_t i = global_id[0]; /*for (size_t i = 0; i < result_size; ++i)*/                        \
+                    const size_t i = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                      \
                     {                                                                                                  \
                         const _DataType_output input1_elem = input1_data[i];                                           \
                         const _DataType_output input2_elem = input2_data[i];                                           \
-                        result[i] = __operation1__;                                                                    \
+                        result[i] = __operation__;                                                                     \
                     }                                                                                                  \
                 };                                                                                                     \
                 auto kernel_func = [&](sycl::handler& cgh) {                                                           \
@@ -1013,18 +1084,7 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
             }                                                                                                          \
         }                                                                                                              \
                                                                                                                        \
-        input1_ptr.depends_on(event);                                                                                  \
-        input1_shape_ptr.depends_on(event);                                                                            \
-        input1_strides_ptr.depends_on(event);                                                                          \
-        input2_ptr.depends_on(event);                                                                                  \
-        input2_shape_ptr.depends_on(event);                                                                            \
-        input2_strides_ptr.depends_on(event);                                                                          \
-        result_ptr.depends_on(event);                                                                                  \
-        result_shape_ptr.depends_on(event);                                                                            \
-        result_strides_ptr.depends_on(event);                                                                          \
-                                                                                                                       \
         event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);                                                       \
-                                                                                                                       \
         return DPCTLEvent_Copy(event_ref);                                                                             \
     }                                                                                                                  \
                                                                                                                        \
@@ -1114,6 +1174,29 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
 
 #include <dpnp_gen_2arg_3type_tbl.hpp>
 
+template <DPNPFuncType FT1, DPNPFuncType... FTs>
+static void func_map_elemwise_2arg_3type_core(func_map_t& fmap)
+{ // for a fixed first-input type FT1, fills the ADD_EXT and MULTIPLY_EXT entries of fmap for every second-input type in FTs
+    ((fmap[DPNPFuncName::DPNP_FN_ADD_EXT][FT1][FTs] = // comma fold: one table entry per type in FTs
+          {populate_func_types<FT1, FTs>(), // result type enum derived from the two input type enums
+           (void*)dpnp_add_c_ext<func_type_map_t::find_type<populate_func_types<FT1, FTs>()>,
+                                 func_type_map_t::find_type<FT1>,
+                                 func_type_map_t::find_type<FTs>>}),
+     ...);
+    ((fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][FT1][FTs] = // same registration scheme for multiply
+          {populate_func_types<FT1, FTs>(),
+           (void*)dpnp_multiply_c_ext<func_type_map_t::find_type<populate_func_types<FT1, FTs>()>,
+                                      func_type_map_t::find_type<FT1>,
+                                      func_type_map_t::find_type<FTs>>}),
+     ...);
+}
+
+template <DPNPFuncType... FTs>
+static void func_map_elemwise_2arg_3type_helper(func_map_t& fmap)
+{ // calls the _core filler once per type in FTs (as FT1), each time with the whole FTs list as second-input types
+    ((func_map_elemwise_2arg_3type_core<FTs, FTs...>(fmap)), ...);
+}
+
 static void func_map_init_elemwise_2arg_3type(func_map_t& fmap)
 {
     fmap[DPNPFuncName::DPNP_FN_ADD][eft_INT][eft_INT] = {eft_INT,
@@ -1149,39 +1232,6 @@ static void func_map_init_elemwise_2arg_3type(func_map_t& fmap)
     fmap[DPNPFuncName::DPNP_FN_ADD][eft_DBL][eft_DBL] = {eft_DBL,
                                                          (void*)dpnp_add_c_default<double, double, double>};
 
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_INT][eft_INT] = {eft_INT,
-                                                             (void*)dpnp_add_c_ext<int32_t, int32_t, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_INT][eft_LNG] = {eft_LNG,
-                                                             (void*)dpnp_add_c_ext<int64_t, int32_t, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_INT][eft_FLT] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, int32_t, float>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_INT][eft_DBL] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, int32_t, double>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_LNG][eft_INT] = {eft_LNG,
-                                                             (void*)dpnp_add_c_ext<int64_t, int64_t, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_LNG][eft_LNG] = {eft_LNG,
-                                                             (void*)dpnp_add_c_ext<int64_t, int64_t, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_LNG][eft_FLT] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, int64_t, float>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_LNG][eft_DBL] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, int64_t, double>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_FLT][eft_INT] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, float, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_FLT][eft_LNG] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, float, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_FLT][eft_FLT] = {eft_FLT,
-                                                             (void*)dpnp_add_c_ext<float, float, float>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_FLT][eft_DBL] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, float, double>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_DBL][eft_INT] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, double, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_DBL][eft_LNG] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, double, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_DBL][eft_FLT] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, double, float>};
-    fmap[DPNPFuncName::DPNP_FN_ADD_EXT][eft_DBL][eft_DBL] = {eft_DBL,
-                                                             (void*)dpnp_add_c_ext<double, double, double>};
-
     fmap[DPNPFuncName::DPNP_FN_ARCTAN2][eft_INT][eft_INT] = {eft_DBL,
                                                              (void*)dpnp_arctan2_c_default<double, int32_t, int32_t>};
     fmap[DPNPFuncName::DPNP_FN_ARCTAN2][eft_INT][eft_LNG] = {eft_DBL,
@@ -1725,111 +1775,6 @@ static void func_map_init_elemwise_2arg_3type(func_map_t& fmap)
     fmap[DPNPFuncName::DPNP_FN_MULTIPLY][eft_C128][eft_C128] = {
         eft_C128, (void*)dpnp_multiply_c_default<std::complex<double>, std::complex<double>, std::complex<double>>};
 
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_BLN][eft_BLN] = {
-        eft_BLN, (void*)dpnp_multiply_c_ext<bool, bool, bool>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_BLN][eft_INT] = {
-        eft_INT, (void*)dpnp_multiply_c_ext<int32_t, bool, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_BLN][eft_LNG] = {
-        eft_LNG, (void*)dpnp_multiply_c_ext<int64_t, bool, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_BLN][eft_FLT] = {
-        eft_FLT, (void*)dpnp_multiply_c_ext<float, bool, float>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_BLN][eft_DBL] = {
-        eft_DBL,  (void*)dpnp_multiply_c_ext<double, bool, double>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_BLN][eft_C64] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, bool, std::complex<float>>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_BLN][eft_C128] = {
-        eft_C128,  (void*)dpnp_multiply_c_ext<std::complex<double>, bool, std::complex<double>>};
-
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_INT][eft_BLN] = {
-        eft_INT, (void*)dpnp_multiply_c_ext<int32_t, int32_t, bool>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_INT][eft_INT] = {
-        eft_INT, (void*)dpnp_multiply_c_ext<int32_t, int32_t, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_INT][eft_LNG] = {
-        eft_LNG, (void*)dpnp_multiply_c_ext<int64_t, int32_t, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_INT][eft_FLT] = {
-        eft_FLT, (void*)dpnp_multiply_c_ext<float, int32_t, float>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_INT][eft_DBL] = {
-        eft_DBL, (void*)dpnp_multiply_c_ext<double, int32_t, double>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_INT][eft_C64] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, int32_t, std::complex<float>>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_INT][eft_C128] = {
-        eft_C128,  (void*)dpnp_multiply_c_ext<std::complex<double>, int32_t, std::complex<double>>};
-
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_LNG][eft_BLN] = {
-        eft_LNG, (void*)dpnp_multiply_c_ext<int64_t, int64_t, bool>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_LNG][eft_INT] = {
-        eft_LNG, (void*)dpnp_multiply_c_ext<int64_t, int64_t, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_LNG][eft_LNG] = {
-        eft_LNG, (void*)dpnp_multiply_c_ext<int64_t, int64_t, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_LNG][eft_FLT] = {
-        eft_FLT, (void*)dpnp_multiply_c_ext<float, int64_t, float>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_LNG][eft_DBL] = {
-        eft_DBL, (void*)dpnp_multiply_c_ext<double, int64_t, double>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_LNG][eft_C64] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, int64_t, std::complex<float>>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_LNG][eft_C128] = {
-        eft_C128,  (void*)dpnp_multiply_c_ext<std::complex<double>, int64_t, std::complex<double>>};
-
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_FLT][eft_BLN] = {
-        eft_FLT, (void*)dpnp_multiply_c_ext<float, float, bool>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_FLT][eft_INT] = {
-        eft_FLT, (void*)dpnp_multiply_c_ext<float, float, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_FLT][eft_LNG] = {
-        eft_FLT, (void*)dpnp_multiply_c_ext<float, float, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_FLT][eft_FLT] = {
-        eft_FLT, (void*)dpnp_multiply_c_ext<float, float, float>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_FLT][eft_DBL] = {
-        eft_DBL, (void*)dpnp_multiply_c_ext<double, float, double>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_FLT][eft_C64] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, float, std::complex<float>>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_FLT][eft_C128] = {
-        eft_C128,  (void*)dpnp_multiply_c_ext<std::complex<double>, float, std::complex<double>>};
-
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_DBL][eft_BLN] = {
-        eft_DBL, (void*)dpnp_multiply_c_ext<double, double, bool>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_DBL][eft_INT] = {
-        eft_DBL, (void*)dpnp_multiply_c_ext<double, double, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_DBL][eft_LNG] = {
-        eft_DBL, (void*)dpnp_multiply_c_ext<double, double, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_DBL][eft_FLT] = {
-        eft_DBL, (void*)dpnp_multiply_c_ext<double, double, float>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_DBL][eft_DBL] = {
-        eft_DBL, (void*)dpnp_multiply_c_ext<double, double, double>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_DBL][eft_C64] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, double, std::complex<float>>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_DBL][eft_C128] = {
-        eft_C128,  (void*)dpnp_multiply_c_ext<std::complex<double>, double, std::complex<double>>};
-
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C64][eft_BLN] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, std::complex<float>, bool>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C64][eft_INT] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, std::complex<float>, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C64][eft_LNG] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, std::complex<float>, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C64][eft_FLT] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, std::complex<float>, float>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C64][eft_DBL] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, std::complex<float>, double>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C64][eft_C64] = {
-        eft_C64, (void*)dpnp_multiply_c_ext<std::complex<float>, std::complex<float>, std::complex<float>>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C64][eft_C128] = {
-        eft_C128, (void*)dpnp_multiply_c_ext<std::complex<double>, std::complex<float>, std::complex<double>>};
-
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C128][eft_BLN] = {
-        eft_C128, (void*)dpnp_multiply_c_ext<std::complex<double>, std::complex<double>, bool>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C128][eft_INT] = {
-        eft_C128, (void*)dpnp_multiply_c_ext<std::complex<double>, std::complex<double>, int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C128][eft_LNG] = {
-        eft_C128, (void*)dpnp_multiply_c_ext<std::complex<double>, std::complex<double>, int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C128][eft_FLT] = {
-        eft_C128, (void*)dpnp_multiply_c_ext<std::complex<double>, std::complex<double>, float>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C128][eft_DBL] = {
-        eft_C128, (void*)dpnp_multiply_c_ext<std::complex<double>, std::complex<double>, double>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C128][eft_C64] = {
-        eft_C128, (void*)dpnp_multiply_c_ext<std::complex<double>, std::complex<double>, std::complex<float>>};
-    fmap[DPNPFuncName::DPNP_FN_MULTIPLY_EXT][eft_C128][eft_C128] = {
-        eft_C128, (void*)dpnp_multiply_c_ext<std::complex<double>, std::complex<double>, std::complex<double>>};
-
     fmap[DPNPFuncName::DPNP_FN_POWER][eft_INT][eft_INT] = {eft_INT,
                                                            (void*)dpnp_power_c_default<int32_t, int32_t, int32_t>};
     fmap[DPNPFuncName::DPNP_FN_POWER][eft_INT][eft_LNG] = {eft_LNG,
@@ -1962,6 +1907,8 @@ static void func_map_init_elemwise_2arg_3type(func_map_t& fmap)
     fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_DBL][eft_DBL] = {
         eft_DBL, (void*)dpnp_subtract_c_ext<double, double, double>};
 
+    func_map_elemwise_2arg_3type_helper<eft_BLN, eft_INT, eft_LNG, eft_FLT, eft_DBL, eft_C64, eft_C128>(fmap);
+
     return;
 }
 
diff --git a/dpnp/backend/src/dpnp_fptr.hpp b/dpnp/backend/src/dpnp_fptr.hpp
index 76116cafae71..4cb664858319 100644
--- a/dpnp/backend/src/dpnp_fptr.hpp
+++ b/dpnp/backend/src/dpnp_fptr.hpp
@@ -99,6 +99,55 @@ typedef func_type_map_factory_t<func_type_pair_t<eft_BLN, bool>,
                                 func_type_pair_t<eft_C64, std::complex<float>>,
                                 func_type_pair_t<eft_C128, std::complex<double>>> func_type_map_t;
 
+/**
+ * Return an enum value of result type populated from input types.
+ */
+template <DPNPFuncType FT1, DPNPFuncType FT2>
+static constexpr DPNPFuncType populate_func_types()
+{
+    // Reject DPNP_FT_NONE at compile time: both inputs are template arguments,
+    // so an invalid value is a programming error and never a runtime condition.
+    static_assert(FT1 != DPNPFuncType::DPNP_FT_NONE, "Templated enum value of FT1 is None");
+    static_assert(FT2 != DPNPFuncType::DPNP_FT_NONE, "Templated enum value of FT2 is None");
+
+    // The result type is the input type with the greater enumerator value
+    // (NOTE(review): relies on DPNPFuncType enumerators being declared in
+    // promotion order - confirm against the enum definition).
+    return (FT1 < FT2) ? FT2 : FT1;
+}
+
+/**
+ * Removes parentheses for a passed list of types separated by comma.
+ * It's intended to be used in operations macro.
+ */
+#define MACRO_UNPACK_TYPES(...) __VA_ARGS__
+
+/**
+ * Implements std::is_same<> with variadic number of types to compare with
+ * and when type T has to match at least one of types Ts.
+ */
+template <typename T, typename... Ts>
+struct is_any : std::disjunction<std::is_same<T, Ts>...> {};
+
+/**
+ * Implements std::is_same<> with variadic number of types to compare with
+ * and when type T has to match every type from Ts sequence.
+ */
+template <typename T, typename... Ts>
+struct are_same : std::conjunction<std::is_same<T, Ts>...> {};
+
+/**
+ * A template constant to check if types T1 and T2 are the same and that type matches any type from Ts sequence.
+ */
+template <typename T1, typename T2, typename... Ts>
+constexpr auto both_types_are_same = std::conjunction_v<is_any<T1, Ts...>, are_same<T1, T2>>;
+
+/**
+ * A template constant to check if neither T1 nor T2 matches any type from Ts sequence.
+ */
+template <typename T1, typename T2, typename... Ts>
+constexpr auto none_of_both_types = !std::disjunction_v<is_any<T1, Ts...>, is_any<T2, Ts...>>;
+
 /**
  * FPTR interface initialization functions
  */
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index ce9f340e8e45..8104fdea6735 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -154,56 +154,64 @@ def absolute(x1, **kwargs):
     return call_origin(numpy.absolute, x1, **kwargs)
 
 
-def add(x1, x2, dtype=None, out=None, where=True, **kwargs):
+def add(x1,
+        x2,
+        /,
+        out=None,
+        *,
+        where=True,
+        dtype=None,
+        subok=True,
+        **kwargs):
     """
     Add arguments element-wise.
 
     For full documentation refer to :obj:`numpy.add`.
 
+    Returns
+    -------
+    add : dpnp.ndarray
+        The sum of `x1` and `x2`, element-wise.
+
     Limitations
     -----------
-    Parameters ``x1`` and ``x2`` are supported as either :obj:`dpnp.ndarray` or scalar.
-    Parameters ``dtype``, ``out`` and ``where`` are supported with their default values.
+    Parameters `x1` and `x2` are supported as either :class:`dpnp.ndarray` or scalar,
+    but both cannot be scalars (at least one of `x1` and `x2` should be a :class:`dpnp.ndarray`).
+    Parameters `out`, `where`, `dtype` and `subok` are supported with their default values.
     Keyword arguments ``kwargs`` are currently unsupported.
-    Otherwise the functions will be executed sequentially on CPU.
+    Otherwise the function will be executed sequentially on CPU.
     Input array data types are limited by supported DPNP :ref:`Data types`.
 
     Examples
     --------
-    >>> import dpnp as np
-    >>> a = np.array([1, 2, 3])
-    >>> b = np.array([1, 2, 3])
-    >>> result = np.add(a, b)
-    >>> [x for x in result]
+    >>> import dpnp as dp
+    >>> a = dp.array([1, 2, 3])
+    >>> b = dp.array([1, 2, 3])
+    >>> result = dp.add(a, b)
+    >>> print(result)
     [2, 4, 6]
 
     """
 
-    x1_is_scalar = dpnp.isscalar(x1)
-    x2_is_scalar = dpnp.isscalar(x2)
-    x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False)
-    x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False)
+    if kwargs:
+        # unsupported keyword arguments: fall back to the NumPy implementation
+        pass
+    elif out is not None or where is not True:
+        # only default values of `out`, `where`, `dtype` and `subok` are supported
+        pass
+    elif dtype is not None or subok is not True:
+        pass
+    elif dpnp.isscalar(x1) and dpnp.isscalar(x2):
+        # at least either x1 or x2 has to be an array
+        pass
+    else:
+        # get a common queue to copy data from the host into a device if any input is scalar
+        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
 
-    if x1_desc and x2_desc and not kwargs:
-        if not x1_desc and not x1_is_scalar:
-            pass
-        elif not x2_desc and not x2_is_scalar:
-            pass
-        elif x1_is_scalar and x2_is_scalar:
-            pass
-        elif x1_desc and x1_desc.ndim == 0:
-            pass
-        elif x2_desc and x2_desc.ndim == 0:
-            pass
-        elif dtype is not None:
-            pass
-        elif out is not None:
-            pass
-        elif not where:
-            pass
-        else:
-            out_desc = dpnp.get_dpnp_descriptor(out, copy_when_nondefault_queue=False) if out is not None else None
-            return dpnp_add(x1_desc, x2_desc, dtype, out_desc, where).get_pyobj()
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        if x1_desc and x2_desc:
+            return dpnp_add(x1_desc, x2_desc, dtype=dtype, out=out, where=where).get_pyobj()
 
     return call_origin(numpy.add, x1, x2, dtype=dtype, out=out, where=where, **kwargs)
 
@@ -1093,11 +1101,11 @@ def multiply(x1,
     -------
     y : {dpnp.ndarray, scalar}
         The product of `x1` and `x2`, element-wise.
-        The result is a scalar if both x1 and x2 are scalars.
 
     Limitations
     -----------
-    Parameters `x1` and `x2` are supported as either :class:`dpnp.ndarray` or scalar.
+    Parameters `x1` and `x2` are supported as either :class:`dpnp.ndarray` or scalar,
+    but not both as scalars (at least one of `x1` or `x2` must be a :class:`dpnp.ndarray`).
     Parameters `out`, `where`, `dtype` and `subok` are supported with their default values.
     Keyword arguments ``kwargs`` are currently unsupported.
     Otherwise the functions will be executed sequentially on CPU.
@@ -1122,8 +1130,8 @@ def multiply(x1,
     elif subok is not True:
         pass
     elif dpnp.isscalar(x1) and dpnp.isscalar(x2):
-        # keep the result in host memory, if both inputs are scalars
-        return x1 * x2
+        # at least one of x1 and x2 has to be an array
+        pass
     else:
         # get a common queue to copy data from the host into a device if any input is scalar
         queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
index 63c6cbd0d133..acd920580cf0 100644
--- a/tests/skipped_tests.tbl
+++ b/tests/skipped_tests.tbl
@@ -765,17 +765,12 @@ tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_para
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_547_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='remainder', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_549_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='mod', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticModf::test_modf
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_0_{name='reciprocal', nargs=1}::test_raises_with_numpy_input
+
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_10_{name='remainder', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_11_{name='mod', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_1_{name='angle', nargs=1}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_2_{name='add', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_4_{name='divide', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_5_{name='power', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_6_{name='subtract', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_7_{name='true_divide', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_8_{name='floor_divide', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_9_{name='fmod', nargs=2}::test_raises_with_numpy_input
+
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestBoolSubtract_param_3_{shape=(), xp=dpnp}::test_bool_subtract
 tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp
 tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp2
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
index c64c7fa45f99..f34ac97fe065 100644
--- a/tests/skipped_tests_gpu.tbl
+++ b/tests/skipped_tests_gpu.tbl
@@ -91,18 +91,7 @@ tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesInvalidValu
 tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_0_{shape=(3, 3)}::test_diag_indices_from
 tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_1_{shape=(0, 0)}::test_diag_indices_from
 tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_2_{shape=(2, 2, 2)}::test_diag_indices_from
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_295_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_303_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_375_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_383_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_439_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_447_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_455_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_463_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_519_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int64), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_527_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int64), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_535_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int64), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_543_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int64), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+
 tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_prod_all
 tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_prod_axis
 tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_sum_all
@@ -969,35 +958,17 @@ tests/third_party/cupy/manipulation_tests/test_tiling.py::TestTile_param_4_{reps
 tests/third_party/cupy/manipulation_tests/test_tiling.py::TestTile_param_5_{reps=(2, 3, 4, 5)}::test_array_tile
 tests/third_party/cupy/manipulation_tests/test_transpose.py::TestTranspose::test_moveaxis_invalid5_2
 tests/third_party/cupy/manipulation_tests/test_transpose.py::TestTranspose::test_moveaxis_invalid5_3
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_279_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_287_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_295_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_303_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_359_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_367_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_375_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_383_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_439_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int32), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_447_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int32), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_455_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int32), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_463_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int32), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_519_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_527_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_535_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_543_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticModf::test_modf
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_0_{name='reciprocal', nargs=1}::test_raises_with_numpy_input
+
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_10_{name='remainder', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_11_{name='mod', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_1_{name='angle', nargs=1}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_2_{name='add', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_4_{name='divide', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_5_{name='power', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_6_{name='subtract', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_7_{name='true_divide', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_8_{name='floor_divide', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_9_{name='fmod', nargs=2}::test_raises_with_numpy_input
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestBoolSubtract_param_3_{shape=(), xp=dpnp}::test_bool_subtract
+
 tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp
 tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp2
 tests/third_party/cupy/math_tests/test_floating.py::TestFloating::test_copysign_float
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index 091cf1345c4e..1a40777afac8 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -3,6 +3,9 @@
 import dpnp
 
 import numpy
+from numpy.testing import (
+    assert_array_equal
+)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -16,7 +19,7 @@ def test_choose():
 
     expected = numpy.choose([0, 0, 0, 0], [a, b, c])
     result = dpnp.choose([0, 0, 0, 0], [ia, ib, ic])
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("offset",
@@ -47,7 +50,7 @@ def test_diagonal(array, offset):
     ia = dpnp.array(a)
     expected = numpy.diagonal(a, offset)
     result = dpnp.diagonal(ia, offset)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("val",
@@ -71,7 +74,7 @@ def test_fill_diagonal(array, val):
     ia = dpnp.array(a)
     expected = numpy.fill_diagonal(a, val)
     result = dpnp.fill_diagonal(ia, val)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("dimension",
@@ -81,7 +84,7 @@ def test_fill_diagonal(array, val):
 def test_indices(dimension):
     expected = numpy.indices(dimension)
     result = dpnp.indices(dimension)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("array",
@@ -107,7 +110,7 @@ def test_nonzero(array):
     ia = dpnp.array(array)
     expected = numpy.nonzero(a)
     result = dpnp.nonzero(ia)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -137,7 +140,7 @@ def test_place1(arr, mask, vals):
     im = dpnp.array(m)
     numpy.place(a, m, vals)
     dpnp.place(ia, im, vals)
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -161,7 +164,7 @@ def test_place2(arr, mask, vals):
     im = dpnp.array(m)
     numpy.place(a, m, vals)
     dpnp.place(ia, im, vals)
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -186,7 +189,7 @@ def test_place3(arr, mask, vals):
     im = dpnp.array(m)
     numpy.place(a, m, vals)
     dpnp.place(ia, im, vals)
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 @pytest.mark.parametrize("v",
@@ -211,7 +214,7 @@ def test_put(array, ind, v):
     ia = dpnp.array(a)
     numpy.put(a, ind, v)
     dpnp.put(ia, ind, v)
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 @pytest.mark.parametrize("v",
@@ -236,7 +239,7 @@ def test_put2(array, ind, v):
     ia = dpnp.array(a)
     numpy.put(a, ind, v)
     dpnp.put(ia, ind, v)
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 def test_put3():
@@ -244,7 +247,7 @@ def test_put3():
     ia = dpnp.array(a)
     dpnp.put(ia, [0, 2], [-44, -55])
     numpy.put(a, [0, 2], [-44, -55])
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -256,7 +259,7 @@ def test_put_along_axis_val_int():
     for axis in range(2):
         numpy.put_along_axis(a, ind_r, 777, axis)
         dpnp.put_along_axis(ai, ind_r_i, 777, axis)
-        numpy.testing.assert_array_equal(a, ai)
+        assert_array_equal(a, ai)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -268,7 +271,7 @@ def test_put_along_axis1():
     for axis in range(3):
         numpy.put_along_axis(a, ind_r, 777, axis)
         dpnp.put_along_axis(ai, ind_r_i, 777, axis)
-        numpy.testing.assert_array_equal(a, ai)
+        assert_array_equal(a, ai)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -280,7 +283,7 @@ def test_put_along_axis2():
     for axis in range(3):
         numpy.put_along_axis(a, ind_r, [100, 200, 300, 400], axis)
         dpnp.put_along_axis(ai, ind_r_i, [100, 200, 300, 400], axis)
-        numpy.testing.assert_array_equal(a, ai)
+        assert_array_equal(a, ai)
 
 
 @pytest.mark.parametrize("vals",
@@ -309,7 +312,7 @@ def test_putmask1(arr, mask, vals):
     iv = dpnp.array(v)
     numpy.putmask(a, m, v)
     dpnp.putmask(ia, im, iv)
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 @pytest.mark.parametrize("vals",
@@ -334,7 +337,7 @@ def test_putmask2(arr, mask, vals):
     iv = dpnp.array(v)
     numpy.putmask(a, m, v)
     dpnp.putmask(ia, im, iv)
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 @pytest.mark.parametrize("vals",
@@ -360,7 +363,7 @@ def test_putmask3(arr, mask, vals):
     iv = dpnp.array(v)
     numpy.putmask(a, m, v)
     dpnp.putmask(ia, im, iv)
-    numpy.testing.assert_array_equal(a, ia)
+    assert_array_equal(a, ia)
 
 
 def test_select():
@@ -378,7 +381,7 @@ def test_select():
     ichoicelist = [ichoice_val1, ichoice_val2]
     expected = numpy.select(condlist, choicelist)
     result = dpnp.select(icondlist, ichoicelist)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("array_type",
@@ -415,10 +418,9 @@ def test_take(array, indices, array_type, indices_type):
     iind = dpnp.array(ind)
     expected = numpy.take(a, ind)
     result = dpnp.take(ia, iind)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
 def test_take_along_axis():
     a = numpy.arange(16).reshape(4, 4)
     ai = dpnp.array(a)
@@ -427,10 +429,9 @@ def test_take_along_axis():
     for axis in range(2):
         expected = numpy.take_along_axis(a, ind_r, axis)
         result = dpnp.take_along_axis(ai, ind_r_i, axis)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
 
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
 def test_take_along_axis1():
     a = numpy.arange(64).reshape(4, 4, 4)
     ai = dpnp.array(a)
@@ -439,7 +440,7 @@ def test_take_along_axis1():
     for axis in range(3):
         expected = numpy.take_along_axis(a, ind_r, axis)
         result = dpnp.take_along_axis(ai, ind_r_i, axis)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("m",
@@ -454,7 +455,7 @@ def test_take_along_axis1():
 def test_tril_indices(n, k, m):
     result = dpnp.tril_indices(n, k, m)
     expected = numpy.tril_indices(n, k, m)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("k",
@@ -472,7 +473,7 @@ def test_tril_indices_from(array, k):
     ia = dpnp.array(a)
     result = dpnp.tril_indices_from(ia, k)
     expected = numpy.tril_indices_from(a, k)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("m",
@@ -487,7 +488,7 @@ def test_tril_indices_from(array, k):
 def test_triu_indices(n, k, m):
     result = dpnp.triu_indices(n, k, m)
     expected = numpy.triu_indices(n, k, m)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("k",
@@ -505,4 +506,4 @@ def test_triu_indices_from(array, k):
     ia = dpnp.array(a)
     result = dpnp.triu_indices_from(ia, k)
     expected = numpy.triu_indices_from(a, k)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index 21071bec41e9..70e0bd73dc50 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -1,8 +1,15 @@
 import pytest
+from .helper import get_all_dtypes
 
 import dpnp
 
 import numpy
+from numpy.testing import (
+    assert_allclose,
+    assert_array_almost_equal,
+    assert_array_equal,
+    assert_raises
+)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -10,27 +17,27 @@ class TestConvolve:
     def test_object(self):
         d = [1.] * 100
         k = [1.] * 3
-        numpy.testing.assert_array_almost_equal(dpnp.convolve(d, k)[2:-2], dpnp.full(98, 3))
+        assert_array_almost_equal(dpnp.convolve(d, k)[2:-2], dpnp.full(98, 3))
 
     def test_no_overwrite(self):
         d = dpnp.ones(100)
         k = dpnp.ones(3)
         dpnp.convolve(d, k)
-        numpy.testing.assert_array_equal(d, dpnp.ones(100))
-        numpy.testing.assert_array_equal(k, dpnp.ones(3))
+        assert_array_equal(d, dpnp.ones(100))
+        assert_array_equal(k, dpnp.ones(3))
 
     def test_mode(self):
         d = dpnp.ones(100)
         k = dpnp.ones(3)
         default_mode = dpnp.convolve(d, k, mode='full')
         full_mode = dpnp.convolve(d, k, mode='f')
-        numpy.testing.assert_array_equal(full_mode, default_mode)
+        assert_array_equal(full_mode, default_mode)
         # integer mode
-        with numpy.testing.assert_raises(ValueError):
+        with assert_raises(ValueError):
             dpnp.convolve(d, k, mode=-1)
-        numpy.testing.assert_array_equal(dpnp.convolve(d, k, mode=2), full_mode)
+        assert_array_equal(dpnp.convolve(d, k, mode=2), full_mode)
         # illegal arguments
-        with numpy.testing.assert_raises(TypeError):
+        with assert_raises(TypeError):
             dpnp.convolve(d, k, mode=None)
 
 
@@ -53,33 +60,34 @@ def test_diff(array):
     dpnp_a = dpnp.array(array)
     expected = numpy.diff(np_a)
     result = dpnp.diff(dpnp_a)
-    numpy.testing.assert_allclose(expected, result)
+    assert_allclose(expected, result)
 
 
-@pytest.mark.parametrize("dtype1",
-                         [numpy.bool_, numpy.float64, numpy.float32, numpy.int64, numpy.int32, numpy.complex64, numpy.complex128],
-                         ids=['numpy.bool_', 'numpy.float64', 'numpy.float32', 'numpy.int64', 'numpy.int32', 'numpy.complex64', 'numpy.complex128'])
-@pytest.mark.parametrize("dtype2",
-                         [numpy.bool_, numpy.float64, numpy.float32, numpy.int64, numpy.int32, numpy.complex64, numpy.complex128],
-                         ids=['numpy.bool_', 'numpy.float64', 'numpy.float32', 'numpy.int64', 'numpy.int32', 'numpy.complex64', 'numpy.complex128'])
+@pytest.mark.parametrize("dtype1", get_all_dtypes())
+@pytest.mark.parametrize("dtype2", get_all_dtypes())
+@pytest.mark.parametrize("func",
+                         ['add', 'multiply'])
 @pytest.mark.parametrize("data",
                          [[[1, 2], [3, 4]]],
                          ids=['[[1, 2], [3, 4]]'])
-def test_multiply_dtype(dtype1, dtype2, data):
+def test_op_multiple_dtypes(dtype1, func, dtype2, data):
     np_a = numpy.array(data, dtype=dtype1)
     dpnp_a = dpnp.array(data, dtype=dtype1)
 
     np_b = numpy.array(data, dtype=dtype2)
     dpnp_b = dpnp.array(data, dtype=dtype2)
 
-    result = dpnp.multiply(dpnp_a, dpnp_b)
-    expected = numpy.multiply(np_a, np_b)
-    numpy.testing.assert_array_equal(result, expected)
+    result = getattr(dpnp, func)(dpnp_a, dpnp_b)
+    expected = getattr(numpy, func)(np_a, np_b)
+    assert_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("rhs", [[[1, 2, 3], [4, 5, 6]], [2.0, 1.5, 1.0], 3, 0.3])
-@pytest.mark.parametrize("lhs", [[[6, 5, 4], [3, 2, 1]], [1.3, 2.6, 3.9], 5, 0.5])
-@pytest.mark.parametrize("dtype", [numpy.int32, numpy.int64, numpy.float32, numpy.float64])
+@pytest.mark.parametrize("lhs", [[[6, 5, 4], [3, 2, 1]], [1.3, 2.6, 3.9]])
+# TODO: achieve the same level of dtype support for all mathematical operations, like
+# @pytest.mark.parametrize("dtype", get_all_dtypes())
+# and get rid of the fallbacks on numpy allowed by the fixture below
+# @pytest.mark.usefixtures("allow_fall_back_on_numpy")
 class TestMathematical:
 
     @staticmethod
@@ -98,56 +106,68 @@ def _test_mathematical(self, name, dtype, lhs, rhs):
         b = self.array_or_scalar(numpy, rhs, dtype=dtype)
         expected = getattr(numpy, name)(a, b)
 
-        numpy.testing.assert_allclose(result, expected, atol=1e-4)
+        assert_allclose(result, expected, atol=1e-4)
 
-    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
     def test_add(self, dtype, lhs, rhs):
         self._test_mathematical('add', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_arctan2(self, dtype, lhs, rhs):
         self._test_mathematical('arctan2', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_copysign(self, dtype, lhs, rhs):
         self._test_mathematical('copysign', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_divide(self, dtype, lhs, rhs):
         self._test_mathematical('divide', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_fmod(self, dtype, lhs, rhs):
         self._test_mathematical('fmod', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_floor_divide(self, dtype, lhs, rhs):
         self._test_mathematical('floor_divide', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_hypot(self, dtype, lhs, rhs):
         self._test_mathematical('hypot', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_maximum(self, dtype, lhs, rhs):
         self._test_mathematical('maximum', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_minimum(self, dtype, lhs, rhs):
         self._test_mathematical('minimum', dtype, lhs, rhs)
 
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
     def test_multiply(self, dtype, lhs, rhs):
         self._test_mathematical('multiply', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_remainder(self, dtype, lhs, rhs):
         self._test_mathematical('remainder', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_power(self, dtype, lhs, rhs):
         self._test_mathematical('power', dtype, lhs, rhs)
 
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
     def test_subtract(self, dtype, lhs, rhs):
         self._test_mathematical('subtract', dtype, lhs, rhs)
 
@@ -155,9 +175,9 @@ def test_subtract(self, dtype, lhs, rhs):
 @pytest.mark.parametrize("val_type",
                          [bool, int, float],
                          ids=['bool', 'int', 'float'])
-@pytest.mark.parametrize("data_type",
-                         [numpy.bool_, numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=['numpy.bool_', 'numpy.float64', 'numpy.float32', 'numpy.int64', 'numpy.int32'])
+@pytest.mark.parametrize("data_type", get_all_dtypes())
+@pytest.mark.parametrize("func",
+                         ['add', 'multiply'])
 @pytest.mark.parametrize("val",
                          [0, 1, 5],
                          ids=['0', '1', '5'])
@@ -172,18 +192,18 @@ def test_subtract(self, dtype, lhs, rhs):
                               '[[1, 2], [3, 4]]',
                               '[[[1, 2], [3, 4]], [[1, 2], [2, 1]], [[1, 3], [3, 1]]]',
                               '[[[[1, 2], [3, 4]], [[1, 2], [2, 1]]], [[[1, 3], [3, 1]], [[0, 1], [1, 3]]]]'])
-def test_multiply_scalar(array, val, data_type, val_type):
+def test_op_with_scalar(array, val, func, data_type, val_type):
     np_a = numpy.array(array, dtype=data_type)
     dpnp_a = dpnp.array(array, dtype=data_type)
     val_ = val_type(val)
 
-    result = dpnp.multiply(dpnp_a, val_)
-    expected = numpy.multiply(np_a, val_)
-    numpy.testing.assert_array_equal(result, expected)
+    result = getattr(dpnp, func)(dpnp_a, val_)
+    expected = getattr(numpy, func)(np_a, val_)
+    assert_array_equal(result, expected)
 
-    result = dpnp.multiply(val_, dpnp_a)
-    expected = numpy.multiply(val_, np_a)
-    numpy.testing.assert_array_equal(result, expected)
+    result = getattr(dpnp, func)(val_, dpnp_a)
+    expected = getattr(numpy, func)(val_, np_a)
+    assert_array_equal(result, expected)
 
 
 @pytest.mark.parametrize("shape",
@@ -196,9 +216,9 @@ def test_multiply_scalar2(shape, dtype):
     np_a = numpy.ones(shape, dtype=dtype)
     dpnp_a = dpnp.ones(shape, dtype=dtype)
 
-    result = 0.5 * dpnp_a
-    expected = 0.5 * np_a
-    numpy.testing.assert_array_equal(result, expected)
+    result = 0.5 * dpnp_a * 1.7
+    expected = 0.5 * np_a * 1.7
+    assert_allclose(result, expected)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -211,7 +231,7 @@ def test_nancumprod(array):
 
     result = dpnp.nancumprod(dpnp_a)
     expected = numpy.nancumprod(np_a)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -224,31 +244,25 @@ def test_nancumsum(array):
 
     result = dpnp.nancumsum(dpnp_a)
     expected = numpy.nancumsum(np_a)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 @pytest.mark.parametrize("data",
                          [[[1., -1.], [0.1, -0.1]], [-2, -1, 0, 1, 2]],
                          ids=['[[1., -1.], [0.1, -0.1]]', '[-2, -1, 0, 1, 2]'])
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=['numpy.float64', 'numpy.float32', 'numpy.int64', 'numpy.int32'])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 def test_negative(data, dtype):
     np_a = numpy.array(data, dtype=dtype)
     dpnp_a = dpnp.array(data, dtype=dtype)
 
     result = dpnp.negative(dpnp_a)
     expected = numpy.negative(np_a)
-    numpy.testing.assert_array_equal(result, expected)
+    assert_array_equal(result, expected)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@pytest.mark.parametrize("val_type",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=['numpy.float64', 'numpy.float32', 'numpy.int64', 'numpy.int32'])
-@pytest.mark.parametrize("data_type",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=['numpy.float64', 'numpy.float32', 'numpy.int64', 'numpy.int32'])
+@pytest.mark.parametrize("val_type", get_all_dtypes(no_bool=True, no_complex=True, no_none=True))
+@pytest.mark.parametrize("data_type", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("val",
                          [0, 1, 5],
                          ids=['0', '1', '5'])
@@ -269,12 +283,11 @@ def test_power(array, val, data_type, val_type):
     val_ = val_type(val)
     result = dpnp.power(dpnp_a, val_)
     expected = numpy.power(np_a, val_)
-    numpy.testing.assert_array_equal(expected, result)
+    assert_array_equal(expected, result)
 
 
 class TestEdiff1d:
-    @pytest.mark.parametrize("data_type",
-                             [numpy.float64, numpy.float32, numpy.int64, numpy.int32])
+    @pytest.mark.parametrize("data_type", get_all_dtypes(no_bool=True, no_complex=True))
     @pytest.mark.parametrize("array", [[1, 2, 4, 7, 0],
                                        [],
                                        [1],
@@ -285,7 +298,7 @@ def test_ediff1d_int(self, array, data_type):
 
         result = dpnp.ediff1d(dpnp_a)
         expected = numpy.ediff1d(np_a)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
     
     @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -297,13 +310,12 @@ def test_ediff1d_args(self):
 
         result = dpnp.ediff1d(np_a, to_end=to_end, to_begin=to_begin)
         expected = numpy.ediff1d(np_a, to_end=to_end, to_begin=to_begin)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
 class TestTrapz:
-    @pytest.mark.parametrize("data_type",
-                             [numpy.float64, numpy.float32, numpy.int64, numpy.int32])
+    @pytest.mark.parametrize("data_type", get_all_dtypes(no_bool=True, no_complex=True))
     @pytest.mark.parametrize("array", [[1, 2, 3],
                                        [[1, 2, 3], [4, 5, 6]],
                                        [1, 4, 6, 9, 10, 12],
@@ -315,12 +327,10 @@ def test_trapz_default(self, array, data_type):
 
         result = dpnp.trapz(dpnp_a)
         expected = numpy.trapz(np_a)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
-    @pytest.mark.parametrize("data_type_y",
-                             [numpy.float64, numpy.float32, numpy.int64, numpy.int32])
-    @pytest.mark.parametrize("data_type_x",
-                             [numpy.float64, numpy.float32, numpy.int64, numpy.int32])
+    @pytest.mark.parametrize("data_type_y", get_all_dtypes(no_bool=True, no_complex=True))
+    @pytest.mark.parametrize("data_type_x", get_all_dtypes(no_bool=True, no_complex=True))
     @pytest.mark.parametrize("y_array", [[1, 2, 4, 5],
                                          [1., 2.5, 6., 7.]])
     @pytest.mark.parametrize("x_array", [[2, 5, 6, 9]])
@@ -333,7 +343,7 @@ def test_trapz_with_x_params(self, y_array, x_array, data_type_y, data_type_x):
 
         result = dpnp.trapz(dpnp_y, dpnp_x)
         expected = numpy.trapz(np_y, np_x)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
     @pytest.mark.parametrize("array", [[1, 2, 3], [4, 5, 6]])
     def test_trapz_with_x_param_2ndim(self, array):
@@ -342,7 +352,7 @@ def test_trapz_with_x_param_2ndim(self, array):
 
         result = dpnp.trapz(dpnp_a, dpnp_a)
         expected = numpy.trapz(np_a, np_a)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
     @pytest.mark.parametrize("y_array", [[1, 2, 4, 5],
                                          [1., 2.5, 6., 7., ]])
@@ -353,7 +363,7 @@ def test_trapz_with_dx_params(self, y_array, dx):
 
         result = dpnp.trapz(dpnp_y, dx=dx)
         expected = numpy.trapz(np_y, dx=dx)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -388,7 +398,7 @@ def test_cross_3x3(self, x1, x2, axisa, axisb, axisc, axis):
 
         result = dpnp.cross(dpnp_x1, dpnp_x2, axisa, axisb, axisc, axis)
         expected = numpy.cross(np_x1, np_x2, axisa, axisb, axisc, axis)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
 
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
@@ -403,7 +413,7 @@ def test_gradient_y1(self, array):
 
         result = dpnp.gradient(dpnp_y)
         expected = numpy.gradient(np_y)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
     @pytest.mark.parametrize("array", [[2, 3, 6, 8, 4, 9],
                                        [3., 4., 7.5, 9.],
@@ -415,7 +425,7 @@ def test_gradient_y1_dx(self, array, dx):
 
         result = dpnp.gradient(dpnp_y, dx)
         expected = numpy.gradient(np_y, dx)
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
 
 class TestCeil:
@@ -433,7 +443,7 @@ def test_ceil(self):
         np_array = numpy.array(array_data, dtype=numpy.float64)
         expected = numpy.ceil(np_array, out=out)
 
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
     @pytest.mark.parametrize("dtype",
                              [numpy.float32, numpy.int64, numpy.int32],
@@ -473,7 +483,7 @@ def test_floor(self):
         np_array = numpy.array(array_data, dtype=numpy.float64)
         expected = numpy.floor(np_array, out=out)
 
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
     @pytest.mark.parametrize("dtype",
                              [numpy.float32, numpy.int64, numpy.int32],
@@ -513,7 +523,7 @@ def test_trunc(self):
         np_array = numpy.array(array_data, dtype=numpy.float64)
         expected = numpy.trunc(np_array, out=out)
 
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
     @pytest.mark.parametrize("dtype",
                              [numpy.float32, numpy.int64, numpy.int32],
@@ -556,7 +566,7 @@ def test_power(self):
         np_array2 = numpy.array(array2_data, dtype=numpy.float64)
         expected = numpy.power(np_array1, np_array2, out=out)
 
-        numpy.testing.assert_array_equal(expected, result)
+        assert_array_equal(expected, result)
 
     @pytest.mark.parametrize("dtype",
                              [numpy.float32, numpy.int64, numpy.int32],
diff --git a/tests/test_strides.py b/tests/test_strides.py
index 7ec1d6b3f03f..3c0d86a44a5a 100644
--- a/tests/test_strides.py
+++ b/tests/test_strides.py
@@ -1,8 +1,13 @@
 import math
 import pytest
+from .helper import get_all_dtypes
 
 import dpnp
+
 import numpy
+from numpy.testing import (
+    assert_allclose
+)
 
 
 def _getattr(ex, str_):
@@ -15,12 +20,10 @@ def _getattr(ex, str_):
 
 @pytest.mark.parametrize("func_name",
                          ['abs', ])
-@pytest.mark.parametrize("type",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=['float64', 'float32', 'int64', 'int32'])
-def test_strides(func_name, type):
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
+def test_strides(func_name, dtype):
     shape = (4, 4)
-    a = numpy.arange(shape[0] * shape[1], dtype=type).reshape(shape)
+    a = numpy.arange(shape[0] * shape[1], dtype=dtype).reshape(shape)
     a_strides = a[0::2, 0::2]
     dpa = dpnp.array(a)
     dpa_strides = dpa[0::2, 0::2]
@@ -31,7 +34,7 @@ def test_strides(func_name, type):
     numpy_func = _getattr(numpy, func_name)
     expected = numpy_func(a_strides)
 
-    numpy.testing.assert_allclose(expected, result)
+    assert_allclose(expected, result)
 
 
 @pytest.mark.parametrize("func_name",
@@ -39,9 +42,7 @@ def test_strides(func_name, type):
                           "cosh", "conjugate", "degrees", "ediff1d", "exp", "exp2", "expm1", "fabs", "floor", "log",
                           "log10", "log1p", "log2", "negative", "radians", "sign", "sin", "sinh", "sqrt", "square",
                           "tanh", "trunc"])
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=["float64", "float32", "int64", "int32"])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("shape",
                          [(10,)],
                          ids=["(10,)"])
@@ -58,12 +59,10 @@ def test_strides_1arg(func_name, dtype, shape):
     numpy_func = _getattr(numpy, func_name)
     expected = numpy_func(b)
 
-    numpy.testing.assert_allclose(result, expected)
+    assert_allclose(result, expected)
 
 
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=["float64", "float32", "int64", "int32"])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("shape",
                          [(10,)],
                          ids=["(10,)"])
@@ -80,12 +79,10 @@ def test_strides_erf(dtype, shape):
     for idx, val in enumerate(b):
         expected[idx] = math.erf(val)
 
-    numpy.testing.assert_allclose(result, expected)
+    assert_allclose(result, expected)
 
 
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=["float64", "float32", "int64", "int32"])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("shape",
                          [(10,)],
                          ids=["(10,)"])
@@ -101,12 +98,10 @@ def test_strides_reciprocal(dtype, shape):
     result = dpnp.reciprocal(dpb)
     expected = numpy.reciprocal(b)
 
-    numpy.testing.assert_allclose(result, expected, rtol=1e-06)
+    assert_allclose(result, expected, rtol=1e-06)
 
 
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=["float64", "float32", "int64", "int32"])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("shape",
                          [(10,)],
                          ids=["(10,)"])
@@ -120,14 +115,12 @@ def test_strides_tan(dtype, shape):
     result = dpnp.tan(dpb)
     expected = numpy.tan(b)
 
-    numpy.testing.assert_allclose(result, expected, rtol=1e-06)
+    assert_allclose(result, expected, rtol=1e-06)
 
 
 @pytest.mark.parametrize("func_name",
                          ["add", "arctan2", "hypot", "maximum", "minimum", "multiply", "power", "subtract"])
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=["float64", "float32", "int64", "int32"])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("shape",
                          [(3, 3)],
                          ids=["(3, 3)"])
@@ -144,7 +137,7 @@ def test_strides_2args(func_name, dtype, shape):
     numpy_func = _getattr(numpy, func_name)
     expected = numpy_func(a, b)
 
-    numpy.testing.assert_allclose(result, expected)
+    assert_allclose(result, expected)
 
 
 @pytest.mark.parametrize("func_name",
@@ -168,12 +161,10 @@ def test_strides_bitwise(func_name, dtype, shape):
     numpy_func = _getattr(numpy, func_name)
     expected = numpy_func(a, b)
 
-    numpy.testing.assert_allclose(result, expected)
+    assert_allclose(result, expected)
 
 
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=["float64", "float32", "int64", "int32"])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("shape",
                          [(3, 3)],
                          ids=["(3, 3)"])
@@ -187,13 +178,10 @@ def test_strides_copysign(dtype, shape):
     result = dpnp.copysign(dpa, dpb)
     expected = numpy.copysign(a, b)
 
-    numpy.testing.assert_allclose(result, expected)
+    assert_allclose(result, expected)
 
 
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=["float64", "float32", "int64", "int32"])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("shape",
                          [(3, 3)],
                          ids=["(3, 3)"])
@@ -207,13 +195,10 @@ def test_strides_fmod(dtype, shape):
     result = dpnp.fmod(dpa, dpb)
     expected = numpy.fmod(a, b)
 
-    numpy.testing.assert_allclose(result, expected)
+    assert_allclose(result, expected)
 
 
-@pytest.mark.usefixtures("allow_fall_back_on_numpy")
-@pytest.mark.parametrize("dtype",
-                         [numpy.float64, numpy.float32, numpy.int64, numpy.int32],
-                         ids=["float64", "float32", "int64", "int32"])
+@pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True))
 @pytest.mark.parametrize("shape",
                          [(3, 3)],
                          ids=["(3, 3)"])
@@ -227,4 +212,4 @@ def test_strides_true_devide(dtype, shape):
     result = dpnp.fmod(dpa, dpb)
     expected = numpy.fmod(a, b)
 
-    numpy.testing.assert_allclose(result, expected)
+    assert_allclose(result, expected)
diff --git a/tests/third_party/cupy/math_tests/test_arithmetic.py b/tests/third_party/cupy/math_tests/test_arithmetic.py
index 158f5cc14421..a53a8494707c 100644
--- a/tests/third_party/cupy/math_tests/test_arithmetic.py
+++ b/tests/third_party/cupy/math_tests/test_arithmetic.py
@@ -146,7 +146,7 @@ def check_binary(self, xp):
                 y = y.astype(numpy.complex64)
 
         # NumPy returns an output array of another type than DPNP when input ones have diffrent types.
-        if self.name == 'multiply' and xp is cupy:
+        if self.name in ('add', 'multiply') and xp is cupy:
             if xp.isscalar(arg1) and xp.isscalar(arg2):
                 # If both are scalars, the result will be a scalar, so needs to convert into numpy-scalar.
                 y = numpy.asarray(y)

From 430eca37f2958fb03ba8a6033d48703d129be746 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Thu, 9 Feb 2023 02:19:23 -0600
Subject: [PATCH 2/5] get rid of dpctl.SyclQueue() call in tests with
 unsupported device keyword

---
 dpnp/dpnp_iface_mathematical.py |  2 +-
 tests/skipped_tests.tbl         |  4 ++--
 tests/skipped_tests_gpu.tbl     | 36 +++++++++++++++++++++++++++++----
 tests/test_sycl_queue.py        |  2 +-
 4 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 8104fdea6735..786bbe006803 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -170,7 +170,7 @@ def add(x1,
 
     Returns
     -------
-    add : dpnp.ndarray
+    y : dpnp.ndarray
         The sum of `x1` and `x2`, element-wise.
 
     Limitations
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
index acd920580cf0..96e44cca594d 100644
--- a/tests/skipped_tests.tbl
+++ b/tests/skipped_tests.tbl
@@ -765,12 +765,12 @@ tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_para
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_547_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='remainder', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_549_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='mod', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticModf::test_modf
-
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_0_{name='reciprocal', nargs=1}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_10_{name='remainder', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_11_{name='mod', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_1_{name='angle', nargs=1}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_8_{name='floor_divide', nargs=2}::test_raises_with_numpy_input
-
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_9_{name='fmod', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestBoolSubtract_param_3_{shape=(), xp=dpnp}::test_bool_subtract
 tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp
 tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp2
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
index f34ac97fe065..af2dbd783a4e 100644
--- a/tests/skipped_tests_gpu.tbl
+++ b/tests/skipped_tests_gpu.tbl
@@ -91,7 +91,18 @@ tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesInvalidValu
 tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_0_{shape=(3, 3)}::test_diag_indices_from
 tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_1_{shape=(0, 0)}::test_diag_indices_from
 tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_2_{shape=(2, 2, 2)}::test_diag_indices_from
-
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_295_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_303_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_375_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_383_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_439_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_447_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_455_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_463_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_519_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int64), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_527_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int64), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_535_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int64), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_543_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int64), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_prod_all
 tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_prod_axis
 tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_sum_all
@@ -958,17 +969,34 @@ tests/third_party/cupy/manipulation_tests/test_tiling.py::TestTile_param_4_{reps
 tests/third_party/cupy/manipulation_tests/test_tiling.py::TestTile_param_5_{reps=(2, 3, 4, 5)}::test_array_tile
 tests/third_party/cupy/manipulation_tests/test_transpose.py::TestTranspose::test_moveaxis_invalid5_2
 tests/third_party/cupy/manipulation_tests/test_transpose.py::TestTranspose::test_moveaxis_invalid5_3
-
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_279_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_287_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_295_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_303_{arg1=array([[1., 2., 3.],       [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_359_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_367_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_375_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_383_{arg1=array([[1., 2., 3.],       [4., 5., 6.]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_439_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int32), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_447_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int32), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_455_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int32), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_463_{arg1=array([[1, 2, 3],       [4, 5, 6]], dtype=int32), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_519_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_527_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0., 1., 2.],       [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_535_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_543_{arg1=array([[1, 2, 3],       [4, 5, 6]]), arg2=array([[0, 1, 2],       [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary
-
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticModf::test_modf
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_0_{name='reciprocal', nargs=1}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_10_{name='remainder', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_11_{name='mod', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_1_{name='angle', nargs=1}::test_raises_with_numpy_input
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_4_{name='divide', nargs=2}::test_raises_with_numpy_input
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_5_{name='power', nargs=2}::test_raises_with_numpy_input
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_6_{name='subtract', nargs=2}::test_raises_with_numpy_input
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_7_{name='true_divide', nargs=2}::test_raises_with_numpy_input
 tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_8_{name='floor_divide', nargs=2}::test_raises_with_numpy_input
-
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_9_{name='fmod', nargs=2}::test_raises_with_numpy_input
+tests/third_party/cupy/math_tests/test_arithmetic.py::TestBoolSubtract_param_3_{shape=(), xp=dpnp}::test_bool_subtract
 tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp
 tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp2
 tests/third_party/cupy/math_tests/test_floating.py::TestFloating::test_copysign_float
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index 413596e2cc76..bc42f70b3700 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -426,7 +426,7 @@ def test_random_state(func, args, kwargs, device, usm_type):
     assert device == res_array.sycl_device
     assert usm_type == res_array.usm_type
 
-    sycl_queue = dpctl.SyclQueue(device=device, property="in_order")
+    sycl_queue = dpctl.SyclQueue(device, property="in_order")
 
     # test with in-order SYCL queue per a device and passed as argument
     rs = dpnp.random.RandomState((147, 56, 896), sycl_queue=sycl_queue)

From a7539e6c526da1e9b2368b21e8b84becb957afa6 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 10 Feb 2023 06:55:46 -0600
Subject: [PATCH 3/5] Add a fix for crash on CPU device

---
 .github/workflows/conda-package.yml         |  1 +
 dpnp/backend/kernels/dpnp_krnl_elemwise.cpp | 26 ++++++++++++---------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index ff25e456436d..da29bf31dd09 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -17,6 +17,7 @@ env:
       test_linalg.py
       test_mathematical.py
       test_random_state.py
+      test_special.py
       test_usm_type.py
   VER_JSON_NAME: 'version.json'
   VER_SCRIPT1: "import json; f = open('version.json', 'r'); j = json.load(f); f.close(); "
diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
index 1b90e4a6821e..4ce5ad01e9c6 100644
--- a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
@@ -1027,22 +1027,23 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
                     auto sg = nd_it.get_sub_group();                                                                   \
                     size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +                           \
                                             sg.get_group_id()[0] * sg.get_max_local_range()[0]);                       \
-                    size_t end = start + static_cast<size_t>(vec_sz);                                                  \
+                    size_t end = start + static_cast<size_t>(vec_sz) * sg.get_max_local_range()[0] - 1;                \
                                                                                                                        \
-                    if (end < result_size) {                                                                           \
+                    if (end < result_size)                                                                             \
+                    {                                                                                                  \
                         sycl::vec<_DataType_input1, vec_sz> x1 =                                                       \
                             sg.load<vec_sz>(sycl::multi_ptr<_DataType_input1, global_space>(&input1_data[start]));     \
                         sycl::vec<_DataType_input2, vec_sz> x2 =                                                       \
                             sg.load<vec_sz>(sycl::multi_ptr<_DataType_input2, global_space>(&input2_data[start]));     \
                         sycl::vec<_DataType_output, vec_sz> res_vec;                                                   \
-                                                                                                                       \
                         if constexpr (both_types_are_same<_DataType_input1, _DataType_input2, __vec_types__>)          \
                         {                                                                                              \
                             res_vec = __vec_operation__;                                                               \
                         }                                                                                              \
                         else                                                                                           \
                         {                                                                                              \
-                            for (size_t k = 0; k < vec_sz; ++k) {                                                      \
+                            for (size_t k = 0; k < vec_sz; ++k)                                                        \
+                            {                                                                                          \
                                 const _DataType_output input1_elem = x1[k];                                            \
                                 const _DataType_output input2_elem = x2[k];                                            \
                                 res_vec[k] = __operation__;                                                            \
@@ -1051,8 +1052,10 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
                         sg.store<vec_sz>(sycl::multi_ptr<_DataType_output, global_space>(&result[start]), res_vec);    \
                                                                                                                        \
                     }                                                                                                  \
-                    else {                                                                                             \
-                        for (size_t k = start; k < result_size; ++k) {                                                 \
+                    else                                                                                               \
+                    {                                                                                                  \
+                        for (size_t k = start; k < result_size; ++k)                                                   \
+                        {                                                                                              \
                             const _DataType_output input1_elem = input1_data[k];                                       \
                             const _DataType_output input2_elem = input2_data[k];                                       \
                             result[k] = __operation__;                                                                 \
@@ -1061,6 +1064,7 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
                 };                                                                                                     \
                                                                                                                        \
                 auto kernel_func = [&](sycl::handler& cgh) {                                                           \
+                    sycl::stream out(65536, 128, cgh);\
                     cgh.parallel_for<class __name__##_sg_kernel<_DataType_output, _DataType_input1, _DataType_input2>>(\
                         sycl::nd_range<1>(gws_range, lws_range), kernel_parallel_for_func);                            \
                 };                                                                                                     \
@@ -1070,11 +1074,11 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
             {                                                                                                          \
                 auto kernel_parallel_for_func = [=](sycl::id<1> global_id) {                                           \
                     const size_t i = global_id[0]; /* for (size_t i = 0; i < result_size; ++i) */                      \
-                    {                                                                                                  \
-                        const _DataType_output input1_elem = input1_data[i];                                           \
-                        const _DataType_output input2_elem = input2_data[i];                                           \
-                        result[i] = __operation__;                                                                     \
-                    }                                                                                                  \
+                                                                                                                       \
+                    const _DataType_output input1_elem = input1_data[i];                                               \
+                    const _DataType_output input2_elem = input2_data[i];                                               \
+                    result[i] = __operation__;                                                                         \
+                                                                                                                       \
                 };                                                                                                     \
                 auto kernel_func = [&](sycl::handler& cgh) {                                                           \
                     cgh.parallel_for<class __name__##_kernel<_DataType_output, _DataType_input1, _DataType_input2>>(   \

From e8edc9a9ea9db485323bca4b58d125d7eec32202 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 10 Feb 2023 08:56:24 -0600
Subject: [PATCH 4/5] USM type in operations with a scalar

---
 dpnp/dpnp_iface.py                  |  5 +-
 dpnp/dpnp_iface_logic.py            | 90 +++++++++++++++++------------
 dpnp/dpnp_iface_mathematical.py     | 20 ++++---
 dpnp/dpnp_utils/dpnp_algo_utils.pyx | 49 +++++++++++-----
 tests/test_usm_type.py              | 42 ++++++++++----
 5 files changed, 134 insertions(+), 72 deletions(-)

diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index 4806b511aff4..1c60d1c999e1 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -225,6 +225,7 @@ def default_float_type(device=None, sycl_queue=None):
 def get_dpnp_descriptor(ext_obj,
                         copy_when_strides=True,
                         copy_when_nondefault_queue=True,
+                        alloc_usm_type=None,
                         alloc_queue=None):
     """
     Return True:
@@ -245,9 +246,9 @@ def get_dpnp_descriptor(ext_obj,
         return False
 
     # If input object is a scalar, it means it was allocated on host memory.
-    # We need to copy it to device memory according to compute follows data paradigm.
+    # We need to copy it to USM memory according to compute follows data paradigm.
     if isscalar(ext_obj):
-        ext_obj = array(ext_obj, sycl_queue=alloc_queue)
+        ext_obj = array(ext_obj, usm_type=alloc_usm_type, sycl_queue=alloc_queue)
 
     # while dpnp functions have no implementation with strides support
     # we need to create a non-strided copy
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index de7537a42878..e94b0f6c1efb 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -277,11 +277,13 @@ def equal(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_equal(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.equal, x1, x2)
@@ -345,11 +347,13 @@ def greater(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_greater(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.greater, x1, x2)
@@ -413,11 +417,13 @@ def greater_equal(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_greater_equal(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.greater_equal, x1, x2)
@@ -659,11 +665,13 @@ def less(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_less(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.less, x1, x2)
@@ -727,11 +735,13 @@ def less_equal(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_less_equal(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.less_equal, x1, x2)
@@ -794,11 +804,13 @@ def logical_and(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_logical_and(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.logical_and, x1, x2)
@@ -918,11 +930,13 @@ def logical_or(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_logical_or(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.logical_or, x1, x2)
@@ -985,11 +999,13 @@ def logical_xor(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_logical_xor(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.logical_xor, x1, x2)
@@ -1053,11 +1069,13 @@ def not_equal(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_not_equal(x1_desc, x2_desc).get_pyobj()
     return call_origin(numpy.not_equal, x1, x2)
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 786bbe006803..e254e916b846 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -205,11 +205,13 @@ def add(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_add(x1_desc, x2_desc, dtype=dtype, out=out, where=where).get_pyobj()
 
@@ -1133,11 +1135,13 @@ def multiply(x1,
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get a common queue to copy data from the host into a device if any input is scalar
-        queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None
+        # get USM type and queue to copy scalar from the host memory into a USM allocation
+        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
 
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue)
+        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
+        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
+                                           alloc_usm_type=usm_type, alloc_queue=queue)
         if x1_desc and x2_desc:
             return dpnp_multiply(x1_desc, x2_desc, dtype=dtype, out=out, where=where).get_pyobj()
 
diff --git a/dpnp/dpnp_utils/dpnp_algo_utils.pyx b/dpnp/dpnp_utils/dpnp_algo_utils.pyx
index 4913d5854918..abdc4107f649 100644
--- a/dpnp/dpnp_utils/dpnp_algo_utils.pyx
+++ b/dpnp/dpnp_utils/dpnp_algo_utils.pyx
@@ -34,7 +34,7 @@ This module contains differnt helpers and utilities
 import numpy
 
 import dpctl
-import dpctl.tensor as dpt
+import dpctl.utils as dpu
 
 import dpnp.config as config
 import dpnp.dpnp_container as dpnp_container
@@ -70,7 +70,7 @@ __all__ = [
     "dpnp_descriptor",
     "get_axis_indeces",
     "get_axis_offsets",
-    "get_common_allocation_queue",
+    "get_usm_allocations",
     "_get_linear_index",
     "map_dtype_to_device",
     "normalize_axis",
@@ -163,7 +163,7 @@ def call_origin(function, *args, **kwargs):
         kwargx = convert_item(kwarg)
         kwargs_new[key] = kwargx
 
-    exec_q = dpctl.utils.get_execution_queue(alloc_queues)
+    exec_q = dpu.get_execution_queue(alloc_queues)
     if exec_q is None:
         exec_q = dpnp.get_normalized_queue_device(sycl_queue=sycl_queue)
     # print(f"DPNP call_origin(): bakend called. \n\t function={function}, \n\t args_new={args_new}, \n\t kwargs_new={kwargs_new}, \n\t dpnp_inplace={dpnp_inplace}")
@@ -220,30 +220,49 @@ def unwrap_array(x1):
     return x1
 
 
-def get_common_allocation_queue(objects):
-    """
-    Given a list of objects returns the queue which can be used for a memory allocation
-    to follow compute follows data paradigm, or returns `None` if the default queue can be used.
-    An exception will be raised, if the paradigm is broked for the given list of objects.
-    """
-    if not isinstance(objects, (list, tuple)):
-        raise TypeError("Expected a list or a tuple, got {}".format(type(objects)))
-    
-    if len(objects) == 0:
+def _get_coerced_usm_type(objects):
+    types_in_use = [obj.usm_type for obj in objects if hasattr(obj, "usm_type")]
+    if len(types_in_use) == 0:
         return None
+    elif len(types_in_use) == 1:
+        return types_in_use[0]
+
+    common_usm_type = dpu.get_coerced_usm_type(types_in_use)
+    if common_usm_type is None:
+        raise ValueError("Input arrays must have coerced USM types")
+    return common_usm_type
 
+
+def _get_common_allocation_queue(objects):
     queues_in_use = [obj.sycl_queue for obj in objects if hasattr(obj, "sycl_queue")]
     if len(queues_in_use) == 0:
         return None
     elif len(queues_in_use) == 1:
         return queues_in_use[0]
 
-    common_queue = dpt.get_execution_queue(queues_in_use)
+    common_queue = dpu.get_execution_queue(queues_in_use)
     if common_queue is None:
         raise ValueError("Input arrays must be allocated on the same SYCL queue")
     return common_queue
 
 
+def get_usm_allocations(objects):
+    """
+    Given a list of objects, returns a tuple of USM type and SYCL queue
+    which can be used for a memory allocation following the compute follows data paradigm,
+    or returns `(None, None)` if the default USM type and SYCL queue can be used.
+    An exception will be raised if the paradigm is broken for the given list of objects.
+
+    """
+
+    if not isinstance(objects, (list, tuple)):
+        raise TypeError("Expected a list or a tuple, got {}".format(type(objects)))
+    
+    if len(objects) == 0:
+        return (None, None)
+    return (_get_coerced_usm_type(objects), _get_common_allocation_queue(objects))
+
+
 def map_dtype_to_device(dtype, device):
     """
     Map an input ``dtype`` with type ``device`` may use
@@ -631,7 +650,7 @@ cdef tuple get_common_usm_allocation(dpnp_descriptor x1, dpnp_descriptor x2):
             "could not recognize common USM type for inputs of USM types {} and {}"
             "".format(array1_obj.usm_type, array2_obj.usm_type))
 
-    common_sycl_queue = dpctl.utils.get_execution_queue((array1_obj.sycl_queue, array2_obj.sycl_queue))
+    common_sycl_queue = dpu.get_execution_queue((array1_obj.sycl_queue, array2_obj.sycl_queue))
     # TODO: refactor, remove when CFD is implemented in all array constructors
     if common_sycl_queue is None and array1_obj.sycl_context == array2_obj.sycl_context:
         common_sycl_queue = array1_obj.sycl_queue
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
index 094fe419c263..15b853b3bfa2 100644
--- a/tests/test_usm_type.py
+++ b/tests/test_usm_type.py
@@ -11,16 +11,17 @@
 ]
 
 
-@pytest.mark.parametrize("usm_type", list_of_usm_types, ids=list_of_usm_types)
-def test_coerced_usm_types_sum(usm_type):
-    x = dp.arange(10, usm_type = "device")
-    y = dp.arange(10, usm_type = usm_type)
+@pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize("usm_type_y", list_of_usm_types, ids=list_of_usm_types)
+def test_coerced_usm_types_sum(usm_type_x, usm_type_y):
+    x = dp.arange(1000, usm_type = usm_type_x)
+    y = dp.arange(1000, usm_type = usm_type_y)
 
-    z = x + y
-    
-    assert z.usm_type == x.usm_type
-    assert z.usm_type == "device"
-    assert y.usm_type == usm_type
+    z = 1.3 + x + y + 2
+
+    assert x.usm_type == usm_type_x
+    assert y.usm_type == usm_type_y
+    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
 
 
 @pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)
@@ -29,8 +30,8 @@ def test_coerced_usm_types_mul(usm_type_x, usm_type_y):
     x = dp.arange(10, usm_type = usm_type_x)
     y = dp.arange(10, usm_type = usm_type_y)
 
-    z = x * y
-    
+    z = 3 * x * y * 1.5
+
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
     assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])
@@ -61,3 +62,22 @@ def test_array_creation(func, args, usm_type_x, usm_type_y):
 
     assert x.usm_type == usm_type_x
     assert y.usm_type == usm_type_y
+
+@pytest.mark.parametrize("op",
+                         ['equal', 'greater', 'greater_equal', 'less', 'less_equal',
+                          'logical_and', 'logical_or', 'logical_xor', 'not_equal'],
+                         ids=['equal', 'greater', 'greater_equal', 'less', 'less_equal',
+                              'logical_and', 'logical_or', 'logical_xor', 'not_equal'])
+@pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types)
+@pytest.mark.parametrize("usm_type_y", list_of_usm_types, ids=list_of_usm_types)
+def test_coerced_usm_types_logic_op(op, usm_type_x, usm_type_y):
+    x = dp.arange(100, usm_type = usm_type_x)
+    y = dp.arange(100, usm_type = usm_type_y)[::-1]
+
+    z = getattr(dp, op)(x, y)
+    zx = getattr(dp, op)(x, 50)
+    zy = getattr(dp, op)(30, y)
+
+    assert x.usm_type == zx.usm_type == usm_type_x
+    assert y.usm_type == zy.usm_type == usm_type_y
+    assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y])

From 2f68eb18e4cea16b79d0afd224a075ad29dc32f4 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 10 Feb 2023 10:01:47 -0600
Subject: [PATCH 5/5] Porting fix for crash to logic kernel

---
 dpnp/backend/kernels/dpnp_krnl_elemwise.cpp |  8 ++++----
 dpnp/backend/kernels/dpnp_krnl_logic.cpp    | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
index 4ce5ad01e9c6..32097d321a71 100644
--- a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
@@ -1025,11 +1025,11 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap)
                                                                                                                        \
                 auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {                                          \
                     auto sg = nd_it.get_sub_group();                                                                   \
-                    size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +                           \
-                                            sg.get_group_id()[0] * sg.get_max_local_range()[0]);                       \
-                    size_t end = start + static_cast<size_t>(vec_sz) * sg.get_max_local_range()[0] - 1;                \
+                    const auto max_sg_size = sg.get_max_local_range()[0];                                              \
+                    const size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +                     \
+                                                   sg.get_group_id()[0] * max_sg_size);                                \
                                                                                                                        \
-                    if (end < result_size)                                                                             \
+                    if (start + static_cast<size_t>(vec_sz) * max_sg_size < result_size)                               \
                     {                                                                                                  \
                         sycl::vec<_DataType_input1, vec_sz> x1 =                                                       \
                             sg.load<vec_sz>(sycl::multi_ptr<_DataType_input1, global_space>(&input1_data[start]));     \
diff --git a/dpnp/backend/kernels/dpnp_krnl_logic.cpp b/dpnp/backend/kernels/dpnp_krnl_logic.cpp
index be1bb1bab79b..157347aa90c0 100644
--- a/dpnp/backend/kernels/dpnp_krnl_logic.cpp
+++ b/dpnp/backend/kernels/dpnp_krnl_logic.cpp
@@ -403,11 +403,11 @@ DPCTLSyclEventRef (*dpnp_any_ext_c)(DPCTLSyclQueueRef,
                                                                                                                        \
             auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {                                              \
                 auto sg = nd_it.get_sub_group();                                                                       \
-                size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +                               \
-                                         sg.get_group_id()[0] * sg.get_max_local_range()[0]);                          \
-                size_t end = start + static_cast<size_t>(vec_sz);                                                      \
+                const auto max_sg_size = sg.get_max_local_range()[0];                                                  \
+                const size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +                         \
+                                               sg.get_group_id()[0] * max_sg_size);                                    \
                                                                                                                        \
-                if (end < result_size) {                                                                               \
+                if (start + static_cast<size_t>(vec_sz) * max_sg_size < result_size) {                                 \
                     sycl::vec<_DataType_input1, vec_sz> x1 =                                                           \
                         sg.load<vec_sz>(sycl::multi_ptr<_DataType_input1, global_space>(&input1_data[start]));         \
                     sycl::vec<bool, vec_sz> res_vec;                                                                   \
@@ -647,11 +647,11 @@ static void func_map_logic_1arg_1type_helper(func_map_t& fmap)
                                                                                                                        \
             auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {                                              \
                 auto sg = nd_it.get_sub_group();                                                                       \
-                size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +                               \
-                                         sg.get_group_id()[0] * sg.get_max_local_range()[0]);                          \
-                size_t end = start + static_cast<size_t>(vec_sz);                                                      \
+                const auto max_sg_size = sg.get_max_local_range()[0];                                                  \
+                const size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +                         \
+                                               sg.get_group_id()[0] * max_sg_size);                                    \
                                                                                                                        \
-                if (end < result_size) {                                                                               \
+                if (start + static_cast<size_t>(vec_sz) * max_sg_size < result_size) {                                 \
                     sycl::vec<_DataType_input1, vec_sz> x1 =                                                           \
                         sg.load<vec_sz>(sycl::multi_ptr<_DataType_input1, global_space>(&input1_data[start]));         \
                     sycl::vec<_DataType_input2, vec_sz> x2 =                                                           \