From e26c3f1f3a92321f24cd3646d6a5de9635dc7821 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Fri, 3 Feb 2023 05:39:40 -0600 Subject: [PATCH 1/7] dpnp.add() doesn't work properly with a scalar --- tests/skipped_tests.tbl | 6 ++---- tests/skipped_tests_gpu.tbl | 34 ++++------------------------------ 2 files changed, 6 insertions(+), 34 deletions(-) diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl index 53bdec8af0a4..98ed90a6d890 100644 --- a/tests/skipped_tests.tbl +++ b/tests/skipped_tests.tbl @@ -765,16 +765,14 @@ tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_para tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_547_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='remainder', use_dtype=False}::test_binary tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_549_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='mod', use_dtype=False}::test_binary tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticModf::test_modf -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_0_{name='reciprocal', nargs=1}::test_raises_with_numpy_input + tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_10_{name='remainder', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_11_{name='mod', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_1_{name='angle', nargs=1}::test_raises_with_numpy_input -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_4_{name='divide', nargs=2}::test_raises_with_numpy_input 
tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_5_{name='power', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_6_{name='subtract', nargs=2}::test_raises_with_numpy_input -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_7_{name='true_divide', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_8_{name='floor_divide', nargs=2}::test_raises_with_numpy_input -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_9_{name='fmod', nargs=2}::test_raises_with_numpy_input + tests/third_party/cupy/math_tests/test_arithmetic.py::TestBoolSubtract_param_3_{shape=(), xp=dpnp}::test_bool_subtract tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp2 diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl index af2dbd783a4e..c1bb4aa3b8fa 100644 --- a/tests/skipped_tests_gpu.tbl +++ b/tests/skipped_tests_gpu.tbl @@ -91,18 +91,7 @@ tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesInvalidValu tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_0_{shape=(3, 3)}::test_diag_indices_from tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_1_{shape=(0, 0)}::test_diag_indices_from tests/third_party/cupy/indexing_tests/test_insert.py::TestDiagIndicesFrom_param_2_{shape=(2, 2, 2)}::test_diag_indices_from -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_295_{arg1=array([[1., 2., 3.], [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary 
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_303_{arg1=array([[1., 2., 3.], [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2], [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_375_{arg1=array([[1., 2., 3.], [4., 5., 6.]]), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_383_{arg1=array([[1., 2., 3.], [4., 5., 6.]]), arg2=array([[0, 1, 2], [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_439_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0., 1., 2.], [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_447_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0., 1., 2.], [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_455_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_463_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0, 1, 2], [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_519_{arg1=array([[1, 2, 3], [4, 5, 6]], dtype=int64), arg2=array([[0., 1., 2.], [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary 
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_527_{arg1=array([[1, 2, 3], [4, 5, 6]], dtype=int64), arg2=array([[0., 1., 2.], [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_535_{arg1=array([[1, 2, 3], [4, 5, 6]], dtype=int64), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_543_{arg1=array([[1, 2, 3], [4, 5, 6]], dtype=int64), arg2=array([[0, 1, 2], [3, 4, 5]], dtype=int64), dtype=float64, name='floor_divide', use_dtype=False}::test_binary + tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_prod_all tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_prod_axis tests/third_party/cupy/math_tests/test_sumprod.py::TestSumprod::test_external_sum_all @@ -969,34 +958,19 @@ tests/third_party/cupy/manipulation_tests/test_tiling.py::TestTile_param_4_{reps tests/third_party/cupy/manipulation_tests/test_tiling.py::TestTile_param_5_{reps=(2, 3, 4, 5)}::test_array_tile tests/third_party/cupy/manipulation_tests/test_transpose.py::TestTranspose::test_moveaxis_invalid5_2 tests/third_party/cupy/manipulation_tests/test_transpose.py::TestTranspose::test_moveaxis_invalid5_3 -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_279_{arg1=array([[1., 2., 3.], [4., 5., 6.]], dtype=float32), arg2=array([[0., 1., 2.], [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_287_{arg1=array([[1., 2., 3.], [4., 5., 6.]], dtype=float32), arg2=array([[0., 1., 2.], [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary 
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_295_{arg1=array([[1., 2., 3.], [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2], [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_303_{arg1=array([[1., 2., 3.], [4., 5., 6.]], dtype=float32), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_359_{arg1=array([[1., 2., 3.], [4., 5., 6.]]), arg2=array([[0., 1., 2.], [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_367_{arg1=array([[1., 2., 3.], [4., 5., 6.]]), arg2=array([[0., 1., 2.], [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_375_{arg1=array([[1., 2., 3.], [4., 5., 6.]]), arg2=array([[0, 1, 2], [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_383_{arg1=array([[1., 2., 3.], [4., 5., 6.]]), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_439_{arg1=array([[1, 2, 3], [4, 5, 6]], dtype=int32), arg2=array([[0., 1., 2.], [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_447_{arg1=array([[1, 2, 3], [4, 5, 6]], dtype=int32), arg2=array([[0., 1., 2.], [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary + 
tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_455_{arg1=array([[1, 2, 3], [4, 5, 6]], dtype=int32), arg2=array([[0, 1, 2], [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_463_{arg1=array([[1, 2, 3], [4, 5, 6]], dtype=int32), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_519_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0., 1., 2.], [3., 4., 5.]], dtype=float32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_527_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0., 1., 2.], [3., 4., 5.]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_535_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0, 1, 2], [3, 4, 5]], dtype=int32), dtype=float64, name='floor_divide', use_dtype=False}::test_binary tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticBinary2_param_543_{arg1=array([[1, 2, 3], [4, 5, 6]]), arg2=array([[0, 1, 2], [3, 4, 5]]), dtype=float64, name='floor_divide', use_dtype=False}::test_binary -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticModf::test_modf -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_0_{name='reciprocal', nargs=1}::test_raises_with_numpy_input + tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_10_{name='remainder', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_11_{name='mod', nargs=2}::test_raises_with_numpy_input 
tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_1_{name='angle', nargs=1}::test_raises_with_numpy_input -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_4_{name='divide', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_5_{name='power', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_6_{name='subtract', nargs=2}::test_raises_with_numpy_input -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_7_{name='true_divide', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_8_{name='floor_divide', nargs=2}::test_raises_with_numpy_input -tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_9_{name='fmod', nargs=2}::test_raises_with_numpy_input -tests/third_party/cupy/math_tests/test_arithmetic.py::TestBoolSubtract_param_3_{shape=(), xp=dpnp}::test_bool_subtract + tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp2 tests/third_party/cupy/math_tests/test_floating.py::TestFloating::test_copysign_float From 0b1345da3d35684aefee6775e32521573c7c4a00 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Tue, 7 Feb 2023 07:24:44 -0600 Subject: [PATCH 2/7] dpnp.subtract() doesn't work properly with a scalar --- .../include/dpnp_gen_2arg_3type_tbl.hpp | 6 +- dpnp/backend/kernels/dpnp_krnl_elemwise.cpp | 39 ++------ dpnp/dpnp_array.py | 4 +- dpnp/dpnp_iface_mathematical.py | 83 +++++++++-------- tests/skipped_tests.tbl | 3 +- tests/skipped_tests_gpu.tbl | 4 - tests/test_arraycreation.py | 4 +- tests/test_mathematical.py | 89 ++++++++++++++----- 
.../cupy/creation_tests/test_from_data.py | 1 + .../cupy/math_tests/test_arithmetic.py | 48 +++++----- 10 files changed, 148 insertions(+), 133 deletions(-) diff --git a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp index 33f5e0d19a46..6d8b0a49340e 100644 --- a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp +++ b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp @@ -184,9 +184,9 @@ MACRO_2ARG_3TYPES_OP(dpnp_power_c, MACRO_2ARG_3TYPES_OP(dpnp_subtract_c, input1_elem - input2_elem, - nullptr, - std::false_type, + sycl::sub_sat(x1, x2), + MACRO_UNPACK_TYPES(int, long), oneapi::mkl::vm::sub, - MACRO_UNPACK_TYPES(float, double)) + MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) #undef MACRO_2ARG_3TYPES_OP diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp index 32097d321a71..31da58c98ff0 100644 --- a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp @@ -1193,6 +1193,12 @@ static void func_map_elemwise_2arg_3type_core(func_map_t& fmap) func_type_map_t::find_type, func_type_map_t::find_type>}), ...); + ((fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][FT1][FTs] = + {populate_func_types(), + (void*)dpnp_subtract_c_ext()>, + func_type_map_t::find_type, + func_type_map_t::find_type>}), + ...); } template @@ -1878,39 +1884,6 @@ static void func_map_init_elemwise_2arg_3type(func_map_t& fmap) fmap[DPNPFuncName::DPNP_FN_SUBTRACT][eft_DBL][eft_DBL] = { eft_DBL, (void*)dpnp_subtract_c_default}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_INT][eft_INT] = { - eft_INT, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_INT][eft_LNG] = { - eft_LNG, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_INT][eft_FLT] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_INT][eft_DBL] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - 
fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_LNG][eft_INT] = { - eft_LNG, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_LNG][eft_LNG] = { - eft_LNG, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_LNG][eft_FLT] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_LNG][eft_DBL] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_FLT][eft_INT] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_FLT][eft_LNG] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_FLT][eft_FLT] = { - eft_FLT, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_FLT][eft_DBL] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_DBL][eft_INT] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_DBL][eft_LNG] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_DBL][eft_FLT] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - fmap[DPNPFuncName::DPNP_FN_SUBTRACT_EXT][eft_DBL][eft_DBL] = { - eft_DBL, (void*)dpnp_subtract_c_ext}; - func_map_elemwise_2arg_3type_helper(fmap); return; diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index a60464583cab..2843a044b044 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -270,7 +270,9 @@ def __rmul__(self, other): # '__rpow__', # '__rrshift__', # '__rshift__', - # '__rsub__', + + def __rsub__(self, other): + return dpnp.subtract(other, self) def __rtruediv__(self, other): return dpnp.true_divide(other, self) diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index e254e916b846..41657fb2d593 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -215,7 +215,7 @@ def add(x1, if x1_desc and x2_desc: return dpnp_add(x1_desc, x2_desc, dtype=dtype, 
out=out, where=where).get_pyobj() - return call_origin(numpy.add, x1, x2, dtype=dtype, out=out, where=where, **kwargs) + return call_origin(numpy.add, x1, x2, out=out, where=where, dtype=dtype, subok=subok, **kwargs) def around(x1, decimals=0, out=None): @@ -1145,7 +1145,7 @@ def multiply(x1, if x1_desc and x2_desc: return dpnp_multiply(x1_desc, x2_desc, dtype=dtype, out=out, where=where).get_pyobj() - return call_origin(numpy.multiply, x1, x2, dtype=dtype, out=out, where=where, **kwargs) + return call_origin(numpy.multiply, x1, x2, out=out, where=where, dtype=dtype, subok=subok, **kwargs) def nancumprod(x1, **kwargs): @@ -1520,60 +1520,67 @@ def sign(x1, **kwargs): return call_origin(numpy.sign, x1, **kwargs) -def subtract(x1, x2, dtype=None, out=None, where=True, **kwargs): +def subtract(x1, + x2, + /, + out=None, + *, + where=True, + dtype=None, + subok=True, + **kwargs): """ Subtract arguments, element-wise. For full documentation refer to :obj:`numpy.subtract`. + Returns + ------- + y : dpnp.ndarray + The difference of `x1` and `x2`, element-wise. + Limitations ----------- - Parameters ``x1`` and ``x2`` are supported as either :obj:`dpnp.ndarray` or scalar. - Parameters ``dtype``, ``out`` and ``where`` are supported with their default values. + Parameters `x1` and `x2` are supported as either :class:`dpnp.ndarray` or scalar, + but not both (at least either `x1` or `x2` should be as :class:`dpnp.ndarray`). + Parameters `out`, `where`, `dtype` and `subok` are supported with their default values. Keyword arguments ``kwargs`` are currently unsupported. - Otherwise the functions will be executed sequentially on CPU. + Otherwise the function will be executed sequentially on CPU. Input array data types are limited by supported DPNP :ref:`Data types`. 
Example ------- - >>> import dpnp as np - >>> result = np.subtract(np.array([4, 3]), np.array([2, 7])) - >>> [x for x in result] + >>> import dpnp as dp + >>> result = dp.subtract(dp.array([4, 3]), dp.array([2, 7])) + >>> print(result) [2, -4] """ - x1_is_scalar = dpnp.isscalar(x1) - x2_is_scalar = dpnp.isscalar(x2) - x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False) - x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False) + if out is not None: + pass + elif where is not True: + pass + elif dtype is not None: + pass + elif subok is not True: + pass + elif dpnp.isscalar(x1) and dpnp.isscalar(x2): + # at least either x1 or x2 has to be an array + pass + else: + # get a common queue to copy data from the host into a device if any input is scalar + queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None - if x1_desc and x2_desc and not kwargs: - if not x1_desc and not x1_is_scalar: - pass - elif not x2_desc and not x2_is_scalar: - pass - elif x1_is_scalar and x2_is_scalar: - pass - elif x1_desc and x1_desc.ndim == 0: - pass - elif x1_desc and x1_desc.dtype == dpnp.bool: - pass - elif x2_desc and x2_desc.ndim == 0: - pass - elif x2_desc and x2_desc.dtype == dpnp.bool: - pass - elif dtype is not None: - pass - elif out is not None: - pass - elif not where: - pass - else: - out_desc = dpnp.get_dpnp_descriptor(out, copy_when_nondefault_queue=False) if out is not None else None - return dpnp_subtract(x1_desc, x2_desc, dtype=dtype, out=out_desc, where=where).get_pyobj() + x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue) + x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue) + if x1_desc and x2_desc: + if x1_desc.dtype == x2_desc.dtype == dpnp.bool: + raise TypeError("DPNP boolean subtract, the `-` operator, is not 
supported, " + "use the bitwise_xor, the `^` operator, or the logical_xor function instead.") + return dpnp_subtract(x1_desc, x2_desc, dtype=dtype, out=out, where=where).get_pyobj() - return call_origin(numpy.subtract, x1, x2, dtype=dtype, out=out, where=where, **kwargs) + return call_origin(numpy.subtract, x1, x2, out=out, where=where, dtype=dtype, subok=subok, **kwargs) def sum(x1, axis=None, dtype=None, out=None, keepdims=False, initial=None, where=True): diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl index 98ed90a6d890..690908638b69 100644 --- a/tests/skipped_tests.tbl +++ b/tests/skipped_tests.tbl @@ -389,7 +389,7 @@ tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_asar tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_ascontiguousarray_on_noncontiguous_array tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_asfortranarray_cuda_array_zero_dim tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_asfortranarray_cuda_array_zero_dim_dtype -tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_fromfile + tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid0 tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid1 tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid2 @@ -773,7 +773,6 @@ tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNu tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_6_{name='subtract', nargs=2}::test_raises_with_numpy_input tests/third_party/cupy/math_tests/test_arithmetic.py::TestArithmeticRaisesWithNumpyInput_param_8_{name='floor_divide', nargs=2}::test_raises_with_numpy_input 
-tests/third_party/cupy/math_tests/test_arithmetic.py::TestBoolSubtract_param_3_{shape=(), xp=dpnp}::test_bool_subtract tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp tests/third_party/cupy/math_tests/test_explog.py::TestExplog::test_logaddexp2 tests/third_party/cupy/math_tests/test_floating.py::TestFloating::test_copysign_float diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl index c1bb4aa3b8fa..932c939743ae 100644 --- a/tests/skipped_tests_gpu.tbl +++ b/tests/skipped_tests_gpu.tbl @@ -18,7 +18,6 @@ tests/test_sycl_queue.py::test_1in_1out[opencl:gpu:0-conjugate-data2] tests/test_sycl_queue.py::test_1in_1out[opencl:gpu:0-copy-data3] tests/test_sycl_queue.py::test_1in_1out[opencl:gpu:0-cumprod-data4] tests/test_sycl_queue.py::test_1in_1out[opencl:gpu:0-cumsum-data5] -tests/test_sycl_queue.py::test_1in_1out[opencl:gpu:0-diff-data6] tests/test_sycl_queue.py::test_1in_1out[opencl:gpu:0-ediff1d-data7] tests/test_sycl_queue.py::test_1in_1out[opencl:gpu:0-fabs-data8] tests/test_sycl_queue.py::test_1in_1out[opencl:gpu:0-floor-data9] @@ -29,11 +28,9 @@ tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-conjugate-data2] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-copy-data3] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-cumprod-data4] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-cumsum-data5] -tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-diff-data6] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-ediff1d-data7] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-fabs-data8] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-floor-data9] -tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-gradient-data10] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-nancumprod-data11] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-nancumsum-data12] tests/test_sycl_queue.py::test_1in_1out[level_zero:gpu:0-nanprod-data13] @@ -554,7 +551,6 @@ 
tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_asar tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_ascontiguousarray_on_noncontiguous_array tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_asfortranarray_cuda_array_zero_dim tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_asfortranarray_cuda_array_zero_dim_dtype -tests/third_party/cupy/creation_tests/test_from_data.py::TestFromData::test_fromfile tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid0 tests/third_party/cupy/creation_tests/test_ranges.py::TestMeshgrid_param_0_{copy=False, indexing='xy', sparse=False}::test_meshgrid1 diff --git a/tests/test_arraycreation.py b/tests/test_arraycreation.py index 833ea6109c3c..7216f934b3ef 100644 --- a/tests/test_arraycreation.py +++ b/tests/test_arraycreation.py @@ -109,7 +109,7 @@ def test_frombuffer(dtype): @pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("dtype", get_all_dtypes(no_float16=False)) +@pytest.mark.parametrize("dtype", get_all_dtypes()) def test_fromfile(dtype): with tempfile.TemporaryFile() as fh: fh.write(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08") @@ -275,6 +275,7 @@ def test_tri_default_dtype(): '[[1, 2], [3, 4]]', '[[0, 1, 2], [3, 4, 5], [6, 7, 8]]', '[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]']) +# TODO: add fixture 'allow_fall_back_on_numpy' and remove operator.index() def test_tril(m, k): a = numpy.array(m) ia = dpnp.array(a) @@ -295,6 +296,7 @@ def test_tril(m, k): '[[1, 2], [3, 4]]', '[[0, 1, 2], [3, 4, 5], [6, 7, 8]]', '[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]']) +# TODO: add fixture 'allow_fall_back_on_numpy' and remove operator.index() def test_triu(m, k): a = numpy.array(m) ia = dpnp.array(a) diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py index 70e0bd73dc50..6f7ee58c0380 100644 --- a/tests/test_mathematical.py +++ 
b/tests/test_mathematical.py @@ -66,7 +66,7 @@ def test_diff(array): @pytest.mark.parametrize("dtype1", get_all_dtypes()) @pytest.mark.parametrize("dtype2", get_all_dtypes()) @pytest.mark.parametrize("func", - ['add', 'multiply']) + ['add', 'multiply', 'subtract']) @pytest.mark.parametrize("data", [[[1, 2], [3, 4]]], ids=['[[1, 2], [3, 4]]']) @@ -77,9 +77,14 @@ def test_op_multiple_dtypes(dtype1, func, dtype2, data): np_b = numpy.array(data, dtype=dtype2) dpnp_b = dpnp.array(data, dtype=dtype2) - result = getattr(dpnp, func)(dpnp_a, dpnp_b) - expected = getattr(numpy, func)(np_a, np_b) - assert_array_equal(result, expected) + if func == 'subtract' and (dtype1 == dtype2 == dpnp.bool): + with pytest.raises(TypeError): + result = getattr(dpnp, func)(dpnp_a, dpnp_b) + expected = getattr(numpy, func)(np_a, np_b) + else: + result = getattr(dpnp, func)(dpnp_a, dpnp_b) + expected = getattr(numpy, func)(np_a, np_b) + assert_array_equal(result, expected) @pytest.mark.parametrize("rhs", [[[1, 2, 3], [4, 5, 6]], [2.0, 1.5, 1.0], 3, 0.3]) @@ -98,15 +103,20 @@ def array_or_scalar(xp, data, dtype=None): return xp.array(data, dtype=dtype) def _test_mathematical(self, name, dtype, lhs, rhs): - a = self.array_or_scalar(dpnp, lhs, dtype=dtype) - b = self.array_or_scalar(dpnp, rhs, dtype=dtype) - result = getattr(dpnp, name)(a, b) + a_dpnp = self.array_or_scalar(dpnp, lhs, dtype=dtype) + b_dpnp = self.array_or_scalar(dpnp, rhs, dtype=dtype) - a = self.array_or_scalar(numpy, lhs, dtype=dtype) - b = self.array_or_scalar(numpy, rhs, dtype=dtype) - expected = getattr(numpy, name)(a, b) + a_np = self.array_or_scalar(numpy, lhs, dtype=dtype) + b_np = self.array_or_scalar(numpy, rhs, dtype=dtype) - assert_allclose(result, expected, atol=1e-4) + if name == 'subtract' and not numpy.isscalar(rhs) and dtype == dpnp.bool: + with pytest.raises(TypeError): + result = getattr(dpnp, name)(a_dpnp, b_dpnp) + expected = getattr(numpy, name)(a_np, b_np) + else: + result = getattr(dpnp, name)(a_dpnp, 
b_dpnp) + expected = getattr(numpy, name)(a_np, b_np) + assert_allclose(result, expected, atol=1e-4) @pytest.mark.parametrize("dtype", get_all_dtypes()) def test_add(self, dtype, lhs, rhs): @@ -166,8 +176,7 @@ def test_remainder(self, dtype, lhs, rhs): def test_power(self, dtype, lhs, rhs): self._test_mathematical('power', dtype, lhs, rhs) - @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True)) + @pytest.mark.parametrize("dtype", get_all_dtypes()) def test_subtract(self, dtype, lhs, rhs): self._test_mathematical('subtract', dtype, lhs, rhs) @@ -177,7 +186,7 @@ def test_subtract(self, dtype, lhs, rhs): ids=['bool', 'int', 'float']) @pytest.mark.parametrize("data_type", get_all_dtypes()) @pytest.mark.parametrize("func", - ['add', 'multiply']) + ['add', 'multiply', 'subtract']) @pytest.mark.parametrize("val", [0, 1, 5], ids=['0', '1', '5']) @@ -197,22 +206,28 @@ def test_op_with_scalar(array, val, func, data_type, val_type): dpnp_a = dpnp.array(array, dtype=data_type) val_ = val_type(val) - result = getattr(dpnp, func)(dpnp_a, val_) - expected = getattr(numpy, func)(np_a, val_) - assert_array_equal(result, expected) + if func == 'subtract' and val_type == bool and data_type == dpnp.bool: + with pytest.raises(TypeError): + result = getattr(dpnp, func)(dpnp_a, val_) + expected = getattr(numpy, func)(np_a, val_) - result = getattr(dpnp, func)(val_, dpnp_a) - expected = getattr(numpy, func)(val_, np_a) - assert_array_equal(result, expected) + result = getattr(dpnp, func)(val_, dpnp_a) + expected = getattr(numpy, func)(val_, np_a) + else: + result = getattr(dpnp, func)(dpnp_a, val_) + expected = getattr(numpy, func)(np_a, val_) + assert_array_equal(result, expected) + + result = getattr(dpnp, func)(val_, dpnp_a) + expected = getattr(numpy, func)(val_, np_a) + assert_array_equal(result, expected) @pytest.mark.parametrize("shape", [(), (3, 2)], ids=['()', '(3, 2)']) 
-@pytest.mark.parametrize("dtype", - [numpy.float32, numpy.float64], - ids=['numpy.float32', 'numpy.float64']) -def test_multiply_scalar2(shape, dtype): +@pytest.mark.parametrize("dtype", get_all_dtypes()) +def test_multiply_scalar(shape, dtype): np_a = numpy.ones(shape, dtype=dtype) dpnp_a = dpnp.ones(shape, dtype=dtype) @@ -221,6 +236,32 @@ def test_multiply_scalar2(shape, dtype): assert_allclose(result, expected) +@pytest.mark.parametrize("shape", + [(), (3, 2)], + ids=['()', '(3, 2)']) +@pytest.mark.parametrize("dtype", get_all_dtypes()) +def test_add_scalar(shape, dtype): + np_a = numpy.ones(shape, dtype=dtype) + dpnp_a = dpnp.ones(shape, dtype=dtype) + + result = 0.5 + dpnp_a + 1.7 + expected = 0.5 + np_a + 1.7 + assert_allclose(result, expected) + + +@pytest.mark.parametrize("shape", + [(), (3, 2)], + ids=['()', '(3, 2)']) +@pytest.mark.parametrize("dtype", get_all_dtypes()) +def test_subtract_scalar(shape, dtype): + np_a = numpy.ones(shape, dtype=dtype) + dpnp_a = dpnp.ones(shape, dtype=dtype) + + result = 0.5 - dpnp_a - 1.7 + expected = 0.5 - np_a - 1.7 + assert_allclose(result, expected) + + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @pytest.mark.parametrize("array", [[1, 2, 3, 4, 5], [1, 2, numpy.nan, 4, 5], diff --git a/tests/third_party/cupy/creation_tests/test_from_data.py b/tests/third_party/cupy/creation_tests/test_from_data.py index e07d927b1cf0..ce71ef311a56 100644 --- a/tests/third_party/cupy/creation_tests/test_from_data.py +++ b/tests/third_party/cupy/creation_tests/test_from_data.py @@ -454,6 +454,7 @@ def test_asfortranarray_cuda_array_zero_dim_dtype( a = xp.ones((), dtype=dtype_a) return xp.asfortranarray(a, dtype=dtype_b) + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.numpy_cupy_array_equal() def test_fromfile(self, xp): with tempfile.TemporaryFile() as fh: diff --git a/tests/third_party/cupy/math_tests/test_arithmetic.py b/tests/third_party/cupy/math_tests/test_arithmetic.py index a53a8494707c..21068ece8749 
100644 --- a/tests/third_party/cupy/math_tests/test_arithmetic.py +++ b/tests/third_party/cupy/math_tests/test_arithmetic.py @@ -146,31 +146,27 @@ def check_binary(self, xp): y = y.astype(numpy.complex64) # NumPy returns an output array of another type than DPNP when input ones have diffrent types. - if self.name in ('add', 'multiply') and xp is cupy: - if xp.isscalar(arg1) and xp.isscalar(arg2): - # If both are scalars, the result will be a scalar, so needs to convert into numpy-scalar. - y = numpy.asarray(y) - elif dtype1 != dtype2: - is_array_arg1 = not xp.isscalar(arg1) - is_array_arg2 = not xp.isscalar(arg2) - - is_int_float = lambda _x, _y: numpy.issubdtype(_x, numpy.integer) and numpy.issubdtype(_y, numpy.floating) - is_same_type = lambda _x, _y, _type: numpy.issubdtype(_x, _type) and numpy.issubdtype(_y, _type) - - if is_array_arg1 and is_array_arg2: - # If both inputs are arrays where one is of floating type and another - integer, - # NumPy will return an output array of always "float64" type, - # while DPNP will return the array of a wider type from the input arrays. - if is_int_float(dtype1, dtype2) or is_int_float(dtype2, dtype1): - y = y.astype(numpy.float64) - elif is_same_type(dtype1, dtype2, numpy.floating) or is_same_type(dtype1, dtype2, numpy.integer): - # If one input is an array and another - scalar, - # NumPy will return an output array of the same type as the inpupt array has, - # while DPNP will return the array of a wider type from the inputs (considering both array and scalar). 
- if is_array_arg1 and not is_array_arg2: - y = y.astype(dtype1) - elif is_array_arg2 and not is_array_arg1: - y = y.astype(dtype2) + if self.name in ('add', 'multiply', 'subtract') and xp is cupy and dtype1 != dtype2 and not self.use_dtype: + is_array_arg1 = not xp.isscalar(arg1) + is_array_arg2 = not xp.isscalar(arg2) + + is_int_float = lambda _x, _y: numpy.issubdtype(_x, numpy.integer) and numpy.issubdtype(_y, numpy.floating) + is_same_type = lambda _x, _y, _type: numpy.issubdtype(_x, _type) and numpy.issubdtype(_y, _type) + + if is_array_arg1 and is_array_arg2: + # If both inputs are arrays where one is of floating type and another - integer, + # NumPy will return an output array of always "float64" type, + # while DPNP will return the array of a wider type from the input arrays. + if is_int_float(dtype1, dtype2) or is_int_float(dtype2, dtype1): + y = y.astype(numpy.float64) + elif is_same_type(dtype1, dtype2, numpy.floating) or is_same_type(dtype1, dtype2, numpy.integer): + # If one input is an array and another - scalar, + # NumPy will return an output array of the same type as the inpupt array has, + # while DPNP will return the array of a wider type from the inputs (considering both array and scalar). + if is_array_arg1 and not is_array_arg2: + y = y.astype(dtype1) + elif is_array_arg2 and not is_array_arg1: + y = y.astype(dtype2) # NumPy returns different values (nan/inf) on division by zero # depending on the architecture. 
@@ -188,7 +184,6 @@ def check_binary(self, xp): @testing.gpu @testing.parameterize(*( testing.product({ - # TODO(unno): boolean subtract causes DeprecationWarning in numpy>=1.13 'arg1': [testing.shaped_arange((2, 3), numpy, dtype=d) for d in all_types ] + [0, 0.0, 2, 2.0], @@ -283,7 +278,6 @@ def test_modf(self, xp, dtype): 'xp': [numpy, cupy], 'shape': [(3, 2), (), (3, 0, 2)] })) -@pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.gpu class TestBoolSubtract(unittest.TestCase): From 8c33cbbafb903be0b1960721b1f5bd8ca712ca4f Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Wed, 8 Feb 2023 13:58:35 -0600 Subject: [PATCH 3/7] dpnp.divide() doesn't work properly with a scalar --- .../include/dpnp_gen_2arg_3type_tbl.hpp | 6 +- dpnp/backend/include/dpnp_iface_fptr.hpp | 22 ++++- dpnp/backend/kernels/dpnp_krnl_elemwise.cpp | 84 +++++++++++-------- dpnp/dpnp_algo/dpnp_algo.pxd | 2 + dpnp/dpnp_algo/dpnp_algo.pyx | 14 +++- dpnp/dpnp_iface_mathematical.py | 69 ++++++++------- tests/conftest.py | 21 ++++- tests/helper.py | 2 +- tests/test_linalg.py | 71 ++++++++-------- tests/test_mathematical.py | 27 ++++-- tests/test_strides.py | 1 + .../cupy/math_tests/test_arithmetic.py | 38 +++++---- .../cupy/statistics_tests/test_meanvar.py | 5 +- 13 files changed, 227 insertions(+), 135 deletions(-) diff --git a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp index 6d8b0a49340e..850f3c7213c2 100644 --- a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp +++ b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp @@ -132,10 +132,10 @@ MACRO_2ARG_3TYPES_OP(dpnp_copysign_c, MACRO_2ARG_3TYPES_OP(dpnp_divide_c, input1_elem / input2_elem, - nullptr, - std::false_type, + sycl::native::divide(x1, x2), + MACRO_UNPACK_TYPES(float, double), oneapi::mkl::vm::div, - MACRO_UNPACK_TYPES(float, double)) + MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) MACRO_2ARG_3TYPES_OP(dpnp_fmod_c, sycl::fmod((double)input1_elem, 
(double)input2_elem), diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp index 61c1c9838ad6..0637bfcfc9bc 100644 --- a/dpnp/backend/include/dpnp_iface_fptr.hpp +++ b/dpnp/backend/include/dpnp_iface_fptr.hpp @@ -419,8 +419,26 @@ size_t operator-(DPNPFuncType lhs, DPNPFuncType rhs); */ typedef struct DPNPFuncData { - DPNPFuncType return_type; /**< return type identifier which expected by the @ref ptr function */ - void* ptr; /**< C++ backend function pointer */ + DPNPFuncData(const DPNPFuncType gen_type, void* gen_ptr, const DPNPFuncType type_no_fp64, void* ptr_no_fp64) + : return_type(gen_type) + , ptr(gen_ptr) + , return_type_no_fp64(type_no_fp64) + , ptr_no_fp64(ptr_no_fp64) + { + } + DPNPFuncData(const DPNPFuncType gen_type, void* gen_ptr) + : DPNPFuncData(gen_type, gen_ptr, DPNPFuncType::DPNP_FT_NONE, nullptr) + { + } + DPNPFuncData() + : DPNPFuncData(DPNPFuncType::DPNP_FT_NONE, nullptr) + { + } + + DPNPFuncType return_type; /**< return type identifier which expected by the @ref ptr function */ + void* ptr; /**< C++ backend function pointer */ + DPNPFuncType return_type_no_fp64; /**< alternative return type identifier when no fp64 support by device */ + void* ptr_no_fp64; /**< alternative C++ backend function pointer when no fp64 support by device */ } DPNPFuncData_t; /** diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp index 31da58c98ff0..097cab021d6c 100644 --- a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp @@ -1178,6 +1178,47 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) #include +template +static constexpr DPNPFuncType get_divide_res_type() +{ + constexpr auto widest_type = populate_func_types(); + constexpr auto shortes_type = (widest_type == FT1) ? 
FT2 : FT1; + + if constexpr (widest_type == DPNPFuncType::DPNP_FT_CMPLX128 || widest_type == DPNPFuncType::DPNP_FT_DOUBLE) + { + return widest_type; + } + else if constexpr (widest_type == DPNPFuncType::DPNP_FT_CMPLX64) + { + if constexpr (shortes_type == DPNPFuncType::DPNP_FT_DOUBLE) + { + return DPNPFuncType::DPNP_FT_CMPLX128; + } + else if constexpr (has_fp64::value && + (shortes_type == DPNPFuncType::DPNP_FT_INT || shortes_type == DPNPFuncType::DPNP_FT_LONG)) + { + return DPNPFuncType::DPNP_FT_CMPLX128; + } + } + else if constexpr (widest_type == DPNPFuncType::DPNP_FT_FLOAT) + { + if constexpr (has_fp64::value && + (shortes_type == DPNPFuncType::DPNP_FT_INT || shortes_type == DPNPFuncType::DPNP_FT_LONG)) + { + return DPNPFuncType::DPNP_FT_DOUBLE; + } + } + else if constexpr (has_fp64::value) + { + return DPNPFuncType::DPNP_FT_DOUBLE; + } + else + { + return DPNPFuncType::DPNP_FT_FLOAT; + } + return widest_type; +} + template static void func_map_elemwise_2arg_3type_core(func_map_t& fmap) { @@ -1199,6 +1240,16 @@ static void func_map_elemwise_2arg_3type_core(func_map_t& fmap) func_type_map_t::find_type, func_type_map_t::find_type>}), ...); + ((fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][FT1][FTs] = + {get_divide_res_type(), + (void*)dpnp_divide_c_ext()>, + func_type_map_t::find_type, + func_type_map_t::find_type>, + get_divide_res_type(), + (void*)dpnp_divide_c_ext()>, + func_type_map_t::find_type, + func_type_map_t::find_type>}), + ...); } template @@ -1407,39 +1458,6 @@ static void func_map_init_elemwise_2arg_3type(func_map_t& fmap) fmap[DPNPFuncName::DPNP_FN_DIVIDE][eft_DBL][eft_DBL] = {eft_DBL, (void*)dpnp_divide_c_default}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_INT][eft_INT] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_INT][eft_LNG] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_INT][eft_FLT] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - 
fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_INT][eft_DBL] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_LNG][eft_INT] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_LNG][eft_LNG] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_LNG][eft_FLT] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_LNG][eft_DBL] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_FLT][eft_INT] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_FLT][eft_LNG] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_FLT][eft_FLT] = {eft_FLT, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_FLT][eft_DBL] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_DBL][eft_INT] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_DBL][eft_LNG] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_DBL][eft_FLT] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_DIVIDE_EXT][eft_DBL][eft_DBL] = {eft_DBL, - (void*)dpnp_divide_c_ext}; - fmap[DPNPFuncName::DPNP_FN_FMOD][eft_INT][eft_INT] = {eft_INT, (void*)dpnp_fmod_c_default}; fmap[DPNPFuncName::DPNP_FN_FMOD][eft_INT][eft_LNG] = {eft_LNG, diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd index 485e8adb1a66..65e07a9c7046 100644 --- a/dpnp/dpnp_algo/dpnp_algo.pxd +++ b/dpnp/dpnp_algo/dpnp_algo.pxd @@ -374,6 +374,8 @@ cdef extern from "dpnp_iface_fptr.hpp": struct DPNPFuncData: DPNPFuncType return_type void * ptr + DPNPFuncType return_type_no_fp64 + void *ptr_no_fp64 DPNPFuncData get_dpnp_function_ptr(DPNPFuncName name, DPNPFuncType first_type, DPNPFuncType second_type) except + diff --git a/dpnp/dpnp_algo/dpnp_algo.pyx b/dpnp/dpnp_algo/dpnp_algo.pyx 
index aaa7334e18a8..54e71b87d767 100644 --- a/dpnp/dpnp_algo/dpnp_algo.pyx +++ b/dpnp/dpnp_algo/dpnp_algo.pyx @@ -481,8 +481,6 @@ cdef utils.dpnp_descriptor call_fptr_2in_1out_strides(DPNPFuncName fptr_name, # get the FPTR data structure cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(fptr_name, x1_c_type, x2_c_type) - result_type = dpnp_DPNPFuncType_to_dtype( < size_t > kernel_data.return_type) - # Create result array cdef shape_type_c x1_shape = x1_obj.shape @@ -495,6 +493,15 @@ cdef utils.dpnp_descriptor call_fptr_2in_1out_strides(DPNPFuncName fptr_name, result_sycl_device, result_usm_type, result_sycl_queue = utils.get_common_usm_allocation(x1_obj, x2_obj) + # get FPTR function and result type + cdef fptr_2in_1out_strides_t func = NULL + if fptr_name != DPNP_FN_DIVIDE_EXT or result_sycl_device.has_aspect_fp64: + result_type = dpnp_DPNPFuncType_to_dtype(< size_t > kernel_data.return_type) + func = < fptr_2in_1out_strides_t > kernel_data.ptr + else: + result_type = dpnp_DPNPFuncType_to_dtype(< size_t > kernel_data.return_type_no_fp64) + func = < fptr_2in_1out_strides_t > kernel_data.ptr_no_fp64 + if out is None: """ Create result array with type given by FPTR data """ result = utils.create_output_descriptor(result_shape, @@ -517,11 +524,10 @@ cdef utils.dpnp_descriptor call_fptr_2in_1out_strides(DPNPFuncName fptr_name, result_obj = result.get_array() - cdef c_dpctl.SyclQueue q = result_obj.sycl_queue + cdef c_dpctl.SyclQueue q = < c_dpctl.SyclQueue > result_obj.sycl_queue cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref() """ Call FPTR function """ - cdef fptr_2in_1out_strides_t func = kernel_data.ptr cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref, result.get_data(), result.size, diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index 41657fb2d593..af57819827c9 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -544,55 +544,64 @@ def diff(x1, n=1, axis=-1, prepend=numpy._NoValue, 
append=numpy._NoValue): return call_origin(numpy.diff, x1, n=n, axis=axis, prepend=prepend, append=append) -def divide(x1, x2, dtype=None, out=None, where=True, **kwargs): +def divide(x1, + x2, + /, + out=None, + *, + where=True, + dtype=None, + subok=True, + **kwargs): """ Divide arguments element-wise. For full documentation refer to :obj:`numpy.divide`. + Returns + ------- + y : dpnp.ndarray + The quotient ``x1/x2``, element-wise. + Limitations ----------- - Parameters ``x1`` and ``x2`` are supported as either :obj:`dpnp.ndarray` or scalar. - Parameters ``dtype``, ``out`` and ``where`` are supported with their default values. + Parameters `x1` and `x2` are supported as either :class:`dpnp.ndarray` or scalar, + but not both (at least either `x1` or `x2` should be as :class:`dpnp.ndarray`). + Parameters `out`, `where`, `dtype` and `subok` are supported with their default values. Keyword arguments ``kwargs`` are currently unsupported. - Otherwise the functions will be executed sequentially on CPU. + Otherwise the function will be executed sequentially on CPU. Input array data types are limited by supported DPNP :ref:`Data types`. 
Examples -------- - >>> import dpnp as np - >>> result = np.divide(np.array([1, -2, 6, -9]), np.array([-2, -2, -2, -2])) - >>> [x for x in result] + >>> import dpnp as dp + >>> result = dp.divide(dp.array([1, -2, 6, -9]), dp.array([-2, -2, -2, -2])) + >>> print(result) [-0.5, 1.0, -3.0, 4.5] """ - x1_is_scalar = dpnp.isscalar(x1) - x2_is_scalar = dpnp.isscalar(x2) - x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False) - x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False) + if out is not None: + pass + elif where is not True: + pass + elif dtype is not None: + pass + elif subok is not True: + pass + elif dpnp.isscalar(x1) and dpnp.isscalar(x2): + # at least either x1 or x2 has to be an array + pass + else: + # get a common queue to copy data from the host into a device if any input is scalar + queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None - if x1_desc and x2_desc and not kwargs: - if not x1_desc and not x1_is_scalar: - pass - elif not x2_desc and not x2_is_scalar: - pass - elif x1_is_scalar and x2_is_scalar: - pass - elif x1_desc and x1_desc.ndim == 0: - pass - elif x2_desc and x2_desc.ndim == 0: - pass - elif dtype is not None: - pass - elif out is not None: - pass - elif not where: - pass - else: + x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue) + x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue) + if x1_desc and x2_desc: return dpnp_divide(x1_desc, x2_desc, dtype=dtype, out=out, where=where).get_pyobj() - return call_origin(numpy.divide, x1, x2, dtype=dtype, out=out, where=where, **kwargs) + return call_origin(numpy.divide, x1, x2, out=out, where=where, dtype=dtype, subok=subok, **kwargs) def ediff1d(x1, to_end=None, to_begin=None): diff --git a/tests/conftest.py b/tests/conftest.py index 
78d3180bac08..22276f125f26 100755 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # ***************************************************************************** -# Copyright (c) 2016-2020, Intel Corporation +# Copyright (c) 2016-2023, Intel Corporation # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -77,3 +77,22 @@ def pytest_collection_modifyitems(config, items): @pytest.fixture def allow_fall_back_on_numpy(monkeypatch): monkeypatch.setattr(dpnp.config, '__DPNP_RAISE_EXCEPION_ON_NUMPY_FALLBACK__', 0) + +@pytest.fixture +def suppress_divide_numpy_warnings(): + # divide: treatment for division by zero (infinite result obtained from finite numbers) + old_settings = numpy.seterr(divide='ignore') + yield + numpy.seterr(**old_settings) # reset to default + +@pytest.fixture +def suppress_invalid_numpy_warnings(): + # invalid: treatment for invalid floating-point operation + # (result is not an expressible number, typically indicates that a NaN was produced) + old_settings = numpy.seterr(invalid='ignore') + yield + numpy.seterr(**old_settings) # reset to default + +@pytest.fixture +def suppress_divide_invalid_numpy_warnings(suppress_divide_numpy_warnings, suppress_invalid_numpy_warnings): + yield diff --git a/tests/helper.py b/tests/helper.py index be550a995dce..17c62cecd289 100644 --- a/tests/helper.py +++ b/tests/helper.py @@ -32,7 +32,7 @@ def get_all_dtypes(no_bool=False, dtypes.append(dpnp.complex64) if dev.has_aspect_fp64: dtypes.append(dpnp.complex128) - + # add None value to validate a default dtype if not no_none: dtypes.append(None) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index ac8392d15384..d9784a41558f 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -1,9 +1,15 @@ import pytest +from .helper import get_all_dtypes import dpnp as inp import dpctl + import numpy +from numpy.testing import ( + assert_allclose, + assert_array_equal +) def 
vvsort(val, vec, size, xp): @@ -49,7 +55,7 @@ def test_cholesky(array): ia = inp.array(a) result = inp.linalg.cholesky(ia) expected = numpy.linalg.cholesky(a) - numpy.testing.assert_array_equal(expected, result) + assert_array_equal(expected, result) @pytest.mark.parametrize("arr", @@ -63,7 +69,7 @@ def test_cond(arr, p): ia = inp.array(a) result = inp.linalg.cond(ia, p) expected = numpy.linalg.cond(a, p) - numpy.testing.assert_array_equal(expected, result) + assert_array_equal(expected, result) @pytest.mark.parametrize("array", @@ -82,13 +88,11 @@ def test_det(array): ia = inp.array(a) result = inp.linalg.det(ia) expected = numpy.linalg.det(a) - numpy.testing.assert_allclose(expected, result) + assert_allclose(expected, result) @pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("type", - [numpy.float64, numpy.float32, numpy.int64, numpy.int32], - ids=['float64', 'float32', 'int64', 'int32']) +@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) @pytest.mark.parametrize("size", [2, 4, 8, 16, 300]) def test_eig_arange(type, size): @@ -115,21 +119,19 @@ def test_eig_arange(type, size): if np_vec[0, i] * dpnp_vec[0, i] < 0: np_vec[:, i] = -np_vec[:, i] - numpy.testing.assert_array_equal(symm_orig, symm) - numpy.testing.assert_array_equal(dpnp_symm_orig, dpnp_symm) + assert_array_equal(symm_orig, symm) + assert_array_equal(dpnp_symm_orig, dpnp_symm) assert (dpnp_val.dtype == np_val.dtype) assert (dpnp_vec.dtype == np_vec.dtype) assert (dpnp_val.shape == np_val.shape) assert (dpnp_vec.shape == np_vec.shape) - numpy.testing.assert_allclose(dpnp_val, np_val, rtol=1e-05, atol=1e-05) - numpy.testing.assert_allclose(dpnp_vec, np_vec, rtol=1e-05, atol=1e-05) + assert_allclose(dpnp_val, np_val, rtol=1e-05, atol=1e-05) + assert_allclose(dpnp_vec, np_vec, rtol=1e-05, atol=1e-05) -@pytest.mark.parametrize("type", - [numpy.float64, numpy.float32, numpy.int64, numpy.int32], - ids=['float64', 'float32', 'int64', 'int32']) 
+@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) def test_eigvals(type): if dpctl.get_current_device_type() != dpctl.device_type.gpu: pytest.skip("eigvals function doesn\'t work on CPU: https://github.com/IntelPython/dpnp/issues/1005") @@ -144,12 +146,10 @@ def test_eigvals(type): ia = inp.array(a) result = inp.linalg.eigvals(ia) expected = numpy.linalg.eigvals(a) - numpy.testing.assert_allclose(expected, result, atol=0.5) + assert_allclose(expected, result, atol=0.5) -@pytest.mark.parametrize("type", - [numpy.float64, numpy.float32, numpy.int64, numpy.int32], - ids=['float64', 'float32', 'int64', 'int32']) +@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) @pytest.mark.parametrize("array", [[[1., 2.], [3., 4.]], [[0, 1, 2], [3, 2, -1], [4, -2, 3]]], ids=['[[1., 2.], [3., 4.]]', '[[0, 1, 2], [3, 2, -1], [4, -2, 3]]']) @@ -158,12 +158,10 @@ def test_inv(type, array): ia = inp.array(a) result = inp.linalg.inv(ia) expected = numpy.linalg.inv(a) - numpy.testing.assert_allclose(expected, result) + assert_allclose(expected, result) -@pytest.mark.parametrize("type", - [numpy.float64, numpy.float32, numpy.int64, numpy.int32], - ids=['float64', 'float32', 'int64', 'int32']) +@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True, no_none=True)) @pytest.mark.parametrize("array", [[0, 0], [0, 1], [1, 2], [[0, 0], [0, 0]], [[1, 2], [1, 2]], [[1, 2], [3, 4]]], ids=['[0, 0]', '[0, 1]', '[1, 2]', '[[0, 0], [0, 0]]', '[[1, 2], [1, 2]]', '[[1, 2], [3, 4]]']) @@ -177,10 +175,11 @@ def test_matrix_rank(type, tol, array): result = inp.linalg.matrix_rank(ia, tol=tol) expected = numpy.linalg.matrix_rank(a, tol=tol) - numpy.testing.assert_allclose(expected, result) + assert_allclose(expected, result) @pytest.mark.usefixtures("allow_fall_back_on_numpy") +@pytest.mark.usefixtures("suppress_divide_numpy_warnings") @pytest.mark.parametrize("array", [[7], [1, 2], [1, 0]], ids=['[7]', '[1, 2]', '[1, 0]']) 
@@ -195,7 +194,7 @@ def test_norm1(array, ord, axis): ia = inp.array(a) result = inp.linalg.norm(ia, ord=ord, axis=axis) expected = numpy.linalg.norm(a, ord=ord, axis=axis) - numpy.testing.assert_allclose(expected, result) + assert_allclose(expected, result) @pytest.mark.usefixtures("allow_fall_back_on_numpy") @@ -213,7 +212,7 @@ def test_norm2(array, ord, axis): ia = inp.array(a) result = inp.linalg.norm(ia, ord=ord, axis=axis) expected = numpy.linalg.norm(a, ord=ord, axis=axis) - numpy.testing.assert_array_equal(expected, result) + assert_array_equal(expected, result) @pytest.mark.usefixtures("allow_fall_back_on_numpy") @@ -231,13 +230,11 @@ def test_norm3(array, ord, axis): ia = inp.array(a) result = inp.linalg.norm(ia, ord=ord, axis=axis) expected = numpy.linalg.norm(a, ord=ord, axis=axis) - numpy.testing.assert_array_equal(expected, result) + assert_array_equal(expected, result) @pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("type", - [numpy.float64, numpy.float32, numpy.int64, numpy.int32], - ids=['float64', 'float32', 'int64', 'int32']) +@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) @pytest.mark.parametrize("shape", [(2, 2), (3, 4), (5, 3), (16, 16)], ids=['(2,2)', '(3,4)', '(5,3)', '(16,16)']) @@ -262,7 +259,7 @@ def test_qr(type, shape, mode): tol = 1e-11 # check decomposition - numpy.testing.assert_allclose(ia, numpy.dot(inp.asnumpy(dpnp_q), inp.asnumpy(dpnp_r)), rtol=tol, atol=tol) + assert_allclose(ia, numpy.dot(inp.asnumpy(dpnp_q), inp.asnumpy(dpnp_r)), rtol=tol, atol=tol) # NP change sign for comparison ncols = min(a.shape[0], a.shape[1]) @@ -273,14 +270,12 @@ def test_qr(type, shape, mode): np_r[i, :] = -np_r[i, :] if numpy.any(numpy.abs(np_r[i, :]) > tol): - numpy.testing.assert_allclose(inp.asnumpy(dpnp_q)[:, i], np_q[:, i], rtol=tol, atol=tol) + assert_allclose(inp.asnumpy(dpnp_q)[:, i], np_q[:, i], rtol=tol, atol=tol) - numpy.testing.assert_allclose(dpnp_r, np_r, rtol=tol, 
atol=tol) + assert_allclose(dpnp_r, np_r, rtol=tol, atol=tol) -@pytest.mark.parametrize("type", - [numpy.float64, numpy.float32, numpy.int64, numpy.int32], - ids=['float64', 'float32', 'int64', 'int32']) +@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) @pytest.mark.parametrize("shape", [(2, 2), (3, 4), (5, 3), (16, 16)], ids=['(2,2)', '(3,4)', '(5,3)', '(16,16)']) @@ -309,10 +304,10 @@ def test_svd(type, shape): dpnp_diag_s[i, i] = dpnp_s[i] # check decomposition - numpy.testing.assert_allclose(ia, inp.dot(dpnp_u, inp.dot(dpnp_diag_s, dpnp_vt)), rtol=tol, atol=tol) + assert_allclose(ia, inp.dot(dpnp_u, inp.dot(dpnp_diag_s, dpnp_vt)), rtol=tol, atol=tol) # compare singular values - # numpy.testing.assert_allclose(dpnp_s, np_s, rtol=tol, atol=tol) + # assert_allclose(dpnp_s, np_s, rtol=tol, atol=tol) # change sign of vectors for i in range(min(shape[0], shape[1])): @@ -322,5 +317,5 @@ def test_svd(type, shape): # compare vectors for non-zero values for i in range(numpy.count_nonzero(np_s > tol)): - numpy.testing.assert_allclose(inp.asnumpy(dpnp_u)[:, i], np_u[:, i], rtol=tol, atol=tol) - numpy.testing.assert_allclose(inp.asnumpy(dpnp_vt)[i, :], np_vt[i, :], rtol=tol, atol=tol) + assert_allclose(inp.asnumpy(dpnp_u)[:, i], np_u[:, i], rtol=tol, atol=tol) + assert_allclose(inp.asnumpy(dpnp_vt)[i, :], np_vt[i, :], rtol=tol, atol=tol) diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py index 6f7ee58c0380..78f628908337 100644 --- a/tests/test_mathematical.py +++ b/tests/test_mathematical.py @@ -66,7 +66,7 @@ def test_diff(array): @pytest.mark.parametrize("dtype1", get_all_dtypes()) @pytest.mark.parametrize("dtype2", get_all_dtypes()) @pytest.mark.parametrize("func", - ['add', 'multiply', 'subtract']) + ['add', 'multiply', 'subtract', 'divide']) @pytest.mark.parametrize("data", [[[1, 2], [3, 4]]], ids=['[[1, 2], [3, 4]]']) @@ -132,8 +132,7 @@ def test_arctan2(self, dtype, lhs, rhs): def test_copysign(self, dtype, lhs, rhs): 
self._test_mathematical('copysign', dtype, lhs, rhs) - @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True, no_complex=True)) + @pytest.mark.parametrize("dtype", get_all_dtypes()) def test_divide(self, dtype, lhs, rhs): self._test_mathematical('divide', dtype, lhs, rhs) @@ -181,12 +180,13 @@ def test_subtract(self, dtype, lhs, rhs): self._test_mathematical('subtract', dtype, lhs, rhs) +@pytest.mark.usefixtures("suppress_divide_invalid_numpy_warnings") @pytest.mark.parametrize("val_type", [bool, int, float], ids=['bool', 'int', 'float']) @pytest.mark.parametrize("data_type", get_all_dtypes()) @pytest.mark.parametrize("func", - ['add', 'multiply', 'subtract']) + ['add', 'multiply', 'subtract', 'divide']) @pytest.mark.parametrize("val", [0, 1, 5], ids=['0', '1', '5']) @@ -216,11 +216,11 @@ def test_op_with_scalar(array, val, func, data_type, val_type): else: result = getattr(dpnp, func)(dpnp_a, val_) expected = getattr(numpy, func)(np_a, val_) - assert_array_equal(result, expected) + assert_allclose(result, expected) result = getattr(dpnp, func)(val_, dpnp_a) expected = getattr(numpy, func)(val_, np_a) - assert_array_equal(result, expected) + assert_allclose(result, expected) @pytest.mark.parametrize("shape", @@ -262,6 +262,19 @@ def test_subtract_scalar(shape, dtype): assert_allclose(result, expected) +@pytest.mark.parametrize("shape", + [(), (3, 2)], + ids=['()', '(3, 2)']) +@pytest.mark.parametrize("dtype", get_all_dtypes()) +def test_divide_scalar(shape, dtype): + np_a = numpy.ones(shape, dtype=dtype) + dpnp_a = dpnp.ones(shape, dtype=dtype) + + result = 0.5 / dpnp_a / 1.7 + expected = 0.5 / np_a / 1.7 + assert_allclose(result, expected) + + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @pytest.mark.parametrize("array", [[1, 2, 3, 4, 5], [1, 2, numpy.nan, 4, 5], @@ -442,7 +455,6 @@ def test_cross_3x3(self, x1, x2, axisa, axisb, axisc, axis): assert_array_equal(expected, result) 
-@pytest.mark.usefixtures("allow_fall_back_on_numpy") class TestGradient: @pytest.mark.parametrize("array", [[2, 3, 6, 8, 4, 9], @@ -456,6 +468,7 @@ def test_gradient_y1(self, array): expected = numpy.gradient(np_y) assert_array_equal(expected, result) + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @pytest.mark.parametrize("array", [[2, 3, 6, 8, 4, 9], [3., 4., 7.5, 9.], [2, 6, 8, 10]]) diff --git a/tests/test_strides.py b/tests/test_strides.py index 3c0d86a44a5a..02e8c8689757 100644 --- a/tests/test_strides.py +++ b/tests/test_strides.py @@ -37,6 +37,7 @@ def test_strides(func_name, dtype): assert_allclose(expected, result) +@pytest.mark.usefixtures("suppress_divide_invalid_numpy_warnings") @pytest.mark.parametrize("func_name", ["arccos", "arccosh", "arcsin", "arcsinh", "arctan", "arctanh", "cbrt", "ceil", "copy", "cos", "cosh", "conjugate", "degrees", "ediff1d", "exp", "exp2", "expm1", "fabs", "floor", "log", diff --git a/tests/third_party/cupy/math_tests/test_arithmetic.py b/tests/third_party/cupy/math_tests/test_arithmetic.py index 21068ece8749..027722d8bef2 100644 --- a/tests/third_party/cupy/math_tests/test_arithmetic.py +++ b/tests/third_party/cupy/math_tests/test_arithmetic.py @@ -146,27 +146,35 @@ def check_binary(self, xp): y = y.astype(numpy.complex64) # NumPy returns an output array of another type than DPNP when input ones have diffrent types. 
- if self.name in ('add', 'multiply', 'subtract') and xp is cupy and dtype1 != dtype2 and not self.use_dtype: + if xp is cupy and dtype1 != dtype2 and not self.use_dtype: is_array_arg1 = not xp.isscalar(arg1) is_array_arg2 = not xp.isscalar(arg2) is_int_float = lambda _x, _y: numpy.issubdtype(_x, numpy.integer) and numpy.issubdtype(_y, numpy.floating) is_same_type = lambda _x, _y, _type: numpy.issubdtype(_x, _type) and numpy.issubdtype(_y, _type) - if is_array_arg1 and is_array_arg2: - # If both inputs are arrays where one is of floating type and another - integer, - # NumPy will return an output array of always "float64" type, - # while DPNP will return the array of a wider type from the input arrays. - if is_int_float(dtype1, dtype2) or is_int_float(dtype2, dtype1): - y = y.astype(numpy.float64) - elif is_same_type(dtype1, dtype2, numpy.floating) or is_same_type(dtype1, dtype2, numpy.integer): - # If one input is an array and another - scalar, - # NumPy will return an output array of the same type as the inpupt array has, - # while DPNP will return the array of a wider type from the inputs (considering both array and scalar). - if is_array_arg1 and not is_array_arg2: - y = y.astype(dtype1) - elif is_array_arg2 and not is_array_arg1: - y = y.astype(dtype2) + if self.name in ('add', 'multiply', 'subtract'): + if is_array_arg1 and is_array_arg2: + # If both inputs are arrays where one is of floating type and another - integer, + # NumPy will return an output array of always "float64" type, + # while DPNP will return the array of a wider type from the input arrays. 
+ if is_int_float(dtype1, dtype2) or is_int_float(dtype2, dtype1): + y = y.astype(numpy.float64) + elif is_same_type(dtype1, dtype2, numpy.floating) or is_same_type(dtype1, dtype2, numpy.integer): + # If one input is an array and another - scalar, + # NumPy will return an output array of the same type as the inpupt array has, + # while DPNP will return the array of a wider type from the inputs (considering both array and scalar). + if is_array_arg1 and not is_array_arg2: + y = y.astype(dtype1) + elif is_array_arg2 and not is_array_arg1: + y = y.astype(dtype2) + elif self.name in ('divide', 'true_divide'): + # If one input is an array of float32 and another - an integer or floating scalar, + # NumPy will return an output array of float32, while DPNP will return the array of float64, + # since NumPy would use the same float64 type when instead of scalar here is array of integer of floating type. + if not (is_array_arg1 and is_array_arg2): + if (is_array_arg1 and arg1.dtype == numpy.float32) ^ (is_array_arg2 and arg2.dtype == numpy.float32): + y = y.astype(numpy.float32) # NumPy returns different values (nan/inf) on division by zero # depending on the architecture. 
diff --git a/tests/third_party/cupy/statistics_tests/test_meanvar.py b/tests/third_party/cupy/statistics_tests/test_meanvar.py index aea22d02c511..60d3413b0daa 100644 --- a/tests/third_party/cupy/statistics_tests/test_meanvar.py +++ b/tests/third_party/cupy/statistics_tests/test_meanvar.py @@ -89,7 +89,6 @@ def test_median_axis_sequence(self, xp, dtype): return xp.median(a, self.axis, keepdims=self.keepdims) -@pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.gpu class TestAverage(unittest.TestCase): @@ -101,12 +100,14 @@ def test_average_all(self, xp, dtype): a = testing.shaped_arange((2, 3), xp, dtype) return xp.average(a) + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.for_all_dtypes() @testing.numpy_cupy_allclose() def test_average_axis(self, xp, dtype): a = testing.shaped_arange((2, 3, 4), xp, dtype) return xp.average(a, axis=1) + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.for_all_dtypes() @testing.numpy_cupy_allclose() def test_average_weights(self, xp, dtype): @@ -114,6 +115,7 @@ def test_average_weights(self, xp, dtype): w = testing.shaped_arange((2, 3), xp, dtype) return xp.average(a, weights=w) + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.for_all_dtypes() @testing.numpy_cupy_allclose() def test_average_axis_weights(self, xp, dtype): @@ -132,6 +134,7 @@ def check_returned(self, a, axis, weights): testing.assert_allclose(average_cpu, average_gpu) testing.assert_allclose(sum_weights_cpu, sum_weights_gpu) + @pytest.mark.usefixtures("allow_fall_back_on_numpy") @testing.for_all_dtypes() def test_returned(self, dtype): a = testing.shaped_arange((2, 3), numpy, dtype) From 0b3ce153bc2e223ab0833bb3ae73b441ce24b2ff Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 13 Feb 2023 09:36:28 -0600 Subject: [PATCH 4/7] dpnp.divide() doesn't work properly with a scalar --- .../include/dpnp_gen_2arg_3type_tbl.hpp | 18 +-- dpnp/backend/kernels/dpnp_krnl_elemwise.cpp | 139 ++++++++++-------- 
dpnp/backend/src/dpnp_fptr.hpp | 33 +++++ dpnp/dpnp_iface_mathematical.py | 20 ++- tests/test_usm_type.py | 26 ++++ 5 files changed, 159 insertions(+), 77 deletions(-) diff --git a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp index 850f3c7213c2..148a0a9e646f 100644 --- a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp +++ b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp @@ -111,8 +111,8 @@ MACRO_2ARG_3TYPES_OP(dpnp_add_c, input1_elem + input2_elem, - sycl::add_sat(x1, x2), - MACRO_UNPACK_TYPES(int, long), + x1 + x2, + MACRO_UNPACK_TYPES(int, long, bool), oneapi::mkl::vm::add, MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) @@ -132,8 +132,8 @@ MACRO_2ARG_3TYPES_OP(dpnp_copysign_c, MACRO_2ARG_3TYPES_OP(dpnp_divide_c, input1_elem / input2_elem, - sycl::native::divide(x1, x2), - MACRO_UNPACK_TYPES(float, double), + x1 / x2, + MACRO_UNPACK_TYPES(int, long, bool), oneapi::mkl::vm::div, MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) @@ -169,9 +169,9 @@ MACRO_2ARG_3TYPES_OP(dpnp_minimum_c, // pytest "tests/third_party/cupy/creation_tests/test_ranges.py::TestMgrid::test_mgrid3" // requires multiplication shape1[10] with shape2[10,1] and result expected as shape[10,10] MACRO_2ARG_3TYPES_OP(dpnp_multiply_c, - input1_elem* input2_elem, - nullptr, - std::false_type, + input1_elem * input2_elem, + x1 * x2, + MACRO_UNPACK_TYPES(int, long, bool), oneapi::mkl::vm::mul, MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) @@ -184,8 +184,8 @@ MACRO_2ARG_3TYPES_OP(dpnp_power_c, MACRO_2ARG_3TYPES_OP(dpnp_subtract_c, input1_elem - input2_elem, - sycl::sub_sat(x1, x2), - MACRO_UNPACK_TYPES(int, long), + x1 - x2, + MACRO_UNPACK_TYPES(int, long, bool), oneapi::mkl::vm::sub, MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp index 097cab021d6c..6f6816012eb7 100644 --- 
a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp @@ -881,9 +881,9 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) \ sycl::queue q = *(reinterpret_cast(q_ref)); \ \ - _DataType_input1* input1_data = static_cast<_DataType_input1 *>(const_cast(input1_in)); \ - _DataType_input2* input2_data = static_cast<_DataType_input2 *>(const_cast(input2_in)); \ - _DataType_output* result = static_cast<_DataType_output *>(result_out); \ + _DataType_input1* input1_data = static_cast<_DataType_input1*>(const_cast(input1_in)); \ + _DataType_input2* input2_data = static_cast<_DataType_input2*>(const_cast(input2_in)); \ + _DataType_output* result = static_cast<_DataType_output*>(result_out); \ \ bool use_broadcasting = !array_equal(input1_shape, input1_ndim, input2_shape, input2_ndim); \ \ @@ -896,8 +896,7 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) shape_elem_type* input2_shape_offsets = new shape_elem_type[input2_ndim]; \ \ get_shape_offsets_inkernel(input2_shape, input2_ndim, input2_shape_offsets); \ - use_strides = \ - use_strides || !array_equal(input2_strides, input2_ndim, input2_shape_offsets, input2_ndim); \ + use_strides = use_strides || !array_equal(input2_strides, input2_ndim, input2_shape_offsets, input2_ndim); \ delete[] input2_shape_offsets; \ \ sycl::event event; \ @@ -907,19 +906,17 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) { \ DPNPC_id<_DataType_input1>* input1_it; \ const size_t input1_it_size_in_bytes = sizeof(DPNPC_id<_DataType_input1>); \ - input1_it = reinterpret_cast*>(dpnp_memory_alloc_c(q_ref, \ - input1_it_size_in_bytes)); \ - new (input1_it) \ - DPNPC_id<_DataType_input1>(q_ref, input1_data, input1_shape, input1_strides, input1_ndim); \ + input1_it = \ + reinterpret_cast*>(dpnp_memory_alloc_c(q_ref, input1_it_size_in_bytes)); \ + new (input1_it) DPNPC_id<_DataType_input1>(q_ref, input1_data, input1_shape, input1_strides, input1_ndim); \ \ 
input1_it->broadcast_to_shape(result_shape, result_ndim); \ \ DPNPC_id<_DataType_input2>* input2_it; \ const size_t input2_it_size_in_bytes = sizeof(DPNPC_id<_DataType_input2>); \ - input2_it = reinterpret_cast*>(dpnp_memory_alloc_c(q_ref, \ - input2_it_size_in_bytes)); \ - new (input2_it) \ - DPNPC_id<_DataType_input2>(q_ref, input2_data, input2_shape, input2_strides, input2_ndim); \ + input2_it = \ + reinterpret_cast*>(dpnp_memory_alloc_c(q_ref, input2_it_size_in_bytes)); \ + new (input2_it) DPNPC_id<_DataType_input2>(q_ref, input2_data, input2_shape, input2_strides, input2_ndim); \ \ input2_it->broadcast_to_shape(result_shape, result_ndim); \ \ @@ -957,27 +954,26 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) using usm_host_allocatorT = sycl::usm_allocator; \ \ size_t strides_size = 3 * result_ndim; \ - shape_elem_type *dev_strides_data = sycl::malloc_device(strides_size, q); \ + shape_elem_type* dev_strides_data = sycl::malloc_device(strides_size, q); \ \ /* create host temporary for packed strides managed by shared pointer */ \ - auto strides_host_packed = std::vector(strides_size, \ - usm_host_allocatorT(q)); \ + auto strides_host_packed = \ + std::vector(strides_size, usm_host_allocatorT(q)); \ \ /* packed vector is concatenation of result_strides, input1_strides and input2_strides */ \ std::copy(result_strides, result_strides + result_ndim, strides_host_packed.begin()); \ std::copy(input1_strides, input1_strides + result_ndim, strides_host_packed.begin() + result_ndim); \ std::copy(input2_strides, input2_strides + result_ndim, strides_host_packed.begin() + 2 * result_ndim); \ \ - auto copy_strides_ev = q.copy(strides_host_packed.data(), \ - dev_strides_data, \ - strides_host_packed.size()); \ + auto copy_strides_ev = \ + q.copy(strides_host_packed.data(), dev_strides_data, strides_host_packed.size()); \ \ auto kernel_parallel_for_func = [=](sycl::id<1> global_id) { \ const size_t output_id = global_id[0]; /* for (size_t i = 0; i < 
result_size; ++i) */ \ { \ - const shape_elem_type *result_strides_data = &dev_strides_data[0]; \ - const shape_elem_type *input1_strides_data = &dev_strides_data[1]; \ - const shape_elem_type *input2_strides_data = &dev_strides_data[2]; \ + const shape_elem_type* result_strides_data = &dev_strides_data[0]; \ + const shape_elem_type* input1_strides_data = &dev_strides_data[1]; \ + const shape_elem_type* input2_strides_data = &dev_strides_data[2]; \ \ size_t input1_id = 0; \ size_t input2_id = 0; \ @@ -1013,8 +1009,10 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) { \ event = __mkl_operation__(q, result_size, input1_data, input2_data, result); \ } \ - else if constexpr (none_of_both_types<_DataType_input1, _DataType_input2, \ - std::complex, std::complex>) \ + else if constexpr (none_of_both_types<_DataType_input1, \ + _DataType_input2, \ + std::complex, \ + std::complex>) \ { \ constexpr size_t lws = 64; \ constexpr unsigned int vec_sz = 8; \ @@ -1026,22 +1024,47 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) { \ auto sg = nd_it.get_sub_group(); \ const auto max_sg_size = sg.get_max_local_range()[0]; \ - const size_t start = vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) + \ - sg.get_group_id()[0] * max_sg_size); \ + const size_t start = \ + vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) + sg.get_group_id()[0] * max_sg_size); \ \ if (start + static_cast(vec_sz) * max_sg_size < result_size) \ { \ - sycl::vec<_DataType_input1, vec_sz> x1 = \ - sg.load(sycl::multi_ptr<_DataType_input1, global_space>(&input1_data[start])); \ - sycl::vec<_DataType_input2, vec_sz> x2 = \ - sg.load(sycl::multi_ptr<_DataType_input2, global_space>(&input2_data[start])); \ + using input1_ptrT = sycl::multi_ptr<_DataType_input1, global_space>; \ + using input2_ptrT = sycl::multi_ptr<_DataType_input2, global_space>; \ + using result_ptrT = sycl::multi_ptr<_DataType_output, 
global_space>; \ + \ sycl::vec<_DataType_output, vec_sz> res_vec; \ - if constexpr (both_types_are_same<_DataType_input1, _DataType_input2, __vec_types__>) \ + \ + if constexpr (both_types_are_any_of<_DataType_input1, _DataType_input2, __vec_types__>) \ { \ - res_vec = __vec_operation__; \ + if constexpr (both_types_are_same<_DataType_input1, _DataType_input2, _DataType_output>) \ + { \ + sycl::vec<_DataType_input1, vec_sz> x1 = \ + sg.load(input1_ptrT(&input1_data[start])); \ + sycl::vec<_DataType_input2, vec_sz> x2 = \ + sg.load(input2_ptrT(&input2_data[start])); \ + \ + res_vec = __vec_operation__; \ + } \ + else /* input types don't match result type, so explicit casting is required */ \ + { \ + sycl::vec<_DataType_output, vec_sz> x1 = \ + dpnp_vec_cast<_DataType_output, _DataType_input1, vec_sz>( \ + sg.load(input1_ptrT(&input1_data[start]))); \ + sycl::vec<_DataType_output, vec_sz> x2 = \ + dpnp_vec_cast<_DataType_output, _DataType_input2, vec_sz>( \ + sg.load(input2_ptrT(&input2_data[start]))); \ + \ + res_vec = __vec_operation__; \ + } \ } \ else \ { \ + sycl::vec<_DataType_input1, vec_sz> x1 = \ + sg.load(input1_ptrT(&input1_data[start])); \ + sycl::vec<_DataType_input2, vec_sz> x2 = \ + sg.load(input2_ptrT(&input2_data[start])); \ + \ for (size_t k = 0; k < vec_sz; ++k) \ { \ const _DataType_output input1_elem = x1[k]; \ @@ -1049,8 +1072,7 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) res_vec[k] = __operation__; \ } \ } \ - sg.store(sycl::multi_ptr<_DataType_output, global_space>(&result[start]), res_vec); \ - \ + sg.store(result_ptrT(&result[start]), res_vec); \ } \ else \ { \ @@ -1064,8 +1086,8 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) }; \ \ auto kernel_func = [&](sycl::handler& cgh) { \ - sycl::stream out(65536, 128, cgh);\ - cgh.parallel_for>(\ + cgh.parallel_for< \ + class __name__##_sg_kernel<_DataType_output, _DataType_input1, _DataType_input2>>( \ sycl::nd_range<1>(gws_range, lws_range), 
kernel_parallel_for_func); \ }; \ event = q.submit(kernel_func); \ @@ -1078,7 +1100,6 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) const _DataType_output input1_elem = input1_data[i]; \ const _DataType_output input2_elem = input2_data[i]; \ result[i] = __operation__; \ - \ }; \ auto kernel_func = [&](sycl::handler& cgh) { \ cgh.parallel_for>( \ @@ -1112,26 +1133,25 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) { \ DPCTLSyclQueueRef q_ref = reinterpret_cast(&DPNP_QUEUE); \ DPCTLEventVectorRef dep_event_vec_ref = nullptr; \ - DPCTLSyclEventRef event_ref = __name__<_DataType_output, _DataType_input1, _DataType_input2>( \ - q_ref, \ - result_out, \ - result_size, \ - result_ndim, \ - result_shape, \ - result_strides, \ - input1_in, \ - input1_size, \ - input1_ndim, \ - input1_shape, \ - input1_strides, \ - input2_in, \ - input2_size, \ - input2_ndim, \ - input2_shape, \ - input2_strides, \ - where, \ - dep_event_vec_ref \ - ); \ + DPCTLSyclEventRef event_ref = \ + __name__<_DataType_output, _DataType_input1, _DataType_input2>(q_ref, \ + result_out, \ + result_size, \ + result_ndim, \ + result_shape, \ + result_strides, \ + input1_in, \ + input1_size, \ + input1_ndim, \ + input1_shape, \ + input1_strides, \ + input2_in, \ + input2_size, \ + input2_ndim, \ + input2_shape, \ + input2_strides, \ + where, \ + dep_event_vec_ref); \ DPCTLEvent_WaitAndThrow(event_ref); \ DPCTLEvent_Delete(event_ref); \ } \ @@ -1172,9 +1192,8 @@ static void func_map_init_elemwise_1arg_1type(func_map_t& fmap) const shape_elem_type*, \ const shape_elem_type*, \ const size_t*, \ - const DPCTLEventVectorRef) = __name__<_DataType_output, \ - _DataType_input1, \ - _DataType_input2>; + const DPCTLEventVectorRef) = \ + __name__<_DataType_output, _DataType_input1, _DataType_input2>; #include diff --git a/dpnp/backend/src/dpnp_fptr.hpp b/dpnp/backend/src/dpnp_fptr.hpp index 4cb664858319..742e6dff3783 100644 --- a/dpnp/backend/src/dpnp_fptr.hpp +++ 
b/dpnp/backend/src/dpnp_fptr.hpp @@ -35,6 +35,8 @@ #include #include +#include + #include /** @@ -116,6 +118,31 @@ static constexpr DPNPFuncType populate_func_types() return (FT1 < FT2) ? FT2 : FT1; } +/** + * @brief A helper function to cast SYCL vector between types. + */ +template +static auto dpnp_vec_cast_impl(const Vec& v, std::index_sequence) +{ + return Op{v[I]...}; +} + +/** + * @brief A casting function for SYCL vector. + * + * @tparam dstT A result type upon casting. + * @tparam srcT An incoming type of the vector. + * @tparam N A number of elements in the vector. + * @tparam Indices A sequence of integers + * @param s An incoming SYCL vector to cast. + * @return SYCL vector casted to destination type. + */ +template > +static auto dpnp_vec_cast(const sycl::vec& s) +{ + return dpnp_vec_cast_impl, sycl::vec>(s, Indices{}); +} + /** * Removes parentheses for a passed list of types separated by comma. * It's intended to be used in operations macro. @@ -142,6 +169,12 @@ struct are_same : std::conjunction...> {}; template constexpr auto both_types_are_same = std::conjunction_v, are_same>; +/** + * A template constant to check if both types T1 and T2 match any type from Ts. + */ +template +constexpr auto both_types_are_any_of = std::conjunction_v, is_any>; + /** * A template constat to check if both types T1 and T2 don't match any type from Ts sequence. 
*/ diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index af57819827c9..feff53288cfd 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -593,11 +593,13 @@ def divide(x1, # at least either x1 or x2 has to be an array pass else: - # get a common queue to copy data from the host into a device if any input is scalar - queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None + # get USM type and queue to copy scalar from the host memory into a USM allocation + usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None) - x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue) - x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue) + x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, + alloc_usm_type=usm_type, alloc_queue=queue) + x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, + alloc_usm_type=usm_type, alloc_queue=queue) if x1_desc and x2_desc: return dpnp_divide(x1_desc, x2_desc, dtype=dtype, out=out, where=where).get_pyobj() @@ -1578,11 +1580,13 @@ def subtract(x1, # at least either x1 or x2 has to be an array pass else: - # get a common queue to copy data from the host into a device if any input is scalar - queue = get_common_allocation_queue([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else None + # get USM type and queue to copy scalar from the host memory into a USM allocation + usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None) - x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, alloc_queue=queue) - x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, 
copy_when_nondefault_queue=False, alloc_queue=queue) + x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False, + alloc_usm_type=usm_type, alloc_queue=queue) + x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False, + alloc_usm_type=usm_type, alloc_queue=queue) if x1_desc and x2_desc: if x1_desc.dtype == x2_desc.dtype == dpnp.bool: raise TypeError("DPNP boolean subtract, the `-` operator, is not supported, " diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py index 15b853b3bfa2..2f754df996a8 100644 --- a/tests/test_usm_type.py +++ b/tests/test_usm_type.py @@ -37,6 +37,32 @@ def test_coerced_usm_types_mul(usm_type_x, usm_type_y): assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) +@pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types) +@pytest.mark.parametrize("usm_type_y", list_of_usm_types, ids=list_of_usm_types) +def test_coerced_usm_types_subtract(usm_type_x, usm_type_y): + x = dp.arange(50, usm_type = usm_type_x) + y = dp.arange(50, usm_type = usm_type_y) + + z = 20 - x - y - 7.4 + + assert x.usm_type == usm_type_x + assert y.usm_type == usm_type_y + assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + + +@pytest.mark.parametrize("usm_type_x", list_of_usm_types, ids=list_of_usm_types) +@pytest.mark.parametrize("usm_type_y", list_of_usm_types, ids=list_of_usm_types) +def test_coerced_usm_types_divide(usm_type_x, usm_type_y): + x = dp.arange(120, usm_type = usm_type_x) + y = dp.arange(120, usm_type = usm_type_y) + + z = 2 / x / y / 1.5 + + assert x.usm_type == usm_type_x + assert y.usm_type == usm_type_y + assert z.usm_type == du.get_coerced_usm_type([usm_type_x, usm_type_y]) + + @pytest.mark.parametrize( "func, args", [ From a37dfdc799e886800173dd4de0dc7b40e122d303 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 13 Feb 2023 09:58:05 -0600 Subject: [PATCH 5/7] Use std::int32_t and std::int64_t types 
--- dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp index 148a0a9e646f..e345c6eefea7 100644 --- a/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp +++ b/dpnp/backend/include/dpnp_gen_2arg_3type_tbl.hpp @@ -112,7 +112,7 @@ MACRO_2ARG_3TYPES_OP(dpnp_add_c, input1_elem + input2_elem, x1 + x2, - MACRO_UNPACK_TYPES(int, long, bool), + MACRO_UNPACK_TYPES(bool, std::int32_t, std::int64_t), oneapi::mkl::vm::add, MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) @@ -133,7 +133,7 @@ MACRO_2ARG_3TYPES_OP(dpnp_copysign_c, MACRO_2ARG_3TYPES_OP(dpnp_divide_c, input1_elem / input2_elem, x1 / x2, - MACRO_UNPACK_TYPES(int, long, bool), + MACRO_UNPACK_TYPES(bool, std::int32_t, std::int64_t), oneapi::mkl::vm::div, MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) @@ -171,7 +171,7 @@ MACRO_2ARG_3TYPES_OP(dpnp_minimum_c, MACRO_2ARG_3TYPES_OP(dpnp_multiply_c, input1_elem * input2_elem, x1 * x2, - MACRO_UNPACK_TYPES(int, long, bool), + MACRO_UNPACK_TYPES(bool, std::int32_t, std::int64_t), oneapi::mkl::vm::mul, MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) @@ -185,7 +185,7 @@ MACRO_2ARG_3TYPES_OP(dpnp_power_c, MACRO_2ARG_3TYPES_OP(dpnp_subtract_c, input1_elem - input2_elem, x1 - x2, - MACRO_UNPACK_TYPES(int, long, bool), + MACRO_UNPACK_TYPES(bool, std::int32_t, std::int64_t), oneapi::mkl::vm::sub, MACRO_UNPACK_TYPES(float, double, std::complex, std::complex)) From 1c6517af0c7c59772a57a6e49e6304714db8ae46 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Tue, 14 Feb 2023 11:58:29 -0600 Subject: [PATCH 6/7] Disable floating-point optimizations that assume arguments and results are not NaNs or +-Inf --- dpnp/backend/CMakeLists.txt | 1 + utils/command_build_clib.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dpnp/backend/CMakeLists.txt 
b/dpnp/backend/CMakeLists.txt index f66aa4be1ae5..52e9cb21985b 100644 --- a/dpnp/backend/CMakeLists.txt +++ b/dpnp/backend/CMakeLists.txt @@ -93,6 +93,7 @@ string(CONCAT COMMON_COMPILE_FLAGS "-fsycl " "-fsycl-device-code-split=per_kernel " "-fno-approx-func " + "-fno-finite-math-only " ) string(CONCAT COMMON_LINK_FLAGS "-fsycl " diff --git a/utils/command_build_clib.py b/utils/command_build_clib.py index 95887cc65aaa..65673f1ae69a 100644 --- a/utils/command_build_clib.py +++ b/utils/command_build_clib.py @@ -63,7 +63,7 @@ # default variables (for Linux) _project_compiler = "icpx" _project_linker = "icpx" -_project_cmplr_flag_sycl_devel = ["-fsycl-device-code-split=per_kernel", "-fno-approx-func"] +_project_cmplr_flag_sycl_devel = ["-fsycl-device-code-split=per_kernel", "-fno-approx-func", "-fno-finite-math-only"] _project_cmplr_flag_sycl = ["-fsycl"] _project_cmplr_flag_stdcpp_static = [] # This brakes TBB ["-static-libstdc++", "-static-libgcc"] _project_cmplr_flag_compatibility = ["-Wl,--enable-new-dtags"] From 9cbf2bde1d329095e52288878ab2e5791864d0d6 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Thu, 16 Feb 2023 16:44:49 +0100 Subject: [PATCH 7/7] Fix issue with divide on Iris Xe --- dpnp/dpnp_algo/dpnp_algo.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_algo.pyx b/dpnp/dpnp_algo/dpnp_algo.pyx index 54e71b87d767..3838ab5812c6 100644 --- a/dpnp/dpnp_algo/dpnp_algo.pyx +++ b/dpnp/dpnp_algo/dpnp_algo.pyx @@ -495,22 +495,24 @@ cdef utils.dpnp_descriptor call_fptr_2in_1out_strides(DPNPFuncName fptr_name, # get FPTR function and result type cdef fptr_2in_1out_strides_t func = NULL + cdef DPNPFuncType return_type = DPNP_FT_NONE if fptr_name != DPNP_FN_DIVIDE_EXT or result_sycl_device.has_aspect_fp64: - result_type = dpnp_DPNPFuncType_to_dtype(< size_t > kernel_data.return_type) + return_type = kernel_data.return_type + func = < fptr_2in_1out_strides_t > kernel_data.ptr else: - result_type = dpnp_DPNPFuncType_to_dtype(< 
size_t > kernel_data.return_type_no_fp64) + return_type = kernel_data.return_type_no_fp64 func = < fptr_2in_1out_strides_t > kernel_data.ptr_no_fp64 if out is None: """ Create result array with type given by FPTR data """ result = utils.create_output_descriptor(result_shape, - kernel_data.return_type, + return_type, None, device=result_sycl_device, usm_type=result_usm_type, sycl_queue=result_sycl_queue) else: + result_type = dpnp_DPNPFuncType_to_dtype(< size_t > return_type) if out.dtype != result_type: utils.checker_throw_value_error(func_name, 'out.dtype', out.dtype, result_type) if out.shape != result_shape: