From 9960b60bf8024a7dc3e2560fa215da35f76b291c Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Thu, 1 Jun 2023 08:06:03 -0500
Subject: [PATCH 1/7] Improve dpnp.div() implementation:     - reuse
 dpctl.tensor.div()     - write pybind11 extension for div() from OneMKL VM

---
 dpnp/CMakeLists.txt                         |   1 +
 dpnp/backend/extensions/vm/CMakeLists.txt   |  75 +++++
 dpnp/backend/extensions/vm/div.cpp          | 307 ++++++++++++++++++++
 dpnp/backend/extensions/vm/div.hpp          |  57 ++++
 dpnp/backend/extensions/vm/types_matrix.hpp |  67 +++++
 dpnp/backend/extensions/vm/vm_py.cpp        |  71 +++++
 dpnp/dpnp_algo/dpnp_algo.pxd                |   3 -
 dpnp/dpnp_algo/dpnp_algo.pyx                |   2 +-
 dpnp/dpnp_algo/dpnp_algo_mathematical.pxi   |   9 -
 dpnp/dpnp_algo/dpnp_elementwise_common.py   |  84 ++++++
 dpnp/dpnp_iface.py                          |  27 ++
 dpnp/dpnp_iface_mathematical.py             |  28 +-
 tests/test_sycl_queue.py                    |   5 +-
 13 files changed, 708 insertions(+), 28 deletions(-)
 create mode 100644 dpnp/backend/extensions/vm/CMakeLists.txt
 create mode 100644 dpnp/backend/extensions/vm/div.cpp
 create mode 100644 dpnp/backend/extensions/vm/div.hpp
 create mode 100644 dpnp/backend/extensions/vm/types_matrix.hpp
 create mode 100644 dpnp/backend/extensions/vm/vm_py.cpp
 create mode 100644 dpnp/dpnp_algo/dpnp_elementwise_common.py

diff --git a/dpnp/CMakeLists.txt b/dpnp/CMakeLists.txt
index 54be4eb23b9a..89524ab1c58d 100644
--- a/dpnp/CMakeLists.txt
+++ b/dpnp/CMakeLists.txt
@@ -48,6 +48,7 @@ endfunction()
 build_dpnp_cython_ext_with_backend(dparray ${CMAKE_CURRENT_SOURCE_DIR}/dparray.pyx dpnp)
 add_subdirectory(backend)
 add_subdirectory(backend/extensions/lapack)
+add_subdirectory(backend/extensions/vm)
 
 add_subdirectory(dpnp_algo)
 add_subdirectory(dpnp_utils)
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
new file mode 100644
index 000000000000..8f3086ec3a9e
--- /dev/null
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -0,0 +1,75 @@
+# *****************************************************************************
+# Copyright (c) 2023, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+set(python_module_name _vm_impl)
+pybind11_add_module(${python_module_name} MODULE
+    vm_py.cpp
+    div.cpp
+)
+
+if (WIN32)
+    if (${CMAKE_VERSION} VERSION_LESS "3.27")
+        # this is a work-around for target_link_options inserting option after -link option, cause
+        # linker to ignore it.
+        set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel")
+    endif()
+endif()
+
+set_target_properties(${python_module_name} PROPERTIES CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
+target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
+
+target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
+target_include_directories(${python_module_name} PUBLIC ${Dpctl_TENSOR_INCLUDE_DIR})
+
+if (WIN32)
+  target_compile_options(${python_module_name} PRIVATE
+    /clang:-fno-approx-func
+    /clang:-fno-finite-math-only
+    )
+else()
+  target_compile_options(${python_module_name} PRIVATE
+    -fno-approx-func
+    -fno-finite-math-only
+    )
+endif()
+
+target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
+if (UNIX)
+    # this option is support on Linux only
+    target_link_options(${python_module_name} PUBLIC -fsycl-link-huge-device-code)
+endif()
+
+if (DPNP_GENERATE_COVERAGE)
+    target_link_options(${python_module_name} PRIVATE -fprofile-instr-generate -fcoverage-mapping)
+endif()
+
+target_link_libraries(${python_module_name} PUBLIC MKL::MKL_DPCPP)
+
+install(TARGETS ${python_module_name}
+  DESTINATION "dpnp/backend/extensions/vm"
+)
diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp
new file mode 100644
index 000000000000..48b64959b88a
--- /dev/null
+++ b/dpnp/backend/extensions/vm/div.cpp
@@ -0,0 +1,307 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+
+#include <pybind11/pybind11.h>
+
+// dpctl tensor headers
+#include "utils/memory_overlap.hpp"
+#include "utils/type_utils.hpp"
+
+#include "div.hpp"
+#include "types_matrix.hpp"
+
+#include "dpnp_utils.hpp"
+
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace vm
+{
+
+namespace mkl_vm = oneapi::mkl::vm;
+namespace py = pybind11;
+namespace type_utils = dpctl::tensor::type_utils;
+
+typedef sycl::event (*div_impl_fn_ptr_t)(
+    sycl::queue, const std::int64_t, const char*, const char*, char*, const std::vector<sycl::event>&);
+
+static div_impl_fn_ptr_t div_dispatch_vector[dpctl_td_ns::num_types];
+
+template <typename T>
+static sycl::event div_impl(sycl::queue exec_q,
+                            const std::int64_t n,
+                            const char* in_a,
+                            const char* in_b,
+                            char* out_y,
+                            const std::vector<sycl::event>& depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+
+    const T* a = reinterpret_cast<const T*>(in_a);
+    const T* b = reinterpret_cast<const T*>(in_b);
+    T* y = reinterpret_cast<T*>(out_y);
+
+    return mkl_vm::div(exec_q,
+                       n, // number of elements to be calculated
+                       a, // pointer `a` containing 1st input vector of size n
+                       b, // pointer `b` containing 2nd input vector of size n
+                       y, // pointer `y` to the output vector of size n
+                       depends);
+}
+
+std::pair<sycl::event, sycl::event> div(sycl::queue exec_q,
+                                        dpctl::tensor::usm_ndarray src1,
+                                        dpctl::tensor::usm_ndarray src2,
+                                        dpctl::tensor::usm_ndarray dst, // dst = op(src1, src2), elementwise
+                                        const std::vector<sycl::event>& depends)
+{
+    // check type_nums
+    int src1_typenum = src1.get_typenum();
+    int src2_typenum = src2.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum);
+    int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src1_typeid != src2_typeid || src2_typeid != dst_typeid)
+    {
+        throw py::value_error("Either any of input arrays or output array have different types");
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst}))
+    {
+        throw py::value_error("Execution queue is not compatible with allocation queues");
+    }
+
+    // check shapes, broadcasting is assumed done by caller
+    // check that dimensions are the same
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != src1.get_ndim() || dst_nd != src2.get_ndim())
+    {
+        throw py::value_error("Array dimensions are not the same.");
+    }
+
+    // check that shapes are the same
+    const py::ssize_t* src1_shape = src1.get_shape_raw();
+    const py::ssize_t* src2_shape = src2.get_shape_raw();
+    const py::ssize_t* dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    size_t src_nelems(1);
+
+    for (int i = 0; i < dst_nd; ++i)
+    {
+        src_nelems *= static_cast<size_t>(src1_shape[i]);
+        shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] && src2_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal)
+    {
+        throw py::value_error("Array shapes are not the same.");
+    }
+
+    // if nelems is zero, return
+    if (src_nelems == 0)
+    {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // ensure that output is ample enough to accomodate all elements
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accomodate all elements
+    {
+        size_t range = static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < src_nelems)
+        {
+            throw py::value_error(
+                "Destination array can not accomodate all the "
+                "elements of source array.");
+        }
+    }
+
+    // check memory overlap
+    auto const& overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src1, dst) || overlap(src2, dst))
+    {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    const char* src1_data = src1.get_data();
+    const char* src2_data = src2.get_data();
+    char* dst_data = dst.get_data();
+
+    // handle contiguous inputs
+    bool is_src1_c_contig = src1.is_c_contiguous();
+    bool is_src2_c_contig = src2.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    bool all_c_contig = (is_src1_c_contig && is_src2_c_contig && is_dst_c_contig);
+    if (!all_c_contig)
+    {
+        throw py::value_error("Input and outpur arrays must be C-contiguous");
+    }
+
+    auto div_fn = div_dispatch_vector[dst_typeid];
+    if (div_fn == nullptr)
+    {
+        throw py::value_error("No div implementation defined");
+    }
+    sycl::event sum_ev = div_fn(exec_q, src_nelems, src1_data, src2_data, dst_data, depends);
+
+    sycl::event ht_ev = dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, {sum_ev});
+    return std::make_pair(ht_ev, sum_ev);
+}
+
+bool can_call_div(sycl::queue exec_q,
+                  dpctl::tensor::usm_ndarray src1,
+                  dpctl::tensor::usm_ndarray src2,
+                  dpctl::tensor::usm_ndarray dst)
+{
+    // check type_nums
+    int src1_typenum = src1.get_typenum();
+    int src2_typenum = src2.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto array_types = dpctl_td_ns::usm_ndarray_types();
+    int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum);
+    int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // types must be the same
+    if (src1_typeid != src2_typeid || src2_typeid != dst_typeid)
+    {
+        return false;
+    }
+
+    // OneMKL VM functions perform a copy on host if no double type support
+    if (!exec_q.get_device().has(sycl::aspect::fp64))
+    {
+        return false;
+    }
+
+    // check that queues are compatible
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst}))
+    {
+        return false;
+    }
+
+    // dimensions must be the same
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != src1.get_ndim() || dst_nd != src2.get_ndim())
+    {
+        return false;
+    }
+
+    // shapes must be the same
+    const py::ssize_t* src1_shape = src1.get_shape_raw();
+    const py::ssize_t* src2_shape = src2.get_shape_raw();
+    const py::ssize_t* dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    size_t src_nelems(1);
+
+    for (int i = 0; i < dst_nd; ++i)
+    {
+        src_nelems *= static_cast<size_t>(src1_shape[i]);
+        shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] && src2_shape[i] == dst_shape[i]);
+    }
+    if (!shapes_equal)
+    {
+        return false;
+    }
+
+    // if nelems is zero, return false
+    if (src_nelems == 0)
+    {
+        return false;
+    }
+
+    // ensure that output is ample enough to accomodate all elements
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accomodate all elements
+    {
+        size_t range = static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < src_nelems)
+        {
+            return false;
+        }
+    }
+
+    // check memory overlap
+    auto const& overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src1, dst) || overlap(src2, dst))
+    {
+        return false;
+    }
+
+    // suppport only contiguous inputs
+    bool is_src1_c_contig = src1.is_c_contiguous();
+    bool is_src2_c_contig = src2.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    bool all_c_contig = (is_src1_c_contig && is_src2_c_contig && is_dst_c_contig);
+    if (!all_c_contig)
+    {
+        return false;
+    }
+
+    // MKL function is not defined for the type
+    if (div_dispatch_vector[src1_typeid] == nullptr)
+    {
+        return false;
+    }
+    return true;
+}
+
+template <typename fnT, typename T>
+struct DivContigFactory
+{
+    fnT get()
+    {
+        if constexpr (types::DivTypePairSupportFactory<T>::is_defined)
+        {
+            return div_impl<T>;
+        }
+        else
+        {
+            return nullptr;
+        }
+    }
+};
+
+void init_div_dispatch_vector(void)
+{
+    dpctl_td_ns::DispatchVectorBuilder<div_impl_fn_ptr_t, DivContigFactory, dpctl_td_ns::num_types> contig;
+    contig.populate_dispatch_vector(div_dispatch_vector);
+}
+} // namespace vm
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/vm/div.hpp b/dpnp/backend/extensions/vm/div.hpp
new file mode 100644
index 000000000000..d530a156aa4a
--- /dev/null
+++ b/dpnp/backend/extensions/vm/div.hpp
@@ -0,0 +1,57 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#include <dpctl4pybind11.hpp>
+#include <pybind11/numpy.h>
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace vm
+{
+    extern std::pair<sycl::event, sycl::event> div(sycl::queue exec_q,
+                                                   dpctl::tensor::usm_ndarray src1,
+                                                   dpctl::tensor::usm_ndarray src2,
+                                                   dpctl::tensor::usm_ndarray dst,
+                                                   const std::vector<sycl::event>& depends);
+
+    extern bool can_call_div(sycl::queue exec_q,
+                             dpctl::tensor::usm_ndarray src1,
+                             dpctl::tensor::usm_ndarray src2,
+                             dpctl::tensor::usm_ndarray dst);
+
+    extern void init_div_dispatch_vector(void);
+} // namespace vm
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/vm/types_matrix.hpp b/dpnp/backend/extensions/vm/types_matrix.hpp
new file mode 100644
index 000000000000..af7c9c47e731
--- /dev/null
+++ b/dpnp/backend/extensions/vm/types_matrix.hpp
@@ -0,0 +1,67 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <type_traits>
+
+// dpctl tensor headers
+#include "utils/type_dispatch.hpp"
+
+// dpctl namespace for operations with types
+namespace dpctl_td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace vm
+{
+namespace types
+{
+    /**
+     * @brief A factory to define pairs of supported types for which
+     * MKL VM library provides support in oneapi::mkl::vm::div<T> function.
+     *
+     * @tparam T Type of input vectors `a` and `b` and of result vector `y`.
+     */
+    template <typename T>
+    struct DivTypePairSupportFactory
+    {
+        static constexpr bool is_defined = std::disjunction<
+            dpctl_td_ns::TypePairDefinedEntry<T, std::complex<double>, T, std::complex<double>>,
+            dpctl_td_ns::TypePairDefinedEntry<T, std::complex<float>, T, std::complex<float>>,
+            dpctl_td_ns::TypePairDefinedEntry<T, double, T, double>,
+            dpctl_td_ns::TypePairDefinedEntry<T, float, T, float>,
+            // fall-through
+            dpctl_td_ns::NotDefinedEntry>::is_defined;
+    };
+} // namespace types
+} // namespace vm
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/vm/vm_py.cpp b/dpnp/backend/extensions/vm/vm_py.cpp
new file mode 100644
index 000000000000..9a040b1f59b0
--- /dev/null
+++ b/dpnp/backend/extensions/vm/vm_py.cpp
@@ -0,0 +1,71 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+// This file defines functions of dpnp.backend._lapack_impl extensions
+//
+//*****************************************************************************
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "div.hpp"
+
+namespace vm_ext = dpnp::backend::ext::vm;
+namespace py = pybind11;
+
+// populate dispatch vectors
+void init_dispatch_vectors(void)
+{
+    vm_ext::init_div_dispatch_vector();
+}
+
+// populate dispatch tables
+void init_dispatch_tables(void)
+{
+}
+
+PYBIND11_MODULE(_vm_impl, m)
+{
+    init_dispatch_vectors();
+    init_dispatch_tables();
+
+    m.def("_div",
+          &vm_ext::div,
+          "Call `div` from OneMKL VM library to performs element by element division "
+          "of vector `src1` by vector `src2` to resulting vector `dst`",
+          py::arg("sycl_queue"),
+          py::arg("src1"),
+          py::arg("src2"),
+          py::arg("dst"),
+          py::arg("depends") = py::list());
+
+    m.def("_can_call_div",
+          &vm_ext::can_call_div,
+          "Check input arrays to answer if `div` function from OneMKL VM library can be used",
+          py::arg("sycl_queue"),
+          py::arg("src1"),
+          py::arg("src2"),
+          py::arg("dst"));
+}
diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd
index 56195613a338..0120aa6b453a 100644
--- a/dpnp/dpnp_algo/dpnp_algo.pxd
+++ b/dpnp/dpnp_algo/dpnp_algo.pxd
@@ -118,7 +118,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName":  # need this na
         DPNP_FN_DIAGONAL
         DPNP_FN_DIAGONAL_EXT
         DPNP_FN_DIVIDE
-        DPNP_FN_DIVIDE_EXT
         DPNP_FN_DOT
         DPNP_FN_DOT_EXT
         DPNP_FN_EDIFF1D
@@ -531,8 +530,6 @@ cpdef dpnp_descriptor dpnp_add(dpnp_descriptor x1_obj, dpnp_descriptor x2_obj, o
                                dpnp_descriptor out=*, object where=*)
 cpdef dpnp_descriptor dpnp_arctan2(dpnp_descriptor x1_obj, dpnp_descriptor x2_obj, object dtype=*,
                                    dpnp_descriptor out=*, object where=*)
-cpdef dpnp_descriptor dpnp_divide(dpnp_descriptor x1_obj, dpnp_descriptor x2_obj, object dtype=*,
-                                  dpnp_descriptor out=*, object where=*)
 cpdef dpnp_descriptor dpnp_hypot(dpnp_descriptor x1_obj, dpnp_descriptor x2_obj, object dtype=*,
                                  dpnp_descriptor out=*, object where=*)
 cpdef dpnp_descriptor dpnp_maximum(dpnp_descriptor x1_obj, dpnp_descriptor x2_obj, object dtype=*,
diff --git a/dpnp/dpnp_algo/dpnp_algo.pyx b/dpnp/dpnp_algo/dpnp_algo.pyx
index 24abd1b4b9e4..8b6d4be73e94 100644
--- a/dpnp/dpnp_algo/dpnp_algo.pyx
+++ b/dpnp/dpnp_algo/dpnp_algo.pyx
@@ -499,7 +499,7 @@ cdef utils.dpnp_descriptor call_fptr_2in_1out_strides(DPNPFuncName fptr_name,
     # get FPTR function and return type
     cdef fptr_2in_1out_strides_t func = NULL
     cdef DPNPFuncType return_type = DPNP_FT_NONE
-    if fptr_name != DPNP_FN_DIVIDE_EXT or result_sycl_device.has_aspect_fp64:
+    if result_sycl_device.has_aspect_fp64:
         return_type = kernel_data.return_type
         func = < fptr_2in_1out_strides_t > kernel_data.ptr
     else:
diff --git a/dpnp/dpnp_algo/dpnp_algo_mathematical.pxi b/dpnp/dpnp_algo/dpnp_algo_mathematical.pxi
index b5534c7c1a8e..3a002fdd4ba7 100644
--- a/dpnp/dpnp_algo/dpnp_algo_mathematical.pxi
+++ b/dpnp/dpnp_algo/dpnp_algo_mathematical.pxi
@@ -47,7 +47,6 @@ __all__ += [
     "dpnp_cumprod",
     "dpnp_cumsum",
     "dpnp_diff",
-    "dpnp_divide",
     "dpnp_ediff1d",
     "dpnp_fabs",
     "dpnp_floor",
@@ -249,14 +248,6 @@ cpdef utils.dpnp_descriptor dpnp_diff(utils.dpnp_descriptor x1, int n):
     return dpnp_diff(res, n - 1)
 
 
-cpdef utils.dpnp_descriptor dpnp_divide(utils.dpnp_descriptor x1_obj,
-                                        utils.dpnp_descriptor x2_obj,
-                                        object dtype=None,
-                                        utils.dpnp_descriptor out=None,
-                                        object where=True):
-    return call_fptr_2in_1out_strides(DPNP_FN_DIVIDE_EXT, x1_obj, x2_obj, dtype, out, where)
-
-
 cpdef utils.dpnp_descriptor dpnp_ediff1d(utils.dpnp_descriptor x1):
 
     if x1.size <= 1:
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
new file mode 100644
index 000000000000..15481941f85f
--- /dev/null
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -0,0 +1,84 @@
+# cython: language_level=3
+# distutils: language = c++
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2023, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+import dpnp
+from dpnp.dpnp_array import dpnp_array
+import dpnp.backend.extensions.vm._vm_impl as vmi
+
+from dpctl.tensor._elementwise_common import (
+    BinaryElementwiseFunc
+)
+import dpctl.tensor._tensor_impl as ti
+
+
+__all__ = [
+    "dpnp_divide"
+]
+
+
+_divide_docstring_ = """
+divide(x1, x2, out=None, order='K')
+
+Calculates the ratio for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (dpnp.ndarray):
+        First input array, expected to have numeric data type.
+    x2 (dpnp.ndarray):
+        Second input array, also expected to have numeric data type.
+Returns:
+    dpnp.ndarray:
+        an array containing the result of element-wise division. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+
+def dpnp_divide(x1, x2, out=None, order='K'):
+    """
+    Invokes div() function from pybind11 extension of OneMKL VM if possible.
+    Otherwise fully relies on dpctl.tensor implementation for divide() function.
+
+    """
+
+    def _call_divide(src1, src2, dst, sycl_queue, depends=[]):
+        """A callback to register in BinaryElementwiseFunc class of dpctl.tensor"""
+
+        if vmi._can_call_div(sycl_queue, src1, src2, dst):
+            # call pybind11 extension for div() function from OneMKL VM
+            return vmi._div(sycl_queue, src1, src2, dst, depends)
+        return ti._divide(src1, src2, dst, sycl_queue, depends)
+
+    # dpctl.tensor only works with usm_ndarray or scalar
+    x1_usm_or_scalar = dpnp.get_usm_ndarray_or_scalar(x1)
+    x2_usm_or_scalar = dpnp.get_usm_ndarray_or_scalar(x2)
+    out_usm = None if out is None else dpnp.get_usm_ndarray(out)
+
+    func = BinaryElementwiseFunc("divide", ti._divide_result_type, _call_divide, _divide_docstring_)
+    res_usm = func(x1_usm_or_scalar, x2_usm_or_scalar, out=out_usm, order=order)
+    return dpnp_array._create_from_usm_ndarray(res_usm)
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index ce3c540539d6..93629440e7d2 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -69,6 +69,7 @@
     "get_include",
     "get_normalized_queue_device",
     "get_usm_ndarray",
+    "get_usm_ndarray_or_scalar",
     "is_supported_array_type"
 ]
 
@@ -404,6 +405,32 @@ def get_usm_ndarray(a):
     raise TypeError("An array must be any of supported type, but got {}".format(type(a)))
 
 
+def get_usm_ndarray_or_scalar(a):
+    """
+    Return scalar or :class:`dpctl.tensor.usm_ndarray` from input object `a`.
+
+    Parameters
+    ----------
+    a : {scalar, dpnp_array, usm_ndarray}
+        Input of any supported type: scalar, :class:`dpnp.ndarray`
+        or :class:`dpctl.tensor.usm_ndarray`.
+
+    Returns
+    -------
+    out : scalar, usm_ndarray
+        A scalar if the input `a` is scalar.
+        A dpctl USM ndarray if the input `a` is array.
+
+    Raises
+    ------
+    TypeError
+        If input parameter `a` is of unsupported object type.
+
+    """
+
+    return a if isscalar(a) else get_usm_ndarray(a)
+
+
 def is_supported_array_type(a):
     """
     Return ``True`` if an array of either type :class:`dpnp.ndarray`
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 98dcc71d31af..a7f5a7e8a0cb 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -40,8 +40,11 @@
 """
 
 
-from dpnp.dpnp_algo import *
-from dpnp.dpnp_utils import *
+from .dpnp_algo import *
+from .dpnp_algo.dpnp_elementwise_common import (
+    dpnp_divide
+)
+from .dpnp_utils import *
 
 import dpnp
 
@@ -586,6 +589,7 @@ def divide(x1,
            out=None,
            *,
            where=True,
+           order='K',
            dtype=None,
            subok=True,
            **kwargs):
@@ -617,28 +621,24 @@ def divide(x1,
 
     """
 
-    if out is not None:
-        pass
-    elif where is not True:
+    if where is not True:
         pass
     elif dtype is not None:
         pass
     elif subok is not True:
         pass
+    elif kwargs:
+        pass
     elif dpnp.isscalar(x1) and dpnp.isscalar(x2):
         # at least either x1 or x2 has to be an array
         pass
     else:
-        # get USM type and queue to copy scalar from the host memory into a USM allocation
-        usm_type, queue = get_usm_allocations([x1, x2]) if dpnp.isscalar(x1) or dpnp.isscalar(x2) else (None, None)
-
-        x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_strides=False, copy_when_nondefault_queue=False,
-                                           alloc_usm_type=usm_type, alloc_queue=queue)
-        x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_strides=False, copy_when_nondefault_queue=False,
-                                           alloc_usm_type=usm_type, alloc_queue=queue)
-        if x1_desc and x2_desc:
-            return dpnp_divide(x1_desc, x2_desc, dtype=dtype, out=out, where=where).get_pyobj()
+        if order in "afkcAFKC":
+            order = order.upper()
+        else:
+            raise ValueError("order must be one of 'C', 'F', 'A', or 'K' (got '{}')".format(order))
 
+        return dpnp_divide(x1, x2, out=out, order=order)
     return call_origin(numpy.divide, x1, x2, out=out, where=where, dtype=dtype, subok=subok, **kwargs)
 
 
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
index e5e53646e1a1..3182003a90be 100644
--- a/tests/test_sycl_queue.py
+++ b/tests/test_sycl_queue.py
@@ -6,6 +6,9 @@
 
 import dpnp
 import dpctl
+from dpctl.utils import (
+    ExecutionPlacementError
+)
 import numpy
 
 from numpy.testing import (
@@ -431,7 +434,7 @@ def test_broadcasting(func, data1, data2, device):
 def test_2in_1out_diff_queue_but_equal_context(func, device):
     x1 = dpnp.arange(10)
     x2 = dpnp.arange(10, sycl_queue=dpctl.SyclQueue(device))[::-1]
-    with assert_raises(ValueError):
+    with assert_raises((ValueError, ExecutionPlacementError)):
         getattr(dpnp, func)(x1, x2)
 
 

From 9cd4eac735b0bc921317811a25a392fda82326e5 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Tue, 6 Jun 2023 09:51:53 -0500
Subject: [PATCH 2/7] Add support of order=None

---
 dpnp/dpnp_iface_mathematical.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index a7f5a7e8a0cb..9e877703b409 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -635,6 +635,8 @@ def divide(x1,
     else:
         if order in "afkcAFKC":
             order = order.upper()
+        elif order is None:
+            order = 'K'
         else:
             raise ValueError("order must be one of 'C', 'F', 'A', or 'K' (got '{}')".format(order))
 

From 08b10cf1203b5cefb0dfe2ef13bac008468e9648 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Wed, 7 Jun 2023 11:58:52 -0500
Subject: [PATCH 3/7] add docstrings for out and order

---
 dpnp/dpnp_algo/dpnp_elementwise_common.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index 15481941f85f..9c2383e57a0b 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -53,6 +53,12 @@
         First input array, expected to have numeric data type.
     x2 (dpnp.ndarray):
         Second input array, also expected to have numeric data type.
+    out ({None, dpnp.ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", None, optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
 Returns:
     dpnp.ndarray:
         an array containing the result of element-wise division. The data type

From ee09d7b8740c3052c10d8672fc0c029229a13b0d Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 9 Jun 2023 09:15:40 -0500
Subject: [PATCH 4/7] Fix hanging in tril

---
 dpnp/backend/extensions/vm/div.cpp | 5 +++++
 tests/test_mathematical.py         | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp
index 48b64959b88a..6e98e09f3475 100644
--- a/dpnp/backend/extensions/vm/div.cpp
+++ b/dpnp/backend/extensions/vm/div.cpp
@@ -219,6 +219,11 @@ bool can_call_div(sycl::queue exec_q,
     {
         return false;
     }
+    else if (dst_nd == 0)
+    {
+        // don't call OneMKL for 0d arrays
+        return false;
+    }
 
     // shapes must be the same
     const py::ssize_t* src1_shape = src1.get_shape_raw();
diff --git a/tests/test_mathematical.py b/tests/test_mathematical.py
index 5f0d73b23b7b..6224eb57ab81 100644
--- a/tests/test_mathematical.py
+++ b/tests/test_mathematical.py
@@ -284,7 +284,7 @@ def test_divide_scalar(shape, dtype):
 
     result = 0.5 / dpnp_a / 1.7
     expected = 0.5 / np_a / 1.7
-    assert_allclose(result, expected)
+    assert_allclose(result, expected, rtol=1e-6)
 
 
 @pytest.mark.parametrize("shape",

From 05c7d776011d2a1e1f4ae53a314f7a6635450eba Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 9 Jun 2023 12:10:24 -0500
Subject: [PATCH 5/7] Remove w/a in divide tests for type aligning with numpy

---
 tests/third_party/cupy/math_tests/test_arithmetic.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tests/third_party/cupy/math_tests/test_arithmetic.py b/tests/third_party/cupy/math_tests/test_arithmetic.py
index 39dc3e10f721..71f33429c704 100644
--- a/tests/third_party/cupy/math_tests/test_arithmetic.py
+++ b/tests/third_party/cupy/math_tests/test_arithmetic.py
@@ -168,13 +168,6 @@ def check_binary(self, xp):
                         y = y.astype(dtype1)
                     elif is_array_arg2 and not is_array_arg1:
                         y = y.astype(dtype2)
-            elif self.name in ('divide', 'true_divide'):
-                # If one input is an array of float32 and another - an integer or floating scalar,
-                # NumPy will return an output array of float32, while DPNP will return the array of float64,
-                # since NumPy would use the same float64 type when instead of scalar here is array of integer of floating type.
-                if not (is_array_arg1 and is_array_arg2):
-                    if (is_array_arg1 and arg1.dtype == numpy.float32) ^ (is_array_arg2 and arg2.dtype == numpy.float32):
-                        y = y.astype(numpy.float32)
 
         # NumPy returns different values (nan/inf) on division by zero
         # depending on the architecture.

From 864a620536989d1620a9ca9bc78b84752298a7a5 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Fri, 9 Jun 2023 14:46:37 -0500
Subject: [PATCH 6/7] only events from host task has to be passed in
 keep_args_alive()

---
 .github/workflows/conda-package.yml       |  8 +++-
 .github/workflows/generate_coverage.yaml  |  1 +
 dpnp/backend/extensions/vm/CMakeLists.txt | 13 ++++++
 dpnp/backend/extensions/vm/div.cpp        | 56 ++++++++++++++++++++---
 4 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index e42adbdc913d..5645191b7c6e 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -194,8 +194,10 @@ jobs:
       # TODO: run the whole scope once the issues on CPU are resolved
       - name: Run tests
         run: |
-          python -m pytest -q -ra --disable-warnings -vv ${{ env.TEST_SCOPE }}
+          python -m pytest -q -ra --disable-warnings -vv -s ${{ env.TEST_SCOPE }}
         working-directory: ${{ env.tests-path }}
+        env:
+          SYCL_QUEUE_THREAD_POOL_SIZE: 16
 
   test_windows:
     name: Test ['windows-latest', python='${{ matrix.python }}']
@@ -331,8 +333,10 @@ jobs:
       # TODO: run the whole scope once the issues on CPU are resolved
       - name: Run tests
         run: |
-          python -m pytest -q -ra --disable-warnings -vv ${{ env.TEST_SCOPE }}
+          python -m pytest -q -ra --disable-warnings -vv -s ${{ env.TEST_SCOPE }}
         working-directory: ${{ env.tests-path }}
+        env:
+          SYCL_QUEUE_THREAD_POOL_SIZE: 16
 
   upload:
     name: Upload ['${{ matrix.os }}', python='${{ matrix.python }}']
diff --git a/.github/workflows/generate_coverage.yaml b/.github/workflows/generate_coverage.yaml
index bd3c8c366f0e..75feca67e560 100644
--- a/.github/workflows/generate_coverage.yaml
+++ b/.github/workflows/generate_coverage.yaml
@@ -66,6 +66,7 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           COVERALLS_PARALLEL: true
+          SYCL_QUEUE_THREAD_POOL_SIZE: 6
 
   coveralls:
     name: Indicate completion to coveralls.io
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 8f3086ec3a9e..07a4ffae8aba 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -55,7 +55,9 @@ else()
   target_compile_options(${python_module_name} PRIVATE
     -fno-approx-func
     -fno-finite-math-only
+    -no-ipo
     )
+    target_link_options(${python_module_name} PRIVATE -no-ipo)
 endif()
 
 target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
@@ -70,6 +72,17 @@ endif()
 
 target_link_libraries(${python_module_name} PUBLIC MKL::MKL_DPCPP)
 
+target_link_libraries(${python_module_name} PUBLIC oneDPL)
+
+if (UNIX)
+  # needed for STL headers with GCC < 11
+  target_compile_definitions(${python_module_name} PUBLIC _GLIBCXX_USE_TBB_PAR_BACKEND=0)
+endif()
+
+target_compile_definitions(${python_module_name} PUBLIC PSTL_USE_PARALLEL_POLICIES=0)
+# work-around for Windows at exit crash with predefined policies
+target_compile_definitions(${python_module_name} PUBLIC ONEDPL_USE_PREDEFINED_POLICIES=0)
+
 install(TARGETS ${python_module_name}
   DESTINATION "dpnp/backend/extensions/vm"
 )
diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp
index 6e98e09f3475..8a6751a45a44 100644
--- a/dpnp/backend/extensions/vm/div.cpp
+++ b/dpnp/backend/extensions/vm/div.cpp
@@ -64,16 +64,46 @@ static sycl::event div_impl(sycl::queue exec_q,
 {
     type_utils::validate_type_for_device<T>(exec_q);
 
-    const T* a = reinterpret_cast<const T*>(in_a);
-    const T* b = reinterpret_cast<const T*>(in_b);
-    T* y = reinterpret_cast<T*>(out_y);
+    std::cerr << "enter div_impl" << std::endl;
 
-    return mkl_vm::div(exec_q,
+    const T* _a = reinterpret_cast<const T*>(in_a);
+    const T* _b = reinterpret_cast<const T*>(in_b);
+    T* _y = reinterpret_cast<T*>(out_y);
+
+    std::cerr << "casting is done" << std::endl;
+
+    T* a = sycl::malloc_device<T>(n, exec_q);
+    T* b = sycl::malloc_device<T>(n, exec_q);
+    T* y = sycl::malloc_device<T>(n, exec_q);
+
+    std::cerr << "malloc is done" << std::endl;
+
+    exec_q.copy(_a, a, n).wait();
+    exec_q.copy(_b, b, n).wait();
+    exec_q.copy(_y, y, n).wait();
+
+    std::cerr << "copy is done" << std::endl;
+
+    sycl::event ev = mkl_vm::div(exec_q,
                        n, // number of elements to be calculated
                        a, // pointer `a` containing 1st input vector of size n
                        b, // pointer `b` containing 2nd input vector of size n
                        y, // pointer `y` to the output vector of size n
                        depends);
+    ev.wait();
+
+    std::cerr << "div is done" << std::endl;
+
+    exec_q.copy(y, _y, n).wait();
+
+    std::cerr << "copy is done" << std::endl;
+
+    sycl::free(a, exec_q);
+    sycl::free(b, exec_q);
+    sycl::free(y, exec_q);
+
+    std::cerr << "leaving div_impl" << std::endl;
+    return sycl::event();
 }
 
 std::pair<sycl::event, sycl::event> div(sycl::queue exec_q,
@@ -175,9 +205,21 @@ std::pair<sycl::event, sycl::event> div(sycl::queue exec_q,
         throw py::value_error("No div implementation defined");
     }
     sycl::event sum_ev = div_fn(exec_q, src_nelems, src1_data, src2_data, dst_data, depends);
-
-    sycl::event ht_ev = dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, {sum_ev});
-    return std::make_pair(ht_ev, sum_ev);
+    // sum_ev.wait();
+
+    // int* dummy = sycl::malloc_device<int>(1, exec_q);
+    // sycl::event cleanup_ev = exec_q.submit([&](sycl::handler& cgh) {
+    //     // cgh.depends_on(sum_ev);
+    //     auto ctx = exec_q.get_context();
+    //     cgh.host_task([dummy, ctx]() {
+    //         // dummy host task to pass into keep_args_alive
+    //         sycl::free(dummy, ctx);
+    //     });
+    // });
+
+    // sycl::event ht_ev = dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, {sum_ev});
+    // return std::make_pair(ht_ev, sum_ev);
+    return std::make_pair(sycl::event(), sycl::event());
 }
 
 bool can_call_div(sycl::queue exec_q,

From 1a07356e8182b646128fb8b0bd6004870af19956 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Mon, 12 Jun 2023 08:24:36 -0500
Subject: [PATCH 7/7] exclude call of div() with MKL 2023.1.0

---
 .github/workflows/conda-package.yml       |  8 +--
 dpnp/backend/extensions/vm/CMakeLists.txt | 13 -----
 dpnp/backend/extensions/vm/div.cpp        | 66 +++++++----------------
 3 files changed, 22 insertions(+), 65 deletions(-)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 5645191b7c6e..fbfe66ff17b9 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -194,10 +194,10 @@ jobs:
       # TODO: run the whole scope once the issues on CPU are resolved
       - name: Run tests
         run: |
-          python -m pytest -q -ra --disable-warnings -vv -s ${{ env.TEST_SCOPE }}
+          python -m pytest -q -ra --disable-warnings -vv ${{ env.TEST_SCOPE }}
         working-directory: ${{ env.tests-path }}
         env:
-          SYCL_QUEUE_THREAD_POOL_SIZE: 16
+          SYCL_QUEUE_THREAD_POOL_SIZE: 6
 
   test_windows:
     name: Test ['windows-latest', python='${{ matrix.python }}']
@@ -333,10 +333,10 @@ jobs:
       # TODO: run the whole scope once the issues on CPU are resolved
       - name: Run tests
         run: |
-          python -m pytest -q -ra --disable-warnings -vv -s ${{ env.TEST_SCOPE }}
+          python -m pytest -q -ra --disable-warnings -vv ${{ env.TEST_SCOPE }}
         working-directory: ${{ env.tests-path }}
         env:
-          SYCL_QUEUE_THREAD_POOL_SIZE: 16
+          SYCL_QUEUE_THREAD_POOL_SIZE: 6
 
   upload:
     name: Upload ['${{ matrix.os }}', python='${{ matrix.python }}']
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 07a4ffae8aba..8f3086ec3a9e 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -55,9 +55,7 @@ else()
   target_compile_options(${python_module_name} PRIVATE
     -fno-approx-func
     -fno-finite-math-only
-    -no-ipo
     )
-    target_link_options(${python_module_name} PRIVATE -no-ipo)
 endif()
 
 target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
@@ -72,17 +70,6 @@ endif()
 
 target_link_libraries(${python_module_name} PUBLIC MKL::MKL_DPCPP)
 
-target_link_libraries(${python_module_name} PUBLIC oneDPL)
-
-if (UNIX)
-  # needed for STL headers with GCC < 11
-  target_compile_definitions(${python_module_name} PUBLIC _GLIBCXX_USE_TBB_PAR_BACKEND=0)
-endif()
-
-target_compile_definitions(${python_module_name} PUBLIC PSTL_USE_PARALLEL_POLICIES=0)
-# work-around for Windows at exit crash with predefined policies
-target_compile_definitions(${python_module_name} PUBLIC ONEDPL_USE_PREDEFINED_POLICIES=0)
-
 install(TARGETS ${python_module_name}
   DESTINATION "dpnp/backend/extensions/vm"
 )
diff --git a/dpnp/backend/extensions/vm/div.cpp b/dpnp/backend/extensions/vm/div.cpp
index 8a6751a45a44..28fbe1cdf3cc 100644
--- a/dpnp/backend/extensions/vm/div.cpp
+++ b/dpnp/backend/extensions/vm/div.cpp
@@ -64,46 +64,16 @@ static sycl::event div_impl(sycl::queue exec_q,
 {
     type_utils::validate_type_for_device<T>(exec_q);
 
-    std::cerr << "enter div_impl" << std::endl;
+    const T* a = reinterpret_cast<const T*>(in_a);
+    const T* b = reinterpret_cast<const T*>(in_b);
+    T* y = reinterpret_cast<T*>(out_y);
 
-    const T* _a = reinterpret_cast<const T*>(in_a);
-    const T* _b = reinterpret_cast<const T*>(in_b);
-    T* _y = reinterpret_cast<T*>(out_y);
-
-    std::cerr << "casting is done" << std::endl;
-
-    T* a = sycl::malloc_device<T>(n, exec_q);
-    T* b = sycl::malloc_device<T>(n, exec_q);
-    T* y = sycl::malloc_device<T>(n, exec_q);
-
-    std::cerr << "malloc is done" << std::endl;
-
-    exec_q.copy(_a, a, n).wait();
-    exec_q.copy(_b, b, n).wait();
-    exec_q.copy(_y, y, n).wait();
-
-    std::cerr << "copy is done" << std::endl;
-
-    sycl::event ev = mkl_vm::div(exec_q,
+    return mkl_vm::div(exec_q,
                        n, // number of elements to be calculated
                        a, // pointer `a` containing 1st input vector of size n
                        b, // pointer `b` containing 2nd input vector of size n
                        y, // pointer `y` to the output vector of size n
                        depends);
-    ev.wait();
-
-    std::cerr << "div is done" << std::endl;
-
-    exec_q.copy(y, _y, n).wait();
-
-    std::cerr << "copy is done" << std::endl;
-
-    sycl::free(a, exec_q);
-    sycl::free(b, exec_q);
-    sycl::free(y, exec_q);
-
-    std::cerr << "leaving div_impl" << std::endl;
-    return sycl::event();
 }
 
 std::pair<sycl::event, sycl::event> div(sycl::queue exec_q,
@@ -205,20 +175,9 @@ std::pair<sycl::event, sycl::event> div(sycl::queue exec_q,
         throw py::value_error("No div implementation defined");
     }
     sycl::event sum_ev = div_fn(exec_q, src_nelems, src1_data, src2_data, dst_data, depends);
-    // sum_ev.wait();
-
-    // int* dummy = sycl::malloc_device<int>(1, exec_q);
-    // sycl::event cleanup_ev = exec_q.submit([&](sycl::handler& cgh) {
-    //     // cgh.depends_on(sum_ev);
-    //     auto ctx = exec_q.get_context();
-    //     cgh.host_task([dummy, ctx]() {
-    //         // dummy host task to pass into keep_args_alive
-    //         sycl::free(dummy, ctx);
-    //     });
-    // });
-
-    // sycl::event ht_ev = dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, {sum_ev});
-    // return std::make_pair(ht_ev, sum_ev);
+
+    sycl::event ht_ev = dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, {sum_ev});
+    return std::make_pair(ht_ev, sum_ev);
     return std::make_pair(sycl::event(), sycl::event());
 }
 
@@ -227,6 +186,7 @@ bool can_call_div(sycl::queue exec_q,
                   dpctl::tensor::usm_ndarray src2,
                   dpctl::tensor::usm_ndarray dst)
 {
+#if INTEL_MKL_VERSION >= 20230002
     // check type_nums
     int src1_typenum = src1.get_typenum();
     int src2_typenum = src2.get_typenum();
@@ -325,6 +285,16 @@ bool can_call_div(sycl::queue exec_q,
         return false;
     }
     return true;
+#else
+    // In OneMKL 2023.1.0 the call of oneapi::mkl::vm::div() is going to dead lock
+    // inside ~usm_wrapper_to_host()->{...; q_->wait_and_throw(); ...}
+
+    (void)exec_q;
+    (void)src1;
+    (void)src2;
+    (void)dst;
+    return false;
+#endif // INTEL_MKL_VERSION >= 20230002
 }
 
 template <typename fnT, typename T>