Skip to content

Avoid needless copies #1421

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged: 9 commits were merged on Sep 26, 2023.
17 changes: 9 additions & 8 deletions dpctl/apis/include/dpctl4pybind11.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <complex>
#include <memory>
#include <pybind11/pybind11.h>
#include <utility>
#include <vector>

namespace py = pybind11;
Expand Down Expand Up @@ -369,19 +370,19 @@ class dpctl_capi
sycl::queue q_{};
PySyclQueueObject *py_q_tmp =
SyclQueue_Make(reinterpret_cast<DPCTLSyclQueueRef>(&q_));
py::object py_sycl_queue = py::reinterpret_steal<py::object>(
const py::object &py_sycl_queue = py::reinterpret_steal<py::object>(
reinterpret_cast<PyObject *>(py_q_tmp));

default_sycl_queue_ = std::shared_ptr<py::object>(
new py::object(py_sycl_queue), Deleter{});

py::module_ mod_memory = py::module_::import("dpctl.memory");
py::object py_as_usm_memory = mod_memory.attr("as_usm_memory");
const py::object &py_as_usm_memory = mod_memory.attr("as_usm_memory");
as_usm_memory_ = std::shared_ptr<py::object>(
new py::object{py_as_usm_memory}, Deleter{});

auto mem_kl = mod_memory.attr("MemoryUSMHost");
py::object py_default_usm_memory =
const py::object &py_default_usm_memory =
mem_kl(1, py::arg("queue") = py_sycl_queue);
default_usm_memory_ = std::shared_ptr<py::object>(
new py::object{py_default_usm_memory}, Deleter{});
Expand All @@ -390,7 +391,7 @@ class dpctl_capi
py::module_::import("dpctl.tensor._usmarray");
auto tensor_kl = mod_usmarray.attr("usm_ndarray");

py::object py_default_usm_ndarray =
const py::object &py_default_usm_ndarray =
tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"),
py::arg("buffer") = py_default_usm_memory);

Expand Down Expand Up @@ -1032,7 +1033,7 @@ namespace utils
{

template <std::size_t num>
sycl::event keep_args_alive(sycl::queue q,
sycl::event keep_args_alive(sycl::queue &q,
const py::object (&py_objs)[num],
const std::vector<sycl::event> &depends = {})
{
Expand All @@ -1043,7 +1044,7 @@ sycl::event keep_args_alive(sycl::queue q,
shp_arr[i] = std::make_shared<py::handle>(py_objs[i]);
shp_arr[i]->inc_ref();
}
cgh.host_task([=]() {
cgh.host_task([shp_arr = std::move(shp_arr)]() {
py::gil_scoped_acquire acquire;

for (std::size_t i = 0; i < num; ++i) {
Expand All @@ -1058,7 +1059,7 @@ sycl::event keep_args_alive(sycl::queue q,
/*! @brief Check if all allocation queues are the same as the
execution queue */
template <std::size_t num>
bool queues_are_compatible(sycl::queue exec_q,
bool queues_are_compatible(const sycl::queue &exec_q,
const sycl::queue (&alloc_qs)[num])
{
for (std::size_t i = 0; i < num; ++i) {
Expand All @@ -1073,7 +1074,7 @@ bool queues_are_compatible(sycl::queue exec_q,
/*! @brief Check if all allocation queues of usm_ndarays are the same as
the execution queue */
template <std::size_t num>
bool queues_are_compatible(sycl::queue exec_q,
bool queues_are_compatible(const sycl::queue &exec_q,
const ::dpctl::tensor::usm_ndarray (&arrs)[num])
{
for (std::size_t i = 0; i < num; ++i) {
Expand Down
35 changes: 18 additions & 17 deletions dpctl/tensor/libtensor/include/kernels/accumulators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ template <typename inputT,
size_t n_wi,
typename IndexerT,
typename TransformerT>
sycl::event inclusive_scan_rec(sycl::queue exec_q,
sycl::event inclusive_scan_rec(sycl::queue &exec_q,
size_t n_elems,
size_t wg_size,
const inputT *input,
Expand All @@ -116,19 +116,20 @@ sycl::event inclusive_scan_rec(sycl::queue exec_q,
{
size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size);

sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
const sycl::event &inc_scan_phase1_ev =
exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

using slmT = sycl::local_accessor<size_t, 1>;
using slmT = sycl::local_accessor<size_t, 1>;

auto lws = sycl::range<1>(wg_size);
auto gws = sycl::range<1>(n_groups * wg_size);
auto lws = sycl::range<1>(wg_size);
auto gws = sycl::range<1>(n_groups * wg_size);

slmT slm_iscan_tmp(lws, cgh);
slmT slm_iscan_tmp(lws, cgh);

cgh.parallel_for<class inclusive_scan_rec_local_scan_krn<
inputT, outputT, n_wi, IndexerT, decltype(transformer)>>(
sycl::nd_range<1>(gws, lws), [=](sycl::nd_item<1> it)
sycl::nd_range<1>(gws, lws), [=, slm_iscan_tmp = std::move(slm_iscan_tmp)](sycl::nd_item<1> it)
{
auto chunk_gid = it.get_global_id(0);
auto lid = it.get_local_id(0);
Expand Down Expand Up @@ -172,7 +173,7 @@ sycl::event inclusive_scan_rec(sycl::queue exec_q,
output[i + m_wi] = local_isum[m_wi];
}
});
});
});

sycl::event out_event = inc_scan_phase1_ev;
if (n_groups > 1) {
Expand Down Expand Up @@ -203,25 +204,25 @@ sycl::event inclusive_scan_rec(sycl::queue exec_q,

sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(e3);
auto ctx = exec_q.get_context();
const auto &ctx = exec_q.get_context();
cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); });
});

out_event = e4;
out_event = std::move(e4);
}

return out_event;
}

typedef size_t (*accumulate_contig_impl_fn_ptr_t)(
sycl::queue,
sycl::queue &,
size_t,
const char *,
char *,
std::vector<sycl::event> const &);

template <typename maskT, typename cumsumT, typename transformerT>
size_t accumulate_contig_impl(sycl::queue q,
size_t accumulate_contig_impl(sycl::queue &q,
size_t n_elems,
const char *mask,
char *cumsum,
Expand All @@ -235,7 +236,7 @@ size_t accumulate_contig_impl(sycl::queue q,
NoOpIndexer flat_indexer{};
transformerT non_zero_indicator{};

sycl::event comp_ev =
const sycl::event &comp_ev =
inclusive_scan_rec<maskT, cumsumT, n_wi, decltype(flat_indexer),
decltype(non_zero_indicator)>(
q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1,
Expand Down Expand Up @@ -296,7 +297,7 @@ template <typename fnT, typename T> struct Cumsum1DContigFactory
};

typedef size_t (*accumulate_strided_impl_fn_ptr_t)(
sycl::queue,
sycl::queue &,
size_t,
const char *,
int,
Expand All @@ -305,7 +306,7 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)(
std::vector<sycl::event> const &);

template <typename maskT, typename cumsumT, typename transformerT>
size_t accumulate_strided_impl(sycl::queue q,
size_t accumulate_strided_impl(sycl::queue &q,
size_t n_elems,
const char *mask,
int nd,
Expand All @@ -321,7 +322,7 @@ size_t accumulate_strided_impl(sycl::queue q,
StridedIndexer strided_indexer{nd, 0, shape_strides};
transformerT non_zero_indicator{};

sycl::event comp_ev =
const sycl::event &comp_ev =
inclusive_scan_rec<maskT, cumsumT, n_wi, decltype(strided_indexer),
decltype(non_zero_indicator)>(
q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ template <typename OrthoIndexerT,
class masked_extract_all_slices_strided_impl_krn;

typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
sycl::queue,
sycl::queue &,
py::ssize_t,
const char *,
const char *,
Expand All @@ -211,7 +211,7 @@ typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(

template <typename dataT, typename indT>
sycl::event masked_extract_all_slices_strided_impl(
sycl::queue exec_q,
sycl::queue &exec_q,
py::ssize_t iteration_size,
const char *src_p,
const char *cumsum_p,
Expand Down Expand Up @@ -253,7 +253,7 @@ sycl::event masked_extract_all_slices_strided_impl(
}

typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)(
sycl::queue,
sycl::queue &,
py::ssize_t,
py::ssize_t,
const char *,
Expand All @@ -278,7 +278,7 @@ class masked_extract_some_slices_strided_impl_krn;

template <typename dataT, typename indT>
sycl::event masked_extract_some_slices_strided_impl(
sycl::queue exec_q,
sycl::queue &exec_q,
py::ssize_t orthog_nelems,
py::ssize_t masked_nelems,
const char *src_p,
Expand Down Expand Up @@ -380,7 +380,7 @@ template <typename OrthoIndexerT,
class masked_place_all_slices_strided_impl_krn;

typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)(
sycl::queue,
sycl::queue &,
py::ssize_t,
char *,
const char *,
Expand All @@ -393,7 +393,7 @@ typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)(

template <typename dataT, typename indT>
sycl::event masked_place_all_slices_strided_impl(
sycl::queue exec_q,
sycl::queue &exec_q,
py::ssize_t iteration_size,
char *dst_p,
const char *cumsum_p,
Expand Down Expand Up @@ -430,7 +430,7 @@ sycl::event masked_place_all_slices_strided_impl(
}

typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)(
sycl::queue,
sycl::queue &,
py::ssize_t,
py::ssize_t,
char *,
Expand All @@ -455,7 +455,7 @@ class masked_place_some_slices_strided_impl_krn;

template <typename dataT, typename indT>
sycl::event masked_place_some_slices_strided_impl(
sycl::queue exec_q,
sycl::queue &exec_q,
py::ssize_t orthog_nelems,
py::ssize_t masked_nelems,
char *dst_p,
Expand Down Expand Up @@ -549,7 +549,7 @@ struct MaskPlaceSomeSlicesStridedFactoryForInt64
template <typename T1, typename T2> class non_zero_indexes_krn;

typedef sycl::event (*non_zero_indexes_fn_ptr_t)(
sycl::queue,
sycl::queue &,
py::ssize_t,
py::ssize_t,
int,
Expand All @@ -559,7 +559,7 @@ typedef sycl::event (*non_zero_indexes_fn_ptr_t)(
std::vector<sycl::event> const &);

template <typename indT1, typename indT2>
sycl::event non_zero_indexes_impl(sycl::queue exec_q,
sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
py::ssize_t iter_size,
py::ssize_t nz_elems,
int nd,
Expand Down
10 changes: 5 additions & 5 deletions dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ struct ContigBooleanReduction
};

typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)(
sycl::queue,
sycl::queue &,
size_t,
size_t,
const char *,
Expand All @@ -264,7 +264,7 @@ using dpctl::tensor::sycl_utils::choose_workgroup_size;

template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
sycl::event
boolean_reduction_axis1_contig_impl(sycl::queue exec_q,
boolean_reduction_axis1_contig_impl(sycl::queue &exec_q,
size_t iter_nelems,
size_t reduction_nelems,
const char *arg_cp,
Expand Down Expand Up @@ -463,7 +463,7 @@ class boolean_reduction_axis0_contig_krn;

template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
sycl::event
boolean_reduction_axis0_contig_impl(sycl::queue exec_q,
boolean_reduction_axis0_contig_impl(sycl::queue &exec_q,
size_t iter_nelems,
size_t reduction_nelems,
const char *arg_cp,
Expand Down Expand Up @@ -572,7 +572,7 @@ template <typename T1, typename T2, typename T3, typename T4, typename T5>
class boolean_reduction_seq_strided_krn;

typedef sycl::event (*boolean_reduction_strided_impl_fn_ptr)(
sycl::queue,
sycl::queue &,
size_t,
size_t,
const char *,
Expand All @@ -588,7 +588,7 @@ typedef sycl::event (*boolean_reduction_strided_impl_fn_ptr)(

template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
sycl::event
boolean_reduction_strided_impl(sycl::queue exec_q,
boolean_reduction_strided_impl(sycl::queue &exec_q,
size_t iter_nelems,
size_t reduction_nelems,
const char *arg_cp,
Expand Down
Loading