From f7eee1ebe8d5a0a2a17eb26c04bf3e05ee753fe2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 24 Aug 2023 13:28:11 -0500 Subject: [PATCH] Conversion from raw to multi_ptr should be done with address_space_cast We used `sycl::multi_ptr` constructor instead of `sycl::address_space_cast` previously, and change in https://github.com/KhronosGroup/SYCL-Docs/pull/432 introduced `sycl::access::decorated::legacy` as the default which is deprecated in SYCL 2020 standard which highlighted the problem. In using `sycl::address_space_cast` we specify `sycl::access::decorated::yes`. --- .../include/kernels/copy_and_cast.hpp | 21 ++- .../kernels/elementwise_functions/common.hpp | 177 +++++++++--------- .../elementwise_functions/common_inplace.hpp | 65 ++++--- .../libtensor/include/kernels/where.hpp | 30 +-- 4 files changed, 148 insertions(+), 145 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index f1e63ccc60..33969ec24a 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -244,25 +244,26 @@ class ContigCopyFunctor if (base + n_vecs * vec_sz * sgSize < nelems && sgSize == max_sgSize) { - using src_ptrT = - sycl::multi_ptr; - using dst_ptrT = - sycl::multi_ptr; sycl::vec src_vec; sycl::vec dst_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - src_vec = - sg.load(src_ptrT(&src_p[base + it * sgSize])); + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &src_p[base + it * sgSize]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>( + &dst_p[base + it * sgSize]); + + src_vec = sg.load(src_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; k++) { dst_vec[k] = fn(src_vec[k]); } - 
sg.store(dst_ptrT(&dst_p[base + it * sgSize]), - dst_vec); + sg.store(dst_multi_ptr, dst_vec); } } else { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 855d5479c1..797a7f2534 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -65,9 +65,6 @@ struct UnaryContigFunctor if constexpr (UnaryOperatorT::is_constant::value) { // value of operator is known to be a known constant constexpr resT const_val = UnaryOperatorT::constant_value; - using out_ptrT = - sycl::multi_ptr; auto sg = ndit.get_sub_group(); std::uint8_t sgSize = sg.get_local_range()[0]; @@ -80,8 +77,11 @@ struct UnaryContigFunctor sycl::vec res_vec(const_val); #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + sg.store(out_multi_ptr, res_vec); } } else { @@ -94,13 +94,6 @@ struct UnaryContigFunctor else if constexpr (UnaryOperatorT::supports_sg_loadstore::value && UnaryOperatorT::supports_vec::value) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; - auto sg = ndit.get_sub_group(); std::uint16_t sgSize = sg.get_local_range()[0]; std::uint16_t max_sgSize = sg.get_max_local_range()[0]; @@ -113,10 +106,16 @@ struct UnaryContigFunctor #pragma unroll for (std::uint16_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - x = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + 
sycl::access::decorated::yes>(&out[base + it * sgSize]); + + x = sg.load(in_multi_ptr); sycl::vec res_vec = op(x); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -141,23 +140,23 @@ struct UnaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (maxsgSize == sgSize)) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg_vec = sg.load(in_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { arg_vec[k] = op(arg_vec[k]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - arg_vec); + sg.store(out_multi_ptr, arg_vec); } } else { @@ -179,24 +178,24 @@ struct UnaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (maxsgSize == sgSize)) { - using in_ptrT = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = sg.load(in_ptrT(&in[base + it * sgSize])); + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg_vec = sg.load(in_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { res_vec[k] = op(arg_vec[k]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, 
res_vec); } } else { @@ -365,28 +364,26 @@ struct BinaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg1_vec = - sg.load(in_ptrT1(&in1[base + it * sgSize])); - arg2_vec = - sg.load(in_ptrT2(&in2[base + it * sgSize])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[base + it * sgSize]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[base + it * sgSize]); + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg1_vec = sg.load(in1_multi_ptr); + arg2_vec = sg.load(in2_multi_ptr); res_vec = op(arg1_vec, arg2_vec); - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -407,32 +404,30 @@ struct BinaryContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using out_ptrT = - sycl::multi_ptr; sycl::vec arg1_vec; sycl::vec arg2_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg1_vec = - sg.load(in_ptrT1(&in1[base + it * sgSize])); - arg2_vec = - sg.load(in_ptrT2(&in2[base + it * sgSize])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in1[base + it * sgSize]); + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&in2[base + it * sgSize]); + 
auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&out[base + it * sgSize]); + + arg1_vec = sg.load(in1_multi_ptr); + arg2_vec = sg.load(in2_multi_ptr); #pragma unroll for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { res_vec[vec_id] = op(arg1_vec[vec_id], arg2_vec[vec_id]); } - sg.store(out_ptrT(&out[base + it * sgSize]), - res_vec); + sg.store(out_multi_ptr, res_vec); } } else { @@ -530,22 +525,24 @@ struct BinaryContigMatrixContigRowBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; - - const argT1 mat_el = sg.load(in_ptrT1(&mat[base])); - const argT2 vec_el = sg.load(in_ptrT2(&padded_vec[base % n1])); + auto in1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT1 mat_el = sg.load(in1_multi_ptr); + const argT2 vec_el = sg.load(in2_multi_ptr); resT res_el = op(mat_el, vec_el); - sg.store(res_ptrT(&res[base]), res_el); + sg.store(out_multi_ptr, res_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; @@ -592,22 +589,24 @@ struct BinaryContigRowContigMatrixBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT1 = - sycl::multi_ptr; - using in_ptrT2 = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; - - const argT2 mat_el = sg.load(in_ptrT2(&mat[base])); - const argT1 vec_el = sg.load(in_ptrT1(&padded_vec[base % n1])); + auto in1_multi_ptr = sycl::address_space_cast< + 
sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto in2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&res[base]); + + const argT2 mat_el = sg.load(in2_multi_ptr); + const argT1 vec_el = sg.load(in1_multi_ptr); resT res_el = op(vec_el, mat_el); - sg.store(res_ptrT(&res[base]), res_el); + sg.store(out_multi_ptr, res_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index a41029b27c..505a40acc5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -76,24 +76,24 @@ struct BinaryInplaceContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using rhs_ptrT = - sycl::multi_ptr; - using lhs_ptrT = - sycl::multi_ptr; + sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = - sg.load(rhs_ptrT(&rhs[base + it * sgSize])); - res_vec = - sg.load(lhs_ptrT(&lhs[base + it * sgSize])); + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[base + it * sgSize]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[base + it * sgSize]); + + arg_vec = sg.load(rhs_multi_ptr); + res_vec = sg.load(lhs_multi_ptr); op(res_vec, arg_vec); - sg.store(lhs_ptrT(&lhs[base + it * sgSize]), - res_vec); + + sg.store(lhs_multi_ptr, res_vec); } } else { @@ 
-115,27 +115,25 @@ struct BinaryInplaceContigFunctor if ((base + n_vecs * vec_sz * sgSize < nelems_) && (sgSize == maxsgSize)) { - using rhs_ptrT = - sycl::multi_ptr; - using lhs_ptrT = - sycl::multi_ptr; sycl::vec arg_vec; sycl::vec res_vec; #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { - arg_vec = - sg.load(rhs_ptrT(&rhs[base + it * sgSize])); - res_vec = - sg.load(lhs_ptrT(&lhs[base + it * sgSize])); + auto rhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&rhs[base + it * sgSize]); + auto lhs_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&lhs[base + it * sgSize]); + + arg_vec = sg.load(rhs_multi_ptr); + res_vec = sg.load(lhs_multi_ptr); #pragma unroll for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { op(res_vec[vec_id], arg_vec[vec_id]); } - sg.store(lhs_ptrT(&lhs[base + it * sgSize]), - res_vec); + sg.store(lhs_multi_ptr, res_vec); } } else { @@ -223,19 +221,20 @@ struct BinaryInplaceRowMatrixBroadcastingFunctor size_t base = gid - sg.get_local_id()[0]; if (base + sgSize < n_elems) { - using in_ptrT = - sycl::multi_ptr; - using res_ptrT = - sycl::multi_ptr; + auto in_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&padded_vec[base % n1]); + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&mat[base]); - const argT vec_el = sg.load(in_ptrT(&padded_vec[base % n1])); - resT mat_el = sg.load(res_ptrT(&mat[base])); + const argT vec_el = sg.load(in_multi_ptr); + resT mat_el = sg.load(out_multi_ptr); op(mat_el, vec_el); - sg.store(res_ptrT(&mat[base]), mat_el); + sg.store(out_multi_ptr, mat_el); } else { for (size_t k = base + sg.get_local_id()[0]; k < n_elems; diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp 
b/dpctl/tensor/libtensor/include/kernels/where.hpp index 67ce2ca1f0..9da5466dbe 100644 --- a/dpctl/tensor/libtensor/include/kernels/where.hpp +++ b/dpctl/tensor/libtensor/include/kernels/where.hpp @@ -100,15 +100,6 @@ class WhereContigFunctor if (base + n_vecs * vec_sz * sgSize < nelems && sgSize == max_sgSize) { - using dst_ptrT = - sycl::multi_ptr; - using x_ptrT = - sycl::multi_ptr; - using cond_ptrT = - sycl::multi_ptr; sycl::vec dst_vec; sycl::vec x1_vec; sycl::vec x2_vec; @@ -117,14 +108,27 @@ class WhereContigFunctor #pragma unroll for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { auto idx = base + it * sgSize; - x1_vec = sg.load(x_ptrT(&x1_p[idx])); - x2_vec = sg.load(x_ptrT(&x2_p[idx])); - cond_vec = sg.load(cond_ptrT(&cond_p[idx])); + auto x1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x1_p[idx]); + auto x2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x2_p[idx]); + auto cond_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cond_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + x1_vec = sg.load(x1_multi_ptr); + x2_vec = sg.load(x2_multi_ptr); + cond_vec = sg.load(cond_multi_ptr); #pragma unroll for (std::uint8_t k = 0; k < vec_sz; ++k) { dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k]; } - sg.store(dst_ptrT(&dst_p[idx]), dst_vec); + sg.store(dst_multi_ptr, dst_vec); } } else {