Skip to content

[SYCL][ESIMD] Add compile time properties overload of USM block store #11641

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,39 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t<N> pred,
}
#endif // __SYCL_DEVICE_ONLY__

/// USM pointer scatter/store.
/// Supported platforms: DG2, PVC
///
/// Scatters (stores) elements to the given addresses.
///
/// @tparam Ty is the element type.
/// @tparam L1H is the L1 cache hint.
/// @tparam L2H is the L2 cache hint.
/// @tparam AddressScale is the address scale.
/// @tparam ImmOffset is the immediate offset added to each address.
/// @tparam DS is the data size.
/// @tparam VS is the number of elements to store per address.
/// @tparam _Transposed indicates if the data is transposed during the
/// transfer.
/// @tparam N is the SIMD size of operation (the number of addresses to
/// access).
/// @param pred is the per-channel predicate; disabled channels do not store.
/// @param addrs is the store addresses.
/// @param vals is the values to store.
template <typename Ty, __ESIMD_NS::cache_hint L1H, __ESIMD_NS::cache_hint L2H,
          uint16_t AddressScale, int ImmOffset, __ESIMD_DNS::lsc_data_size DS,
          __ESIMD_DNS::lsc_vector_size VS,
          __ESIMD_DNS::lsc_data_order _Transposed, int N>
__ESIMD_INTRIN void __esimd_lsc_store_stateless(
    __ESIMD_DNS::simd_mask_storage_t<N> pred,
    __ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
    __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_DNS::to_int<VS>()> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else  // __SYCL_DEVICE_ONLY__
{
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__

// \brief Raw sends.
//
// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT).
Expand Down
337 changes: 336 additions & 1 deletion sycl/include/sycl/ext/intel/esimd/memory.hpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -282,39 +282,6 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti(
}
#endif // __SYCL_DEVICE_ONLY__

/// USM pointer scatter/store.
/// Supported platforms: DG2, PVC
///
/// Scatters (stores) elements to the given addresses.
///
/// @tparam Ty is the element type.
/// @tparam L1H is the L1 cache hint.
/// @tparam L3H is the L3 cache hint.
/// @tparam AddressScale is the address scale.
/// @tparam ImmOffset is the immediate offset added to each address.
/// @tparam DS is the data size.
/// @tparam VS is the number of elements to store per address.
/// @tparam _Transposed indicates if the data is transposed during the
/// transfer.
/// @tparam N is the SIMD size of operation (the number of addresses to
/// access).
/// @param pred is the per-channel predicate; disabled channels do not store.
/// @param addrs is the store addresses.
/// @param vals is the values to store.
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
          uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
          __ESIMD_EDNS::lsc_vector_size VS,
          __ESIMD_EDNS::lsc_data_order _Transposed, int N>
__ESIMD_INTRIN void __esimd_lsc_store_stateless(
    __ESIMD_DNS::simd_mask_storage_t<N> pred,
    __ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
    __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else  // __SYCL_DEVICE_ONLY__
{
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__

/// 2D USM pointer block load.
/// Supported platforms: PVC
///
Expand Down
60 changes: 3 additions & 57 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1765,7 +1765,7 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
/// @tparam DS is the data size.
/// @tparam DS is the data size (unused/obsolete).
/// @tparam L1H is L1 cache hint.
/// @tparam L3H is L3 cache hint.
/// @param p is the base pointer.
Expand All @@ -1781,62 +1781,8 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
__ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
__ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr auto Alignment =
FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
static_assert(
(Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
(Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
"Incorrect alignment for the data type");

// Prepare template arguments for the call of intrinsic.
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 ||
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Conversion data types are not supported");
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);

constexpr int SmallIntFactor32Bit =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
"Number of elements is not supported by Transposed store");

constexpr bool Use64BitData =
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
(sizeof(T) == 8 ||
(DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
(NElts * sizeof(T)) % 8 == 0));
constexpr int SmallIntFactor64Bit =
(_DS == lsc_data_size::u16)
? 4
: (_DS == lsc_data_size::u8 ? 8
: (_DS == lsc_data_size::u32 ? 2 : 1));
constexpr int SmallIntFactor =
Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
constexpr int FactoredNElts = NElts / SmallIntFactor;
constexpr lsc_data_size ActualDS = Use64BitData
? __ESIMD_ENS::lsc_data_size::u64
: __ESIMD_ENS::lsc_data_size::u32;

detail::check_lsc_vector_size<FactoredNElts>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<FactoredNElts>();

using StoreType = __ESIMD_DNS::__raw_t<
std::conditional_t<SmallIntFactor == 1, T,
std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;

__esimd_lsc_store_stateless<StoreType, L1H, L3H, _AddressScale, _ImmOffset,
ActualDS, _VS, _Transposed, N>(
pred.data(), Addrs.data(),
sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
vals.data()));
return __ESIMD_DNS::block_store_impl<T, NElts, L1H, L3H>(p, vals, pred,
flags);
}

/// A variation of lsc_block_store without predicate parameter to simplify
Expand Down
40 changes: 1 addition & 39 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,49 +13,11 @@
#include <iostream>

#include "../../esimd_test_utils.hpp"
#include "common.hpp"

using namespace sycl;
using namespace sycl::ext::intel::esimd;

// Extracts the cache hint stored under property 'Key' from the compile-time
// properties list; yields cache_hint::none when the property is absent.
template <typename Key, typename PropertiesT>
constexpr cache_hint getCacheHint(PropertiesT) {
  if constexpr (!PropertiesT::template has_property<Key>()) {
    return cache_hint::none;
  } else {
    constexpr auto HintProp = PropertiesT::template get_property<Key>();
    return HintProp.hint;
  }
}

// Returns the alignment requested via the esimd::alignment property, or
// 'DefaultAlignment' when the properties list carries no alignment property.
template <typename PropertiesT>
constexpr size_t getAlignment(PropertiesT, size_t DefaultAlignment) {
  using AlignKey = sycl::ext::intel::esimd::alignment_key;
  if constexpr (!PropertiesT::template has_property<AlignKey>()) {
    return DefaultAlignment;
  } else {
    constexpr auto AlignProp = PropertiesT::template get_property<AlignKey>();
    return AlignProp.value;
  }
}

// Computes the alignment a test buffer must have for a block operation on
// N elements of T, given the compile-time properties and whether a mask is
// used, and checks that any explicitly requested alignment is big enough.
template <typename T, uint16_t N, bool UseMask, typename PropertiesT>
constexpr size_t getAlignment(PropertiesT Props) {
  constexpr cache_hint CacheL1 =
      getCacheHint<sycl::ext::intel::esimd::cache_hint_L1_key>(Props);
  constexpr cache_hint CacheL2 =
      getCacheHint<sycl::ext::intel::esimd::cache_hint_L2_key>(Props);
  // Cache hints or a mask imply the PVC-only code path.
  constexpr bool NeedsPVC =
      UseMask || CacheL1 != cache_hint::none || CacheL2 != cache_hint::none;

  // Transfers over 256 bytes on the PVC path need 8-byte alignment, the
  // rest of the PVC path needs 4 bytes; otherwise element size suffices.
  constexpr bool BigPVCTransfer = NeedsPVC && (N * sizeof(T) > 256);
  constexpr size_t MinAlign =
      BigPVCTransfer ? 8 : (NeedsPVC ? 4 : sizeof(T));
  constexpr size_t Requested = getAlignment(Props, MinAlign);
  static_assert(Requested >= MinAlign, "Too small alignment");
  return Requested;
}

// Returns true iff verification is passed.
template <typename T>
bool verify(const T *In, const T *Out, size_t Size, int N,
Expand Down
178 changes: 178 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
//==------- block_store.hpp - DPC++ ESIMD on-device test ----------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===-------------------------------------------------------------------===//

#include "common.hpp"

using namespace sycl;
using namespace sycl::ext::intel::esimd;

// Tests block_store() to USM memory: stores known values (optionally with a
// mask and/or a compile-time properties argument), incrementally updates them
// via block_load()/block_store() round-trips, then verifies the result on the
// host.
//
// @tparam T is the element type.
// @tparam N is the number of elements per store.
// @tparam UseMask enables the simd_mask overloads.
// @tparam UseProperties passes the compile-time properties argument.
// @param Q is the queue to run on.
// @param Groups/Threads define the nd_range.
// @param StoreProperties is the compile-time properties list.
// @return true iff verification passed.
template <typename T, uint16_t N, bool UseMask, bool UseProperties,
          typename StorePropertiesT>
bool testUSM(queue Q, uint32_t Groups, uint32_t Threads,
             StorePropertiesT StoreProperties) {

  // uint32_t: Groups * Threads * N may exceed 65535 for big configurations.
  uint32_t Size = Groups * Threads * N;
  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;

  std::cout << "USM case: T=" << esimd_test::type_name<T>() << ",N=" << N
            << ",UseMask=" << UseMask << ",UseProperties=" << UseProperties
            << std::endl;

  sycl::range<1> GlobalRange{Groups};
  sycl::range<1> LocalRange{Threads};
  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
  constexpr size_t Alignment = getAlignment<T, N, UseMask>(StoreProperties);
  T *Out = sycl::aligned_alloc_shared<T>(Alignment, Size, Q);
  // Pre-fill with a random value so masked-off (skipped) stores are
  // detectable during verification.
  T Out_val = esimd_test::getRandomValue<T>();
  for (uint32_t i = 0; i < Size; i++)
    Out[i] = Out_val;

  try {
    Q.submit([&](handler &cgh) {
       cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
         uint16_t GlobalID = ndi.get_global_id(0);
         uint32_t ElemOff = GlobalID * N;
         // TODO: these 2 lines work-around the problem with scalar
         // conversions to bfloat16. It could be just: "simd<T, N>
         // PassThru(ElemOffset, 1);"
         simd<uint32_t, N> PassThruInt(ElemOff, 1);
         simd<T, N> Vals = PassThruInt;
         if constexpr (UseMask) {
           // Even work-items get an all-on mask, odd ones all-off.
           // NOTE: '% 1' (always 0) was a bug here; '& 0x1' is intended.
           simd_mask<1> Mask = (GlobalID + 1) & 0x1;
           block_store(Out + ElemOff, Vals, Mask, StorePropertiesT{});
           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 1;
           block_store(Out, ElemOff * sizeof(T), Vals, Mask,
                       StorePropertiesT{});
           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 2;
           auto View = Vals.template select<N, 1>();
           block_store<T, N>(Out, ElemOff * sizeof(T), View, Mask,
                             StorePropertiesT{});
           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 3;
           View = Vals.template select<N, 1>();
           block_store<T, N>(Out + ElemOff, View, Mask, StorePropertiesT{});
         } else {
           if constexpr (UseProperties)
             block_store(Out + ElemOff, Vals, StorePropertiesT{});
           else
             block_store(Out + ElemOff, Vals);

           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 1;
           if constexpr (UseProperties)
             block_store(Out, ElemOff * sizeof(T), Vals, StorePropertiesT{});
           else
             block_store(Out, ElemOff * sizeof(T), Vals);

           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 2;
           auto View = Vals.template select<N, 1>();
           if constexpr (UseProperties)
             block_store<T, N>(Out, ElemOff * sizeof(T), View,
                               StorePropertiesT{});
           else
             block_store<T, N>(Out, ElemOff * sizeof(T), View);

           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 3;
           View = Vals.template select<N, 1>();
           if constexpr (UseProperties)
             block_store<T, N>(Out + ElemOff, View, StorePropertiesT{});
           else
             block_store<T, N>(Out + ElemOff, View);
         }
       });
     }).wait();
  } catch (sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    sycl::free(Out, Q);
    return false;
  }

  bool Passed = true;

  for (uint32_t i = 0; i < Size; i++) {
    // Must mirror the mask computed in the kernel for the owning work-item.
    // NOTE: '% 1' (always 0) was a bug here; '& 0x1' is intended.
    bool IsMaskSet = (i / N + 1) & 0x1;
    Tuint Expected = sycl::bit_cast<Tuint>(Out_val);
    if (!UseMask || IsMaskSet)
      // Four chained stores add 1 + 2 + 3 on top of the initial ElemOff + i.
      Expected = sycl::bit_cast<Tuint>((T)(i + 6));
    Tuint Computed = sycl::bit_cast<Tuint>(Out[i]);
    if (Computed != Expected) {
      Passed = false;
      std::cout << "Out[" << i << "] = " << std::to_string(Computed) << " vs "
                << std::to_string(Expected) << std::endl;
    }
  }

  sycl::free(Out, Q);

  return Passed;
}

// Driver: runs the USM block_store() test matrix for element type T.
// When TestPVCFeatures is set, additionally runs the variants that require
// PVC hardware (cache hints and masked stores).
template <typename T, bool TestPVCFeatures> bool test_block_store(queue Q) {
  constexpr bool CheckMask = true;
  constexpr bool CheckProperties = true;
  properties AlignOnlyProps{alignment<sizeof(T)>};

  bool Passed = true;

  // Variants of block_store() available on both Gen12 and PVC.
  Passed &= testUSM<T, 1, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 2, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);
  Passed &= testUSM<T, 3, !CheckMask, CheckProperties>(Q, 2, 8, AlignOnlyProps);
  Passed &= testUSM<T, 4, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 8, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 16, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 32, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  // Non-power-of-2 simd sizes are intentionally exercised - they must work.
  Passed &=
      testUSM<T, 33, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  // TODO: Enable after failure fixed
  // Passed &=
  //    testUSM<T, 67, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);
  // Big simd sizes are intentionally exercised - they must work.
  Passed &=
      testUSM<T, 128, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 256, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);

  // block_store() without the compile-time properties argument.
  Passed &=
      testUSM<T, 16, !CheckMask, !CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 32, !CheckMask, !CheckProperties>(Q, 2, 4, AlignOnlyProps);

  if constexpr (TestPVCFeatures) {
    // Cache hints restrict the tests to PVC. The PVC variant currently
    // requires a power-of-two element count and at most 512 bytes per call.
    properties PVCProps{cache_hint_L1<cache_hint::write_back>,
                        cache_hint_L2<cache_hint::write_back>, alignment<16>};

    if constexpr (sizeof(T) >= 4) // only d/q words are supported now
      Passed &= testUSM<T, 1, !CheckMask, CheckProperties>(Q, 2, 4, PVCProps);
    if constexpr (sizeof(T) >= 2) // only d/q words are supported now
      Passed &= testUSM<T, 2, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 4, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 8, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 16, CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 32, !CheckMask, CheckProperties>(Q, 2, 4, PVCProps);
    Passed &= testUSM<T, 64, !CheckMask, CheckProperties>(Q, 7, 1, PVCProps);
    if constexpr (128 * sizeof(T) <= 512)
      Passed &= testUSM<T, 128, CheckMask, CheckProperties>(Q, 1, 4, PVCProps);
    if constexpr (256 * sizeof(T) <= 512)
      Passed &= testUSM<T, 256, CheckMask, CheckProperties>(Q, 1, 4, PVCProps);
  } // TestPVCFeatures

  return Passed;
}
Loading