Skip to content

Commit d38206c

Browse files
authored
[SYCL][ESIMD] Implement unified memory API - block_store(usm, ...) (#11641)
This change adds the groundwork for adding overloads of the block_store APIs accepting compile time properties (L1,L2 cache hints, alignment). We have 8 overloads total, with various combinations of offset, predicate and simd_view. --------- Signed-off-by: Sarnie, Nick <[email protected]>
1 parent 0b5757b commit d38206c

File tree

9 files changed

+684
-130
lines changed

9 files changed

+684
-130
lines changed

sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,39 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t<N> pred,
655655
}
656656
#endif // __SYCL_DEVICE_ONLY__
657657

658+
/// USM pointer scatter.
659+
/// Supported platforms: DG2, PVC
660+
///
661+
/// Scatters (stores) elements to the specified addresses.
662+
///
/// Only a declaration is provided when __SYCL_DEVICE_ONLY__ is defined;
/// otherwise the body expands to __ESIMD_UNSUPPORTED_ON_HOST.
///
663+
/// @tparam Ty is element type.
664+
/// @tparam L1H is L1 cache hint.
665+
/// @tparam L2H is L2 cache hint.
666+
/// @tparam AddressScale is the address scale.
667+
/// @tparam ImmOffset is the immediate offset added to each address.
668+
/// @tparam DS is the data size.
669+
/// @tparam VS is the number of elements to store per address.
670+
/// @tparam _Transposed indicates if the data is transposed during the transfer.
671+
/// @tparam N is the SIMD size of operation (the number of addresses to access)
672+
/// @param pred is the predicates (mask storage with N elements).
673+
/// @param addrs is the addresses to store to (N elements).
674+
/// @param vals is values to store (N * to_int<VS>() elements of type Ty).
675+
template <typename Ty, __ESIMD_NS::cache_hint L1H, __ESIMD_NS::cache_hint L2H,
676+
uint16_t AddressScale, int ImmOffset, __ESIMD_DNS::lsc_data_size DS,
677+
__ESIMD_DNS::lsc_vector_size VS,
678+
__ESIMD_DNS::lsc_data_order _Transposed, int N>
679+
__ESIMD_INTRIN void __esimd_lsc_store_stateless(
680+
__ESIMD_DNS::simd_mask_storage_t<N> pred,
681+
__ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
682+
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_DNS::to_int<VS>()> vals)
683+
#ifdef __SYCL_DEVICE_ONLY__
684+
;
685+
#else // __SYCL_DEVICE_ONLY__
686+
{
687+
__ESIMD_UNSUPPORTED_ON_HOST;
688+
}
689+
#endif // __SYCL_DEVICE_ONLY__
690+
658691
// \brief Raw sends.
659692
//
660693
// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT).

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 336 additions & 1 deletion
Large diffs are not rendered by default.

sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -282,39 +282,6 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti(
282282
}
283283
#endif // __SYCL_DEVICE_ONLY__
284284

285-
/// USM pointer scatter.
286-
/// Supported platforms: DG2, PVC
287-
///
288-
/// Scatters elements to specific address.
289-
///
290-
/// @tparam Ty is element type.
291-
/// @tparam L1H is L1 cache hint.
292-
/// @tparam L3H is L3 cache hint.
293-
/// @tparam AddressScale is the address scale.
294-
/// @tparam ImmOffset is the immediate offset added to each address.
295-
/// @tparam DS is the data size.
296-
/// @tparam VS is the number of elements to load per address.
297-
/// @tparam Transposed indicates if the data is transposed during the transfer.
298-
/// @tparam N is the SIMD size of operation (the number of addresses to access)
299-
/// @param pred is predicates.
300-
/// @param addrs is the prefetch addresses.
301-
/// @param vals is values to store.
302-
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
303-
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
304-
__ESIMD_EDNS::lsc_vector_size VS,
305-
__ESIMD_EDNS::lsc_data_order _Transposed, int N>
306-
__ESIMD_INTRIN void __esimd_lsc_store_stateless(
307-
__ESIMD_DNS::simd_mask_storage_t<N> pred,
308-
__ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
309-
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> vals)
310-
#ifdef __SYCL_DEVICE_ONLY__
311-
;
312-
#else // __SYCL_DEVICE_ONLY__
313-
{
314-
__ESIMD_UNSUPPORTED_ON_HOST;
315-
}
316-
#endif // __SYCL_DEVICE_ONLY__
317-
318285
/// 2D USM pointer block load.
319286
/// Supported platforms: PVC
320287
///

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 3 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1765,7 +1765,7 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
17651765
///
17661766
/// @tparam T is element type.
17671767
/// @tparam NElts is the number of elements to store per address.
1768-
/// @tparam DS is the data size.
1768+
/// @tparam DS is the data size (unused/obsolete).
17691769
/// @tparam L1H is L1 cache hint.
17701770
/// @tparam L3H is L3 cache hint.
17711771
/// @param p is the base pointer.
@@ -1781,62 +1781,8 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
17811781
__ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
17821782
lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
17831783
__ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
1784-
detail::check_lsc_data_size<T, DS>();
1785-
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
1786-
constexpr auto Alignment =
1787-
FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
1788-
static_assert(
1789-
(Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
1790-
(Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
1791-
"Incorrect alignment for the data type");
1792-
1793-
// Prepare template arguments for the call of intrinsic.
1794-
constexpr uint16_t _AddressScale = 1;
1795-
constexpr int _ImmOffset = 0;
1796-
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
1797-
static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 ||
1798-
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
1799-
"Conversion data types are not supported");
1800-
constexpr detail::lsc_data_order _Transposed =
1801-
detail::lsc_data_order::transpose;
1802-
constexpr int N = 1;
1803-
__ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);
1804-
1805-
constexpr int SmallIntFactor32Bit =
1806-
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
1807-
static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
1808-
"Number of elements is not supported by Transposed store");
1809-
1810-
constexpr bool Use64BitData =
1811-
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
1812-
(sizeof(T) == 8 ||
1813-
(DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
1814-
(NElts * sizeof(T)) % 8 == 0));
1815-
constexpr int SmallIntFactor64Bit =
1816-
(_DS == lsc_data_size::u16)
1817-
? 4
1818-
: (_DS == lsc_data_size::u8 ? 8
1819-
: (_DS == lsc_data_size::u32 ? 2 : 1));
1820-
constexpr int SmallIntFactor =
1821-
Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
1822-
constexpr int FactoredNElts = NElts / SmallIntFactor;
1823-
constexpr lsc_data_size ActualDS = Use64BitData
1824-
? __ESIMD_ENS::lsc_data_size::u64
1825-
: __ESIMD_ENS::lsc_data_size::u32;
1826-
1827-
detail::check_lsc_vector_size<FactoredNElts>();
1828-
constexpr detail::lsc_vector_size _VS =
1829-
detail::to_lsc_vector_size<FactoredNElts>();
1830-
1831-
using StoreType = __ESIMD_DNS::__raw_t<
1832-
std::conditional_t<SmallIntFactor == 1, T,
1833-
std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;
1834-
1835-
__esimd_lsc_store_stateless<StoreType, L1H, L3H, _AddressScale, _ImmOffset,
1836-
ActualDS, _VS, _Transposed, N>(
1837-
pred.data(), Addrs.data(),
1838-
sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
1839-
vals.data()));
1784+
return __ESIMD_DNS::block_store_impl<T, NElts, L1H, L3H>(p, vals, pred,
1785+
flags);
18401786
}
18411787

18421788
/// A variation of lsc_block_store without predicate parameter to simplify

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp

Lines changed: 1 addition & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,49 +13,11 @@
1313
#include <iostream>
1414

1515
#include "../../esimd_test_utils.hpp"
16+
#include "common.hpp"
1617

1718
using namespace sycl;
1819
using namespace sycl::ext::intel::esimd;
1920

20-
template <typename Key, typename PropertiesT>
21-
constexpr cache_hint getCacheHint(PropertiesT) {
22-
if constexpr (PropertiesT::template has_property<Key>()) {
23-
constexpr auto ValueT = PropertiesT::template get_property<Key>();
24-
return ValueT.hint;
25-
} else {
26-
return cache_hint::none;
27-
}
28-
}
29-
30-
template <typename PropertiesT>
31-
constexpr size_t getAlignment(PropertiesT, size_t DefaultAlignment) {
32-
if constexpr (PropertiesT::template has_property<
33-
sycl::ext::intel::esimd::alignment_key>()) {
34-
constexpr auto ValueT = PropertiesT::template get_property<
35-
sycl::ext::intel::esimd::alignment_key>();
36-
return ValueT.value;
37-
} else {
38-
return DefaultAlignment;
39-
}
40-
}
41-
42-
template <typename T, uint16_t N, bool UseMask, typename PropertiesT>
43-
constexpr size_t getAlignment(PropertiesT Props) {
44-
constexpr cache_hint L1Hint =
45-
getCacheHint<sycl::ext::intel::esimd::cache_hint_L1_key>(Props);
46-
constexpr cache_hint L2Hint =
47-
getCacheHint<sycl::ext::intel::esimd::cache_hint_L2_key>(Props);
48-
constexpr bool RequiresPVC =
49-
L1Hint != cache_hint::none || L2Hint != cache_hint::none || UseMask;
50-
51-
constexpr bool IsMaxLoadSizePVC = RequiresPVC && (N * sizeof(T) > 256);
52-
constexpr size_t RequiredAlignment =
53-
IsMaxLoadSizePVC ? 8 : (RequiresPVC ? 4 : sizeof(T));
54-
constexpr size_t RequestedAlignment = getAlignment(Props, RequiredAlignment);
55-
static_assert(RequestedAlignment >= RequiredAlignment, "Too small alignment");
56-
return RequestedAlignment;
57-
}
58-
5921
// Returns true iff verification is passed.
6022
template <typename T>
6123
bool verify(const T *In, const T *Out, size_t Size, int N,
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
//==------- block_store.hpp - DPC++ ESIMD on-device test ----------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===-------------------------------------------------------------------===//
8+
9+
#include "common.hpp"
10+
11+
using namespace sycl;
12+
using namespace sycl::ext::intel::esimd;
13+
14+
template <typename T, uint16_t N, bool UseMask, bool UseProperties,
15+
typename StorePropertiesT>
16+
bool testUSM(queue Q, uint32_t Groups, uint32_t Threads,
17+
StorePropertiesT StoreProperties) {
18+
19+
uint16_t Size = Groups * Threads * N;
20+
using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;
21+
22+
std::cout << "USM case: T=" << esimd_test::type_name<T>() << ",N=" << N
23+
<< ",UseMask=" << UseMask << ",UseProperties=" << UseProperties
24+
<< std::endl;
25+
26+
sycl::range<1> GlobalRange{Groups};
27+
sycl::range<1> LocalRange{Threads};
28+
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
29+
constexpr size_t Alignment = getAlignment<T, N, UseMask>(StoreProperties);
30+
T *Out = sycl::aligned_alloc_shared<T>(Alignment, Size, Q);
31+
T Out_val = esimd_test::getRandomValue<T>();
32+
for (int i = 0; i < Size; i++)
33+
Out[i] = Out_val;
34+
35+
try {
36+
Q.submit([&](handler &cgh) {
37+
cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
38+
uint16_t GlobalID = ndi.get_global_id(0);
39+
uint32_t ElemOff = GlobalID * N;
40+
// TODO: these 2 lines work-around the problem with scalar
41+
// conversions to bfloat16. It could be just: "simd<T, N>
42+
// PassThru(ElemOffset, 1);"
43+
simd<uint32_t, N> PassThruInt(ElemOff, 1);
44+
simd<T, N> Vals = PassThruInt;
45+
if constexpr (UseMask) {
46+
simd_mask<1> Mask = (GlobalID + 1) % 1;
47+
block_store(Out + ElemOff, Vals, Mask, StorePropertiesT{});
48+
Vals = block_load<T, N>(Out + ElemOff);
49+
Vals += 1;
50+
block_store(Out, ElemOff * sizeof(T), Vals, Mask,
51+
StorePropertiesT{});
52+
Vals = block_load<T, N>(Out + ElemOff);
53+
Vals += 2;
54+
auto View = Vals.template select<N, 1>();
55+
block_store<T, N>(Out, ElemOff * sizeof(T), View, Mask,
56+
StorePropertiesT{});
57+
Vals = block_load<T, N>(Out + ElemOff);
58+
Vals += 3;
59+
View = Vals.template select<N, 1>();
60+
block_store<T, N>(Out + ElemOff, View, Mask, StorePropertiesT{});
61+
} else {
62+
if constexpr (UseProperties)
63+
block_store(Out + ElemOff, Vals, StorePropertiesT{});
64+
65+
else
66+
block_store(Out + ElemOff, Vals);
67+
68+
Vals = block_load<T, N>(Out + ElemOff);
69+
Vals += 1;
70+
if constexpr (UseProperties)
71+
block_store(Out, ElemOff * sizeof(T), Vals, StorePropertiesT{});
72+
else
73+
block_store(Out, ElemOff * sizeof(T), Vals);
74+
75+
Vals = block_load<T, N>(Out + ElemOff);
76+
Vals += 2;
77+
auto View = Vals.template select<N, 1>();
78+
if constexpr (UseProperties)
79+
block_store<T, N>(Out, ElemOff * sizeof(T), View,
80+
StorePropertiesT{});
81+
else
82+
block_store<T, N>(Out, ElemOff * sizeof(T), View);
83+
84+
Vals = block_load<T, N>(Out + ElemOff);
85+
Vals += 3;
86+
View = Vals.template select<N, 1>();
87+
if constexpr (UseProperties)
88+
block_store<T, N>(Out + ElemOff, View, StorePropertiesT{});
89+
else
90+
block_store<T, N>(Out + ElemOff, View);
91+
}
92+
});
93+
}).wait();
94+
} catch (sycl::exception const &e) {
95+
std::cout << "SYCL exception caught: " << e.what() << '\n';
96+
sycl::free(Out, Q);
97+
return false;
98+
}
99+
100+
bool Passed = true;
101+
102+
for (int i = 0; i < Size; i++) {
103+
bool IsMaskSet = (i / N + 1) % 1;
104+
Tuint Expected = sycl::bit_cast<Tuint>(Out_val);
105+
if (!UseMask || IsMaskSet)
106+
Expected = sycl::bit_cast<Tuint>((T)(i + 6));
107+
Tuint Computed = sycl::bit_cast<Tuint>(Out[i]);
108+
if (Computed != Expected) {
109+
Passed = false;
110+
std::cout << "Out[" << i << "] = " << std::to_string(Computed) << " vs "
111+
<< std::to_string(Expected) << std::endl;
112+
}
113+
}
114+
115+
sycl::free(Out, Q);
116+
117+
return Passed;
118+
}
119+
120+
/// Runs testUSM() over a fixed set of vector sizes for element type \p T.
///
/// @tparam T element type to store.
/// @tparam TestPVCFeatures when true, additionally runs the cases that pass
///         cache hints and/or a predicate.
/// @param Q queue handed to every testUSM() call.
/// @return true iff every testUSM() call returned true.
template <typename T, bool TestPVCFeatures> bool test_block_store(queue Q) {
  // Named values for the UseMask / UseProperties template arguments.
  constexpr bool WithMask = true;
  constexpr bool NoMask = !WithMask;
  constexpr bool WithProps = true;
  constexpr bool NoProps = !WithProps;
  properties AlignOnlyProps{alignment<sizeof(T)>};

  bool Passed = true;

  // block_store() variants that are available on both Gen12 and PVC.
  Passed &= testUSM<T, 1, NoMask, WithProps>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 2, NoMask, WithProps>(Q, 1, 4, AlignOnlyProps);
  Passed &= testUSM<T, 3, NoMask, WithProps>(Q, 2, 8, AlignOnlyProps);
  Passed &= testUSM<T, 4, NoMask, WithProps>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 8, NoMask, WithProps>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 16, NoMask, WithProps>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 32, NoMask, WithProps>(Q, 2, 4, AlignOnlyProps);
  // A non-power-of-2 simd size is checked on purpose - it must work.
  Passed &= testUSM<T, 33, NoMask, WithProps>(Q, 2, 4, AlignOnlyProps);
  // TODO: Enable after failure fixed
  // Passed &= testUSM<T, 67, NoMask, WithProps>(Q, 1, 4, AlignOnlyProps);
  // Big simd sizes are checked on purpose - they must work.
  Passed &= testUSM<T, 128, NoMask, WithProps>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 256, NoMask, WithProps>(Q, 1, 4, AlignOnlyProps);

  // block_store() called without the compile-time properties argument.
  Passed &= testUSM<T, 16, NoMask, NoProps>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 32, NoMask, NoProps>(Q, 2, 4, AlignOnlyProps);

  if constexpr (TestPVCFeatures) {
    // Using cache hints adds the requirement to run tests on PVC.
    // Also, the PVC variant currently requires a power-of-two number of
    // elements and the number of bytes stored per call must not exceed 512.
    constexpr size_t MaxBytesPerCall = 512;
    properties PVCProps{cache_hint_L1<cache_hint::write_back>,
                        cache_hint_L2<cache_hint::write_back>, alignment<16>};

    // Only d/q words are supported now, hence the element-size guards on the
    // two smallest vector lengths.
    if constexpr (sizeof(T) >= 4)
      Passed &= testUSM<T, 1, NoMask, WithProps>(Q, 2, 4, PVCProps);
    if constexpr (sizeof(T) >= 2)
      Passed &= testUSM<T, 2, NoMask, WithProps>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 4, NoMask, WithProps>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 8, NoMask, WithProps>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 16, WithMask, WithProps>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 32, NoMask, WithProps>(Q, 2, 4, PVCProps);
    Passed &= testUSM<T, 64, NoMask, WithProps>(Q, 7, 1, PVCProps);
    if constexpr (128 * sizeof(T) <= MaxBytesPerCall)
      Passed &= testUSM<T, 128, WithMask, WithProps>(Q, 1, 4, PVCProps);
    if constexpr (256 * sizeof(T) <= MaxBytesPerCall)
      Passed &= testUSM<T, 256, WithMask, WithProps>(Q, 1, 4, PVCProps);
  } // TestPVCFeatures

  return Passed;
}

0 commit comments

Comments
 (0)