-
Notifications
You must be signed in to change notification settings - Fork 795
[SYCL][ESIMD] Add compile time properties overload of USM block store #11641
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
//==------- block_store.hpp - DPC++ ESIMD on-device test ----------------==// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===-------------------------------------------------------------------===// | ||
|
||
#include "common.hpp" | ||
|
||
using namespace sycl; | ||
using namespace sycl::ext::intel::esimd; | ||
|
||
template <typename T, uint16_t N, bool UseMask, bool UseProperties, | ||
typename StorePropertiesT> | ||
bool testUSM(queue Q, uint32_t Groups, uint32_t Threads, | ||
StorePropertiesT StoreProperties) { | ||
|
||
uint16_t Size = Groups * Threads * N; | ||
using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>; | ||
|
||
std::cout << "USM case: T=" << esimd_test::type_name<T>() << ",N=" << N | ||
<< ",UseMask=" << UseMask << ",UseProperties=" << UseProperties | ||
<< std::endl; | ||
|
||
sycl::range<1> GlobalRange{Groups}; | ||
sycl::range<1> LocalRange{Threads}; | ||
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; | ||
constexpr size_t Alignment = getAlignment<T, N, UseMask>(StoreProperties); | ||
T *Out = sycl::aligned_alloc_shared<T>(Alignment, Size, Q); | ||
T Out_val = esimd_test::getRandomValue<T>(); | ||
for (int i = 0; i < Size; i++) | ||
Out[i] = Out_val; | ||
|
||
try { | ||
Q.submit([&](handler &cgh) { | ||
cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { | ||
uint16_t GlobalID = ndi.get_global_id(0); | ||
uint32_t ElemOff = GlobalID * N; | ||
// TODO: these 2 lines work-around the problem with scalar | ||
// conversions to bfloat16. It could be just: "simd<T, N> | ||
// PassThru(ElemOffset, 1);" | ||
simd<uint32_t, N> PassThruInt(ElemOff, 1); | ||
simd<T, N> Vals = PassThruInt; | ||
if constexpr (UseMask) { | ||
simd_mask<1> Mask = (GlobalID + 1) % 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi Nick, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Val % 1) always gives 0. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do this now, thanks for the heads up There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Luckily it passes, I'm making a PR now |
||
block_store(Out + ElemOff, Vals, Mask, StorePropertiesT{}); | ||
Vals = block_load<T, N>(Out + ElemOff); | ||
Vals += 1; | ||
block_store(Out, ElemOff * sizeof(T), Vals, Mask, | ||
StorePropertiesT{}); | ||
Vals = block_load<T, N>(Out + ElemOff); | ||
Vals += 2; | ||
auto View = Vals.template select<N, 1>(); | ||
block_store<T, N>(Out, ElemOff * sizeof(T), View, Mask, | ||
StorePropertiesT{}); | ||
Vals = block_load<T, N>(Out + ElemOff); | ||
Vals += 3; | ||
View = Vals.template select<N, 1>(); | ||
block_store<T, N>(Out + ElemOff, View, Mask, StorePropertiesT{}); | ||
} else { | ||
if constexpr (UseProperties) | ||
block_store(Out + ElemOff, Vals, StorePropertiesT{}); | ||
|
||
else | ||
block_store(Out + ElemOff, Vals); | ||
|
||
Vals = block_load<T, N>(Out + ElemOff); | ||
Vals += 1; | ||
if constexpr (UseProperties) | ||
block_store(Out, ElemOff * sizeof(T), Vals, StorePropertiesT{}); | ||
else | ||
block_store(Out, ElemOff * sizeof(T), Vals); | ||
|
||
Vals = block_load<T, N>(Out + ElemOff); | ||
Vals += 2; | ||
auto View = Vals.template select<N, 1>(); | ||
if constexpr (UseProperties) | ||
block_store<T, N>(Out, ElemOff * sizeof(T), View, | ||
StorePropertiesT{}); | ||
else | ||
block_store<T, N>(Out, ElemOff * sizeof(T), View); | ||
|
||
Vals = block_load<T, N>(Out + ElemOff); | ||
Vals += 3; | ||
View = Vals.template select<N, 1>(); | ||
if constexpr (UseProperties) | ||
block_store<T, N>(Out + ElemOff, View, StorePropertiesT{}); | ||
else | ||
block_store<T, N>(Out + ElemOff, View); | ||
} | ||
}); | ||
}).wait(); | ||
} catch (sycl::exception const &e) { | ||
std::cout << "SYCL exception caught: " << e.what() << '\n'; | ||
sycl::free(Out, Q); | ||
return false; | ||
} | ||
|
||
bool Passed = true; | ||
|
||
for (int i = 0; i < Size; i++) { | ||
bool IsMaskSet = (i / N + 1) % 1; | ||
Tuint Expected = sycl::bit_cast<Tuint>(Out_val); | ||
if (!UseMask || IsMaskSet) | ||
Expected = sycl::bit_cast<Tuint>((T)(i + 6)); | ||
Tuint Computed = sycl::bit_cast<Tuint>(Out[i]); | ||
if (Computed != Expected) { | ||
Passed = false; | ||
std::cout << "Out[" << i << "] = " << std::to_string(Computed) << " vs " | ||
<< std::to_string(Expected) << std::endl; | ||
} | ||
} | ||
|
||
sycl::free(Out, Q); | ||
|
||
return Passed; | ||
} | ||
|
||
// Runs testUSM() for element type T over a series of vector lengths and
// work-group configurations and accumulates the results.
//
// Template parameters:
//   T               - element type to store.
//   TestPVCFeatures - when true, additionally runs the cases that pass cache
//                     hints, which require PVC.
// Parameter:
//   Q - queue forwarded to every testUSM() call.
// Returns true only if every executed testUSM() call returned true. All cases
// are always run; an earlier failure does not stop the later ones.
template <typename T, bool TestPVCFeatures> bool test_block_store(queue Q) {
  // Readable names for the UseMask / UseProperties arguments of testUSM().
  constexpr bool Masked = true;
  constexpr bool Unmasked = false;
  constexpr bool WithProps = true;
  constexpr bool NoProps = false;
  properties ElemAlignedProps{alignment<sizeof(T)>};

  bool AllOk = true;

  // block_store() variants that are available on both Gen12 and PVC.
  AllOk &= testUSM<T, 1, Unmasked, WithProps>(Q, 2, 4, ElemAlignedProps);
  AllOk &= testUSM<T, 2, Unmasked, WithProps>(Q, 1, 4, ElemAlignedProps);
  AllOk &= testUSM<T, 3, Unmasked, WithProps>(Q, 2, 8, ElemAlignedProps);
  AllOk &= testUSM<T, 4, Unmasked, WithProps>(Q, 2, 4, ElemAlignedProps);
  AllOk &= testUSM<T, 8, Unmasked, WithProps>(Q, 2, 4, ElemAlignedProps);
  AllOk &= testUSM<T, 16, Unmasked, WithProps>(Q, 2, 4, ElemAlignedProps);
  AllOk &= testUSM<T, 32, Unmasked, WithProps>(Q, 2, 4, ElemAlignedProps);
  // A non-power-of-2 simd size is checked on purpose - it must work.
  AllOk &= testUSM<T, 33, Unmasked, WithProps>(Q, 2, 4, ElemAlignedProps);
  // TODO: Enable after the failure is fixed. Per the review discussion, N=67
  // also fails with the previous API; incorrect lowering of the 67-element
  // store in the GPU backend is suspected but not yet confirmed, and the
  // issue is tracked internally.
  // AllOk &= testUSM<T, 67, Unmasked, WithProps>(Q, 1, 4, ElemAlignedProps);

  // Big simd sizes are checked on purpose - they must work.
  AllOk &= testUSM<T, 128, Unmasked, WithProps>(Q, 2, 4, ElemAlignedProps);
  AllOk &= testUSM<T, 256, Unmasked, WithProps>(Q, 1, 4, ElemAlignedProps);

  // block_store() called without the compile-time properties argument.
  AllOk &= testUSM<T, 16, Unmasked, NoProps>(Q, 2, 4, ElemAlignedProps);
  AllOk &= testUSM<T, 32, Unmasked, NoProps>(Q, 2, 4, ElemAlignedProps);

  if constexpr (TestPVCFeatures) {
    // Passing cache hints adds the requirement to run the tests on PVC.
    // The PVC variant currently also requires a power-of-two number of
    // elements, and the number of bytes stored per call must not exceed 512.
    properties CacheHintProps{cache_hint_L1<cache_hint::write_back>,
                              cache_hint_L2<cache_hint::write_back>,
                              alignment<16>};

    // Only d/q-word sized transfers are supported now, so the two smallest
    // vector lengths are skipped for element types that are too narrow.
    if constexpr (sizeof(T) >= 4)
      AllOk &= testUSM<T, 1, Unmasked, WithProps>(Q, 2, 4, CacheHintProps);
    if constexpr (sizeof(T) >= 2)
      AllOk &= testUSM<T, 2, Unmasked, WithProps>(Q, 5, 5, CacheHintProps);
    AllOk &= testUSM<T, 4, Unmasked, WithProps>(Q, 5, 5, CacheHintProps);
    AllOk &= testUSM<T, 8, Unmasked, WithProps>(Q, 5, 5, CacheHintProps);
    AllOk &= testUSM<T, 16, Masked, WithProps>(Q, 5, 5, CacheHintProps);
    AllOk &= testUSM<T, 32, Unmasked, WithProps>(Q, 2, 4, CacheHintProps);
    AllOk &= testUSM<T, 64, Unmasked, WithProps>(Q, 7, 1, CacheHintProps);
    // Respect the 512-byte per-call limit for the largest vector lengths.
    if constexpr (128 * sizeof(T) <= 512)
      AllOk &= testUSM<T, 128, Masked, WithProps>(Q, 1, 4, CacheHintProps);
    if constexpr (256 * sizeof(T) <= 512)
      AllOk &= testUSM<T, 256, Masked, WithProps>(Q, 1, 4, CacheHintProps);
  } // TestPVCFeatures

  return AllOk;
}
Uh oh!
There was an error while loading. Please reload this page.