Skip to content

[SYCL][ESIMD] Add compile time properties overload of USM block store #11641

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,39 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t<N> pred,
}
#endif // __SYCL_DEVICE_ONLY__

/// USM pointer scatter/store.
/// Supported platforms: DG2, PVC
///
/// Scatters (stores) elements to the given addresses.
///
/// @tparam Ty is the element type.
/// @tparam L1H is the L1 cache hint.
/// @tparam L2H is the L2 cache hint.
/// @tparam AddressScale is the address scale.
/// @tparam ImmOffset is the immediate offset added to each address.
/// @tparam DS is the data size.
/// @tparam VS is the number of elements to store per address.
/// @tparam _Transposed indicates if the data is transposed during the
/// transfer.
/// @tparam N is the SIMD size of operation (the number of addresses to
/// access).
/// @param pred is the per-channel predicate; disabled channels do not store.
/// @param addrs is the store addresses.
/// @param vals is the values to store.
template <typename Ty, __ESIMD_NS::cache_hint L1H, __ESIMD_NS::cache_hint L2H,
          uint16_t AddressScale, int ImmOffset, __ESIMD_DNS::lsc_data_size DS,
          __ESIMD_DNS::lsc_vector_size VS,
          __ESIMD_DNS::lsc_data_order _Transposed, int N>
__ESIMD_INTRIN void __esimd_lsc_store_stateless(
    __ESIMD_DNS::simd_mask_storage_t<N> pred,
    __ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
    __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_DNS::to_int<VS>()> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else  // __SYCL_DEVICE_ONLY__
{
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__

// \brief Raw sends.
//
// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT).
Expand Down
337 changes: 336 additions & 1 deletion sycl/include/sycl/ext/intel/esimd/memory.hpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -282,39 +282,6 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti(
}
#endif // __SYCL_DEVICE_ONLY__

/// USM pointer scatter/store.
/// Supported platforms: DG2, PVC
///
/// Scatters (stores) elements to the given addresses.
///
/// @tparam Ty is the element type.
/// @tparam L1H is the L1 cache hint.
/// @tparam L3H is the L3 cache hint.
/// @tparam AddressScale is the address scale.
/// @tparam ImmOffset is the immediate offset added to each address.
/// @tparam DS is the data size.
/// @tparam VS is the number of elements to store per address.
/// @tparam _Transposed indicates if the data is transposed during the
/// transfer.
/// @tparam N is the SIMD size of operation (the number of addresses to
/// access).
/// @param pred is the per-channel predicate; disabled channels do not store.
/// @param addrs is the store addresses.
/// @param vals is the values to store.
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
          uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
          __ESIMD_EDNS::lsc_vector_size VS,
          __ESIMD_EDNS::lsc_data_order _Transposed, int N>
__ESIMD_INTRIN void __esimd_lsc_store_stateless(
    __ESIMD_DNS::simd_mask_storage_t<N> pred,
    __ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
    __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else  // __SYCL_DEVICE_ONLY__
{
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__

/// 2D USM pointer block load.
/// Supported platforms: PVC
///
Expand Down
60 changes: 3 additions & 57 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1765,7 +1765,7 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
/// @tparam DS is the data size.
/// @tparam DS is the data size (unused/obsolete).
/// @tparam L1H is L1 cache hint.
/// @tparam L3H is L3 cache hint.
/// @param p is the base pointer.
Expand All @@ -1781,62 +1781,8 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
__ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
__ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr auto Alignment =
FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
static_assert(
(Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
(Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
"Incorrect alignment for the data type");

// Prepare template arguments for the call of intrinsic.
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 ||
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Conversion data types are not supported");
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);

constexpr int SmallIntFactor32Bit =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
"Number of elements is not supported by Transposed store");

constexpr bool Use64BitData =
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
(sizeof(T) == 8 ||
(DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
(NElts * sizeof(T)) % 8 == 0));
constexpr int SmallIntFactor64Bit =
(_DS == lsc_data_size::u16)
? 4
: (_DS == lsc_data_size::u8 ? 8
: (_DS == lsc_data_size::u32 ? 2 : 1));
constexpr int SmallIntFactor =
Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
constexpr int FactoredNElts = NElts / SmallIntFactor;
constexpr lsc_data_size ActualDS = Use64BitData
? __ESIMD_ENS::lsc_data_size::u64
: __ESIMD_ENS::lsc_data_size::u32;

detail::check_lsc_vector_size<FactoredNElts>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<FactoredNElts>();

using StoreType = __ESIMD_DNS::__raw_t<
std::conditional_t<SmallIntFactor == 1, T,
std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;

__esimd_lsc_store_stateless<StoreType, L1H, L3H, _AddressScale, _ImmOffset,
ActualDS, _VS, _Transposed, N>(
pred.data(), Addrs.data(),
sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
vals.data()));
return __ESIMD_DNS::block_store_impl<T, NElts, L1H, L3H>(p, vals, pred,
flags);
}

/// A variation of lsc_block_store without predicate parameter to simplify
Expand Down
40 changes: 1 addition & 39 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,49 +13,11 @@
#include <iostream>

#include "../../esimd_test_utils.hpp"
#include "common.hpp"

using namespace sycl;
using namespace sycl::ext::intel::esimd;

// Extracts the cache hint stored under property 'Key' from the compile-time
// properties list; yields cache_hint::none when the property is absent.
template <typename Key, typename PropertiesT>
constexpr cache_hint getCacheHint(PropertiesT) {
  if constexpr (!PropertiesT::template has_property<Key>()) {
    return cache_hint::none;
  } else {
    constexpr auto HintProp = PropertiesT::template get_property<Key>();
    return HintProp.hint;
  }
}

// Returns the alignment requested via the esimd::alignment property, or
// 'DefaultAlignment' when the properties list carries no alignment property.
template <typename PropertiesT>
constexpr size_t getAlignment(PropertiesT, size_t DefaultAlignment) {
  using AlignKey = sycl::ext::intel::esimd::alignment_key;
  if constexpr (!PropertiesT::template has_property<AlignKey>()) {
    return DefaultAlignment;
  } else {
    constexpr auto AlignProp = PropertiesT::template get_property<AlignKey>();
    return AlignProp.value;
  }
}

// Computes the alignment a test buffer must have for a block operation on
// N elements of T, given the compile-time properties and whether a mask is
// used, and checks that any explicitly requested alignment is big enough.
template <typename T, uint16_t N, bool UseMask, typename PropertiesT>
constexpr size_t getAlignment(PropertiesT Props) {
  constexpr cache_hint CacheL1 =
      getCacheHint<sycl::ext::intel::esimd::cache_hint_L1_key>(Props);
  constexpr cache_hint CacheL2 =
      getCacheHint<sycl::ext::intel::esimd::cache_hint_L2_key>(Props);
  // Cache hints or a mask imply the PVC-only code path.
  constexpr bool NeedsPVC =
      UseMask || CacheL1 != cache_hint::none || CacheL2 != cache_hint::none;

  // Transfers over 256 bytes on the PVC path need 8-byte alignment, the
  // rest of the PVC path needs 4 bytes; otherwise element size suffices.
  constexpr bool BigPVCTransfer = NeedsPVC && (N * sizeof(T) > 256);
  constexpr size_t MinAlign =
      BigPVCTransfer ? 8 : (NeedsPVC ? 4 : sizeof(T));
  constexpr size_t Requested = getAlignment(Props, MinAlign);
  static_assert(Requested >= MinAlign, "Too small alignment");
  return Requested;
}

// Returns true iff verification is passed.
template <typename T>
bool verify(const T *In, const T *Out, size_t Size, int N,
Expand Down
178 changes: 178 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
//==------- block_store.hpp - DPC++ ESIMD on-device test ----------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===-------------------------------------------------------------------===//

#include "common.hpp"

using namespace sycl;
using namespace sycl::ext::intel::esimd;

// Tests block_store() to USM memory: stores known values (optionally with a
// mask and/or a compile-time properties argument), incrementally updates them
// via block_load()/block_store() round-trips, then verifies the result on the
// host.
//
// @tparam T is the element type.
// @tparam N is the number of elements per store.
// @tparam UseMask enables the simd_mask overloads.
// @tparam UseProperties passes the compile-time properties argument.
// @param Q is the queue to run on.
// @param Groups/Threads define the nd_range.
// @param StoreProperties is the compile-time properties list.
// @return true iff verification passed.
template <typename T, uint16_t N, bool UseMask, bool UseProperties,
          typename StorePropertiesT>
bool testUSM(queue Q, uint32_t Groups, uint32_t Threads,
             StorePropertiesT StoreProperties) {

  // uint32_t: Groups * Threads * N may exceed 65535 for big configurations.
  uint32_t Size = Groups * Threads * N;
  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;

  std::cout << "USM case: T=" << esimd_test::type_name<T>() << ",N=" << N
            << ",UseMask=" << UseMask << ",UseProperties=" << UseProperties
            << std::endl;

  sycl::range<1> GlobalRange{Groups};
  sycl::range<1> LocalRange{Threads};
  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
  constexpr size_t Alignment = getAlignment<T, N, UseMask>(StoreProperties);
  T *Out = sycl::aligned_alloc_shared<T>(Alignment, Size, Q);
  // Pre-fill with a random value so masked-off (skipped) stores are
  // detectable during verification.
  T Out_val = esimd_test::getRandomValue<T>();
  for (uint32_t i = 0; i < Size; i++)
    Out[i] = Out_val;

  try {
    Q.submit([&](handler &cgh) {
       cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
         uint16_t GlobalID = ndi.get_global_id(0);
         uint32_t ElemOff = GlobalID * N;
         // TODO: these 2 lines work-around the problem with scalar
         // conversions to bfloat16. It could be just: "simd<T, N>
         // PassThru(ElemOffset, 1);"
         simd<uint32_t, N> PassThruInt(ElemOff, 1);
         simd<T, N> Vals = PassThruInt;
         if constexpr (UseMask) {
           // Even work-items get an all-on mask, odd ones all-off.
           // NOTE: '% 1' (always 0) was a bug here; '& 0x1' is intended.
           simd_mask<1> Mask = (GlobalID + 1) & 0x1;
           block_store(Out + ElemOff, Vals, Mask, StorePropertiesT{});
           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 1;
           block_store(Out, ElemOff * sizeof(T), Vals, Mask,
                       StorePropertiesT{});
           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 2;
           auto View = Vals.template select<N, 1>();
           block_store<T, N>(Out, ElemOff * sizeof(T), View, Mask,
                             StorePropertiesT{});
           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 3;
           View = Vals.template select<N, 1>();
           block_store<T, N>(Out + ElemOff, View, Mask, StorePropertiesT{});
         } else {
           if constexpr (UseProperties)
             block_store(Out + ElemOff, Vals, StorePropertiesT{});
           else
             block_store(Out + ElemOff, Vals);

           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 1;
           if constexpr (UseProperties)
             block_store(Out, ElemOff * sizeof(T), Vals, StorePropertiesT{});
           else
             block_store(Out, ElemOff * sizeof(T), Vals);

           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 2;
           auto View = Vals.template select<N, 1>();
           if constexpr (UseProperties)
             block_store<T, N>(Out, ElemOff * sizeof(T), View,
                               StorePropertiesT{});
           else
             block_store<T, N>(Out, ElemOff * sizeof(T), View);

           Vals = block_load<T, N>(Out + ElemOff);
           Vals += 3;
           View = Vals.template select<N, 1>();
           if constexpr (UseProperties)
             block_store<T, N>(Out + ElemOff, View, StorePropertiesT{});
           else
             block_store<T, N>(Out + ElemOff, View);
         }
       });
     }).wait();
  } catch (sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    sycl::free(Out, Q);
    return false;
  }

  bool Passed = true;

  for (uint32_t i = 0; i < Size; i++) {
    // Must mirror the mask computed in the kernel for the owning work-item.
    // NOTE: '% 1' (always 0) was a bug here; '& 0x1' is intended.
    bool IsMaskSet = (i / N + 1) & 0x1;
    Tuint Expected = sycl::bit_cast<Tuint>(Out_val);
    if (!UseMask || IsMaskSet)
      // Four chained stores add 1 + 2 + 3 on top of the initial ElemOff + i.
      Expected = sycl::bit_cast<Tuint>((T)(i + 6));
    Tuint Computed = sycl::bit_cast<Tuint>(Out[i]);
    if (Computed != Expected) {
      Passed = false;
      std::cout << "Out[" << i << "] = " << std::to_string(Computed) << " vs "
                << std::to_string(Expected) << std::endl;
    }
  }

  sycl::free(Out, Q);

  return Passed;
}

// Driver: runs the USM block_store() test matrix for element type T.
// When TestPVCFeatures is set, additionally runs the variants that require
// PVC hardware (cache hints and masked stores).
template <typename T, bool TestPVCFeatures> bool test_block_store(queue Q) {
  constexpr bool CheckMask = true;
  constexpr bool CheckProperties = true;
  properties AlignOnlyProps{alignment<sizeof(T)>};

  bool Passed = true;

  // Variants of block_store() available on both Gen12 and PVC.
  Passed &= testUSM<T, 1, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 2, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);
  Passed &= testUSM<T, 3, !CheckMask, CheckProperties>(Q, 2, 8, AlignOnlyProps);
  Passed &= testUSM<T, 4, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 8, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 16, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 32, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  // Non-power-of-2 simd sizes are intentionally exercised - they must work.
  Passed &=
      testUSM<T, 33, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  // TODO: Enable after failure fixed
  // Passed &=
  //    testUSM<T, 67, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);
  // Big simd sizes are intentionally exercised - they must work.
  Passed &=
      testUSM<T, 128, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 256, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);

  // block_store() without the compile-time properties argument.
  Passed &=
      testUSM<T, 16, !CheckMask, !CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 32, !CheckMask, !CheckProperties>(Q, 2, 4, AlignOnlyProps);

  if constexpr (TestPVCFeatures) {
    // Cache hints restrict the tests to PVC. The PVC variant currently
    // requires a power-of-two element count and at most 512 bytes per call.
    properties PVCProps{cache_hint_L1<cache_hint::write_back>,
                        cache_hint_L2<cache_hint::write_back>, alignment<16>};

    if constexpr (sizeof(T) >= 4) // only d/q words are supported now
      Passed &= testUSM<T, 1, !CheckMask, CheckProperties>(Q, 2, 4, PVCProps);
    if constexpr (sizeof(T) >= 2) // only d/q words are supported now
      Passed &= testUSM<T, 2, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 4, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 8, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 16, CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 32, !CheckMask, CheckProperties>(Q, 2, 4, PVCProps);
    Passed &= testUSM<T, 64, !CheckMask, CheckProperties>(Q, 7, 1, PVCProps);
    if constexpr (128 * sizeof(T) <= 512)
      Passed &= testUSM<T, 128, CheckMask, CheckProperties>(Q, 1, 4, PVCProps);
    if constexpr (256 * sizeof(T) <= 512)
      Passed &= testUSM<T, 256, CheckMask, CheckProperties>(Q, 1, 4, PVCProps);
  } // TestPVCFeatures

  return Passed;
}
Loading