From 26ce4dad5776e889b878965cc78c57a6e5a7b782 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Wed, 10 Feb 2021 13:06:09 +0300 Subject: [PATCH 01/11] [SYCL] Initial implementation --- sycl/include/CL/__spirv/spirv_ops.hpp | 44 ++++++++ sycl/include/CL/__spirv/spirv_types.hpp | 40 +++++++ sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 130 ++++++++++++++++++++++ sycl/include/CL/sycl/access/access.hpp | 60 +++++++--- sycl/include/CL/sycl/accessor.hpp | 17 +++ 5 files changed, 275 insertions(+), 16 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index dc8515f0bbe05..4594b5ee439f7 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -169,6 +169,50 @@ __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long) __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long long) __SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Min) __SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max) +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_generic)) void * +__spirv_PtrCastToGeneric(const void *Ptr) noexcept; + +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_global)) void * +__spirv_GenericCastToPtrExplicit_ToGlobal(const void *Ptr, + __spv::StorageClass::Flag S) noexcept; + +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_local)) void * +__spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr, + __spv::StorageClass::Flag S) noexcept; + +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_private)) void * +__spirv_GenericCastToPtrExplicit_ToPrivate( + const void *Ptr, __spv::StorageClass::Flag S) noexcept; + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_generic)) dataT * +__spirv_PtrCastToGeneric(const void *Ptr) noexcept { + return (__attribute__((opencl_generic)) dataT *)__spirv_PtrCastToGeneric(Ptr); +} + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_global)) dataT * +__spirv_GenericCastToPtrExplicit_ToGlobal( + const void *Ptr, __spv::StorageClass::Flag S) noexcept { + return (__attribute__((opencl_global)) + dataT *)__spirv_GenericCastToPtrExplicit_ToGlobal(Ptr, S); +} + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_local)) dataT * +__spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr, + __spv::StorageClass::Flag S) noexcept { + return (__attribute__((opencl_local)) + dataT *)__spirv_GenericCastToPtrExplicit_ToLocal(Ptr, S); +} + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_private)) dataT * +__spirv_GenericCastToPtrExplicit_ToPrivate( + const void *Ptr, __spv::StorageClass::Flag S) noexcept { + return (__attribute__((opencl_private)) + dataT *)__spirv_GenericCastToPtrExplicit_ToPrivate(Ptr, S); +} template __SYCL_CONVERGENT__ extern SYCL_EXTERNAL dataT diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp index a938ae0732da0..8141cb2fcbddf 100644 --- a/sycl/include/CL/__spirv/spirv_types.hpp +++ b/sycl/include/CL/__spirv/spirv_types.hpp @@ -36,6 +36,46 @@ struct Scope { Flag flag_value; }; +struct StorageClass { + enum Flag : uint32_t { + UniformConstant = 0, + Input = 1, + sUniform = 2, + Output = 3, + Workgroup = 4, + CrossWorkgroup = 5, + Private = 6, + Function = 7, + Generic = 8, + PushConstant = 9, + AtomicCounter = 10, + Image = 11, + StorageBuffer = 12, + CallableDataKHR = 5328, + CallableDataNV = 5328, + IncomingCallableDataKHR = 5329, + IncomingCallableDataNV = 5329, + RayPayloadKHR = 5338, + RayPayloadNV = 5338, + HitAttributeKHR = 5339, + HitAttributeNV = 5339, + IncomingRayPayloadKHR = 5342, + IncomingRayPayloadNV = 5342, + RecordBufferKHR = 5343, + ShaderRecordBufferNV = 5343, + PhysicalStorageBuffer = 5349, + PhysicalStorageBufferEXT = 5349, + CodeSectionINTEL = 5605, + DeviceOnlyINTEL = 5936, + HostOnlyINTEL = 5937, + Max = 0x7fffffff, + CapabilityUSMStorageClassesINTEL = 5935, + }; + constexpr StorageClass(Flag flag) : flag_value(flag) {} + constexpr operator uint32_t() const { return flag_value; } + Flag flag_value; +}; + struct MemorySemanticsMask { enum Flag : uint32_t { diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index 71ec92f176b47..eddff626a13c5 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -45,6 +45,11 @@ using AcceptableForLocalLoadStore = bool_constant>::value && Space == access::address_space::local_space>; +template +using AcceptableForPrivateLoadStore = + bool_constant>::value && + Space == access::address_space::private_space>; + #ifdef __SYCL_DEVICE_ONLY__ template T load(const multi_ptr src) { @@ -224,6 +229,50 @@ struct sub_group { /* --- sub_group load/stores --- */ /* these can map to SIMD or block read/write hardware where available */ +#ifdef __SYCL_DEVICE_ONLY__ + // Method for decorated pointer + template + detail::enable_if_t< + !std::is_same::type, T>::value, T> + load(T *src) const { + return load(sycl::multi_ptr::type, + sycl::detail::deduce_AS::value>( + (typename detail::remove_AS::type *)src)); + } + +#ifndef SYCL_USE_DECORATED_REF + // Method for raw pointer + template + detail::enable_if_t< + std::is_same::type, T>::value, T> + load(T *src) const { + + auto p = __spirv_GenericCastToPtrExplicit_ToPrivate( + src, __spv::StorageClass::Function); + if (p) + return load(p); + + auto l = __spirv_GenericCastToPtrExplicit_ToLocal( + src, __spv::StorageClass::Workgroup); + if (l) + return load(l); + + auto g = __spirv_GenericCastToPtrExplicit_ToGlobal( + src, __spv::StorageClass::CrossWorkgroup); + if (g) + return load(g); + + // Fallback for other address spaces to be mapped to global + return load(__spirv_PtrCastToGeneric(src)); + } +#endif // SYCL_USE_DECORATED_REF +#else //__SYCL_DEVICE_ONLY__ + template T load(T *src) const { + (void)src; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); + } +#endif //__SYCL_DEVICE_ONLY__ template sycl::detail::enable_if_t< @@ -255,6 +304,20 @@ struct sub_group { #endif } + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForPrivateLoadStore::value, + T> + load(const multi_ptr src) const { +#ifdef __SYCL_DEVICE_ONLY__ + return src.get()[get_local_id()[0]]; +#else + (void)src; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif + } + template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && @@ -315,6 +378,59 @@ struct sub_group { #endif } +#ifdef __SYCL_DEVICE_ONLY__ + // Method for decorated pointer + template + detail::enable_if_t< + !std::is_same::type, T>::value> + store(T *dst, const typename detail::remove_AS::type &x) const { + store(sycl::multi_ptr::type, + sycl::detail::deduce_AS::value>( + (typename detail::remove_AS::type *)dst), + x); + } + +#ifndef SYCL_USE_DECORATED_REF + // Method for raw pointer + template + detail::enable_if_t< + std::is_same::type, T>::value> + store(T *dst, const typename detail::remove_AS::type &x) const { + + auto p = __spirv_GenericCastToPtrExplicit_ToPrivate( + dst, __spv::StorageClass::Function); + if (p) { + store(p, x); + return; + } + + auto l = __spirv_GenericCastToPtrExplicit_ToLocal( + dst, __spv::StorageClass::Workgroup); + if (l) { + store(l, x); + return; + } + + auto g = __spirv_GenericCastToPtrExplicit_ToGlobal( + dst, __spv::StorageClass::CrossWorkgroup); + if (g) { + store(g, x); + return; + } + + // Fallback for other address spaces to be mapped to global + store(__spirv_PtrCastToGeneric(dst), x); + } +#endif // SYCL_USE_DECORATED_REF +#else //__SYCL_DEVICE_ONLY__ + template void store(T *dst, const T &x) const { + (void)dst; + (void)x; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); + } +#endif //__SYCL_DEVICE_ONLY__ + template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value> @@ -347,6 +463,20 @@ struct sub_group { #endif } + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForPrivateLoadStore::value> + store(multi_ptr dst, const T &x) const { +#ifdef __SYCL_DEVICE_ONLY__ + dst.get()[get_local_id()[0]] = x; +#else + (void)dst; + (void)x; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif + } + template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && diff --git a/sycl/include/CL/sycl/access/access.hpp b/sycl/include/CL/sycl/access/access.hpp index fbca743baaa14..23b351b23ba63 100644 --- a/sycl/include/CL/sycl/access/access.hpp +++ b/sycl/include/CL/sycl/access/access.hpp @@ -115,6 +115,7 @@ constexpr bool modeWritesNewData(access::mode m) { #define __OPENCL_LOCAL_AS__ __attribute__((opencl_local)) #define __OPENCL_CONSTANT_AS__ __attribute__((opencl_constant)) #define __OPENCL_PRIVATE_AS__ __attribute__((opencl_private)) +#define __OPENCL_GENERIC_AS__ __attribute__((opencl_generic)) #else #define __OPENCL_GLOBAL_AS__ #define __OPENCL_GLOBAL_DEVICE_AS__ @@ -122,6 +123,7 @@ constexpr bool modeWritesNewData(access::mode m) { #define __OPENCL_LOCAL_AS__ #define __OPENCL_CONSTANT_AS__ #define __OPENCL_PRIVATE_AS__ +#define __OPENCL_GENERIC_AS__ #endif template struct TargetToAS { @@ -187,17 +189,15 @@ template struct DecoratedType { using type = __OPENCL_LOCAL_AS__ ElementType; }; +template struct remove_AS { typedef T type; }; -template -struct remove_AS { - typedef T type; +template struct deduce_AS { + static const access::address_space value = + access::address_space::global_space; }; #ifdef __SYCL_DEVICE_ONLY__ -template -struct remove_AS<__OPENCL_GLOBAL_AS__ T> { - typedef T type; -}; +template struct remove_AS<__OPENCL_GLOBAL_AS__ T> { typedef T type; }; #ifdef __ENABLE_USM_ADDR_SPACE__ template struct remove_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> { @@ -207,22 +207,50 @@ template struct remove_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> { template struct remove_AS<__OPENCL_GLOBAL_HOST_AS__ T> { typedef T type; }; + +template struct deduce_AS<__OPENCL_GLOBAL_DEVICE_AS__ T> { + static const access::address_space value = + access::address_space::global_device_space; +}; + +template struct deduce_AS<__OPENCL_GLOBAL_HOST_AS__ T> { + static const access::address_space value = + access::address_space::global_host_space; +}; #endif // __ENABLE_USM_ADDR_SPACE__ -template -struct remove_AS<__OPENCL_PRIVATE_AS__ T> { +template struct remove_AS<__OPENCL_PRIVATE_AS__ T> { typedef T type; }; -template -struct remove_AS<__OPENCL_LOCAL_AS__ T> { +template struct remove_AS<__OPENCL_LOCAL_AS__ T> { typedef T type; }; + +template struct remove_AS<__OPENCL_CONSTANT_AS__ T> { typedef T type; }; -template -struct remove_AS<__OPENCL_CONSTANT_AS__ T> { +template struct remove_AS<__OPENCL_GENERIC_AS__ T> { typedef T type; }; + +template struct deduce_AS<__OPENCL_PRIVATE_AS__ T> { + static const access::address_space value = + access::address_space::private_space; +}; + +template struct deduce_AS<__OPENCL_LOCAL_AS__ T> { + static const access::address_space value = access::address_space::local_space; +}; + +template struct deduce_AS<__OPENCL_CONSTANT_AS__ T> { + static const access::address_space value = + access::address_space::constant_space; +}; + +template struct deduce_AS<__OPENCL_GENERIC_AS__ T> { + static const access::address_space value = + access::address_space::global_space; +}; #endif #undef __OPENCL_GLOBAL_AS__ @@ -231,8 +259,8 @@ struct remove_AS<__OPENCL_CONSTANT_AS__ T> { #undef __OPENCL_LOCAL_AS__ #undef __OPENCL_CONSTANT_AS__ #undef __OPENCL_PRIVATE_AS__ - +#undef __OPENCL_GENERIC_AS__ } // namespace detail -} // namespace sycl -} // __SYCL_INLINE_NAMESPACE(cl) +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index 76be09cae4a6b..142154e6228da 100755 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -286,8 +286,14 @@ class accessor_common { constexpr static bool IsAccessReadWrite = AccessMode == access::mode::read_write; +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_USE_DECORATED_REF) + using RefType = detail::const_if_const_AS< + AS, typename detail::DecoratedType::type> &; + using ConstRefType = const typename detail::DecoratedType::type &; +#else using RefType = detail::const_if_const_AS &; using ConstRefType = const DataT &; +#endif using PtrType = detail::const_if_const_AS *; using AccType = accessor::type *; +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_USE_DECORATED_REF) + using RefType = detail::const_if_const_AS< + AS, typename detail::DecoratedType::type> &; + using ConstRefType = const typename detail::DecoratedType::type &; +#else using RefType = detail::const_if_const_AS &; using ConstRefType = const DataT &; +#endif using PtrType = detail::const_if_const_AS *; template size_t getLinearIndex(id Id) const { @@ -1794,7 +1806,12 @@ class accessor::type *; +#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_USE_DECORATED_REF) + using RefType = detail::const_if_const_AS< + AS, typename detail::DecoratedType::type> &; +#else using RefType = detail::const_if_const_AS &; +#endif using PtrType = detail::const_if_const_AS *; #ifdef __SYCL_DEVICE_ONLY__ From 1ca5d64c64ab74e7c1c0003632ea37339a4683dd Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Wed, 24 Feb 2021 12:01:07 +0300 Subject: [PATCH 02/11] [SYCL] Add test --- .../test/on-device/extensions/subgroup_as.cpp | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 sycl/test/on-device/extensions/subgroup_as.cpp diff --git a/sycl/test/on-device/extensions/subgroup_as.cpp b/sycl/test/on-device/extensions/subgroup_as.cpp new file mode 100644 index 0000000000000..9516de45c8e6e --- /dev/null +++ b/sycl/test/on-device/extensions/subgroup_as.cpp @@ -0,0 +1,84 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSYCL_USE_DECORATED_REF %s -o %t_dr.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// Sub-groups are not suported on Host +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t_dr.out +// Execution on CPU and FPGA takes 10000 times longer +// RUNx: %CPU_RUN_PLACEHOLDER %t.out +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + cl::sycl::queue queue; + printf("Device Name = %s\n", + queue.get_device().get_info().c_str()); + + // Initialize some host memory + constexpr int N = 64; + int host_mem[N]; + for (int i = 0; i < N; ++i) { + host_mem[i] = i*10000; + } + + // Use the device to transform each value + { + cl::sycl::buffer buf(host_mem, N); + queue.submit([&](cl::sycl::handler &cgh) { + auto global = + buf.get_access(cgh); + sycl::accessor + local(N, cgh); + + cgh.parallel_for( + cl::sycl::nd_range<1>(N, 32), [=](cl::sycl::nd_item<1> it) { + int v[N] = {0,1,2,3,4,5,6,7,8,9, + 10,11,12,13,14,15,16,17,18,19, + 20,21,22,23,24,25,26,27,28,29, + 30,31,32,33,34,35,36,37,38,39, + 40,41,42,43,44,45,46,47,48,49, + 50,51,52,53,54,55,56,57,58,59, + 60,61,62,63}; + cl::sycl::ONEAPI::sub_group sg = it.get_sub_group(); + if (!it.get_local_id(0)) { + int end = it.get_global_id(0)+it.get_local_range()[0]; + for (int i = it.get_global_id(0); i < end; i++) { + local[i] = i * 100; + } + } + it.barrier(); + + int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) * + sg.get_max_local_range()[0]; + // Global address space + auto x = sg.load(&global[i]); + + // Local address space + auto y = sg.load(&local[i]); + +#if SYCL_USE_DECORATED_REF + int z = v[it.get_global_id(0)]; +#else + auto z = sg.load(v+i); +#endif + sg.store(&global[i], x + y + z); + }); + }); + } + + // Print results and tidy up + for (int i = 0; i < N; ++i) { + if(i*10101 != host_mem[i]) { + printf("Unexpected result %06d vs %06d\n", i*10101, host_mem[i]); + return 1; + } + } + printf("Success!\n"); + return 0; +} From 8923e56441059841092d29cfe3b855bb14732440 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Fri, 26 Feb 2021 09:40:20 +0300 Subject: [PATCH 03/11] Modify after review - remove unnecessary attributes for SPIR-V declaration and definitions; - fix SPIR-V enumeration values; - add tests; - generate assert if load/store is called for private pointer; - remove decoreded reference code; - use spir64-unknown-unknown-sycldevice target in LIT infra to use device side asserts; - update documentation. --- .../SYCL_INTEL_sub_group_algorithms.asciidoc | 7 ++ sycl/include/CL/__spirv/spirv_ops.hpp | 16 ++-- sycl/include/CL/__spirv/spirv_types.hpp | 6 +- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 59 +++--------- sycl/include/CL/sycl/accessor.hpp | 17 ---- sycl/test/CMakeLists.txt | 2 +- sycl/test/extensions/sub_group_as.cpp | 90 +++++++++++++++++++ sycl/test/lit.cfg.py | 2 +- .../{subgroup_as.cpp => sub_group_as.cpp} | 30 ++----- .../extensions/sub_group_as_private.cpp | 78 ++++++++++++++++ .../on-device/extensions/sub_group_as_vec.cpp | 72 +++++++++++++++ sycl/test/on-device/lit.cfg.py | 2 +- 12 files changed, 279 insertions(+), 102 deletions(-) create mode 100644 sycl/test/extensions/sub_group_as.cpp rename sycl/test/on-device/extensions/{subgroup_as.cpp => sub_group_as.cpp} (68%) create mode 100644 sycl/test/on-device/extensions/sub_group_as_private.cpp create mode 100644 sycl/test/on-device/extensions/sub_group_as_vec.cpp diff --git a/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc b/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc index af7ed1771f393..d7063afc706e8 100755 --- a/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc +++ b/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc @@ -134,12 +134,18 @@ The load and store sub-group functions enable developers to assert that all work |=== |Function|Description +|+template T load(sub_group sg, const T *src)+ +|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. The address space information is deduced autmaticaly. Only pointers to global and local address spaces are fully valid. Passing pointer in private address space will cause assertion. Other address spaces are casted to global with potentially undefined behaviour. + |+template T load(sub_group sg, const multi_ptr src)+ |Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. |+template vec load(sub_group sg, const multi_ptr src)+ |Load contiguous data from _src_. Returns _N_ elements per work-item, corresponding to the _N_ memory locations at _src_ + +i+ * +get_max_local_range()+ + +get_local_id()+ for +i+ between 0 and _N_. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. +|+template void store(sub_group sg, T *dst, const T& x)+ +|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. The address space information is deduced autmaticaly. Only pointers to global and local address spaces are fully valid. Passing pointer in private address space will cause assertion. Other address spaces are casted to global with potentially undefined behaviour. + |+template void store(sub_group sg, multi_ptr dst, const T& x)+ |Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. @@ -165,6 +171,7 @@ None. |======================================== |Rev|Date|Author|Changes |1|2020-03-16|John Pennycook|*Initial public working draft* +|2|2021-02-26|Vladimir Lazarev|*Add load/store method for raw pointers* |======================================== //************************************************************************ diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 4594b5ee439f7..6506152039a42 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -169,29 +169,29 @@ __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long) __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long long) __SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Min) __SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max) -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_generic)) void * +extern SYCL_EXTERNAL __attribute__((opencl_generic)) void * __spirv_PtrCastToGeneric(const void *Ptr) noexcept; -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_global)) void * +extern SYCL_EXTERNAL __attribute__((opencl_global)) void * __spirv_GenericCastToPtrExplicit_ToGlobal(const void *Ptr, __spv::StorageClass::Flag S) noexcept; -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_local)) void * +extern SYCL_EXTERNAL __attribute__((opencl_local)) void * __spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr, __spv::StorageClass::Flag S) noexcept; -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_private)) void * +extern SYCL_EXTERNAL __attribute__((opencl_private)) void * __spirv_GenericCastToPtrExplicit_ToPrivate( const void *Ptr, __spv::StorageClass::Flag S) noexcept; template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_generic)) dataT * +extern __attribute__((opencl_generic)) dataT * __spirv_PtrCastToGeneric(const void *Ptr) noexcept { return (__attribute__((opencl_generic)) dataT *)__spirv_PtrCastToGeneric(Ptr); } template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_global)) dataT * +extern __attribute__((opencl_global)) dataT * __spirv_GenericCastToPtrExplicit_ToGlobal( const void *Ptr, __spv::StorageClass::Flag S) noexcept { return (__attribute__((opencl_global)) @@ -199,7 +199,7 @@ __spirv_GenericCastToPtrExplicit_ToGlobal( } template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_local)) dataT * +extern __attribute__((opencl_local)) dataT * __spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr, __spv::StorageClass::Flag S) noexcept { return (__attribute__((opencl_local)) @@ -207,7 +207,7 @@ __spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr, } template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __attribute__((opencl_private)) dataT * +extern __attribute__((opencl_private)) dataT * __spirv_GenericCastToPtrExplicit_ToPrivate( const void *Ptr, __spv::StorageClass::Flag S) noexcept { return (__attribute__((opencl_private)) diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp index 8141cb2fcbddf..15564c11a59dc 100644 --- a/sycl/include/CL/__spirv/spirv_types.hpp +++ b/sycl/include/CL/__spirv/spirv_types.hpp @@ -40,7 +40,7 @@ struct StorageClass { enum Flag : uint32_t { UniformConstant = 0, Input = 1, - sUniform = 2, + Uniform = 2, Output = 3, Workgroup = 4, CrossWorkgroup = 5, @@ -61,15 +61,15 @@ struct StorageClass { HitAttributeNV = 5339, IncomingRayPayloadKHR = 5342, IncomingRayPayloadNV = 5342, - RecordBufferKHR = 5343, + ShaderRecordBufferKHR = 5343, ShaderRecordBufferNV = 5343, PhysicalStorageBuffer = 5349, PhysicalStorageBufferEXT = 5349, CodeSectionINTEL = 5605, + CapabilityUSMStorageClassesINTEL = 5935, DeviceOnlyINTEL = 5936, HostOnlyINTEL = 5937, Max = 0x7fffffff, - CapabilityUSMStorageClassesINTEL = 5935, }; constexpr StorageClass(Flag flag) : flag_value(flag) {} constexpr operator uint32_t() const { return flag_value; } diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index eddff626a13c5..0ce464b6137d7 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -45,11 +45,6 @@ using AcceptableForLocalLoadStore = bool_constant>::value && Space == access::address_space::local_space>; -template -using AcceptableForPrivateLoadStore = - bool_constant>::value && - Space == access::address_space::private_space>; - #ifdef __SYCL_DEVICE_ONLY__ template T load(const multi_ptr src) { @@ -240,18 +235,12 @@ struct sub_group { (typename detail::remove_AS::type *)src)); } -#ifndef SYCL_USE_DECORATED_REF // Method for raw pointer template detail::enable_if_t< std::is_same::type, T>::value, T> load(T *src) const { - auto p = __spirv_GenericCastToPtrExplicit_ToPrivate( - src, __spv::StorageClass::Function); - if (p) - return load(p); - auto l = __spirv_GenericCastToPtrExplicit_ToLocal( src, __spv::StorageClass::Workgroup); if (l) @@ -262,10 +251,14 @@ struct sub_group { if (g) return load(g); + auto p = __spirv_GenericCastToPtrExplicit_ToPrivate( + src, __spv::StorageClass::Function); + assert((p == nullptr) && + "Sub-group load() is not supported for private pointers."); + // Fallback for other address spaces to be mapped to global return load(__spirv_PtrCastToGeneric(src)); } -#endif // SYCL_USE_DECORATED_REF #else //__SYCL_DEVICE_ONLY__ template T load(T *src) const { (void)src; @@ -304,20 +297,6 @@ struct sub_group { #endif } - template - sycl::detail::enable_if_t< - sycl::detail::sub_group::AcceptableForPrivateLoadStore::value, - T> - load(const multi_ptr src) const { -#ifdef __SYCL_DEVICE_ONLY__ - return src.get()[get_local_id()[0]]; -#else - (void)src; - throw runtime_error("Sub-groups are not supported on host device.", - PI_INVALID_DEVICE); -#endif - } - template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && @@ -390,20 +369,12 @@ struct sub_group { x); } -#ifndef SYCL_USE_DECORATED_REF // Method for raw pointer template detail::enable_if_t< std::is_same::type, T>::value> store(T *dst, const typename detail::remove_AS::type &x) const { - auto p = __spirv_GenericCastToPtrExplicit_ToPrivate( - dst, __spv::StorageClass::Function); - if (p) { - store(p, x); - return; - } - auto l = __spirv_GenericCastToPtrExplicit_ToLocal( dst, __spv::StorageClass::Workgroup); if (l) { @@ -418,10 +389,14 @@ struct sub_group { return; } + auto p = __spirv_GenericCastToPtrExplicit_ToPrivate( + dst, __spv::StorageClass::Function); + assert((p == nullptr) && + "Sub-group store() is not supported for private pointers."); + // Fallback for other address spaces to be mapped to global store(__spirv_PtrCastToGeneric(dst), x); } -#endif // SYCL_USE_DECORATED_REF #else //__SYCL_DEVICE_ONLY__ template void store(T *dst, const T &x) const { (void)dst; @@ -463,20 +438,6 @@ struct sub_group { #endif } - template - sycl::detail::enable_if_t< - sycl::detail::sub_group::AcceptableForPrivateLoadStore::value> - store(multi_ptr dst, const T &x) const { -#ifdef __SYCL_DEVICE_ONLY__ - dst.get()[get_local_id()[0]] = x; -#else - (void)dst; - (void)x; - throw runtime_error("Sub-groups are not supported on host device.", - PI_INVALID_DEVICE); -#endif - } - template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index 142154e6228da..76be09cae4a6b 100755 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -286,14 +286,8 @@ class accessor_common { constexpr static bool IsAccessReadWrite = AccessMode == access::mode::read_write; -#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_USE_DECORATED_REF) - using RefType = detail::const_if_const_AS< - AS, typename detail::DecoratedType::type> &; - using ConstRefType = const typename detail::DecoratedType::type &; -#else using RefType = detail::const_if_const_AS &; using ConstRefType = const DataT &; -#endif using PtrType = detail::const_if_const_AS *; using AccType = accessor::type *; -#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_USE_DECORATED_REF) - using RefType = detail::const_if_const_AS< - AS, typename detail::DecoratedType::type> &; - using ConstRefType = const typename detail::DecoratedType::type &; -#else using RefType = detail::const_if_const_AS &; using ConstRefType = const DataT &; -#endif using PtrType = detail::const_if_const_AS *; template size_t getLinearIndex(id Id) const { @@ -1806,12 +1794,7 @@ class accessor::type *; -#if defined(__SYCL_DEVICE_ONLY__) && defined(SYCL_USE_DECORATED_REF) - using RefType = detail::const_if_const_AS< - AS, typename detail::DecoratedType::type> &; -#else using RefType = detail::const_if_const_AS &; -#endif using PtrType = detail::const_if_const_AS *; #ifdef __SYCL_DEVICE_ONLY__ diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index ec833e14f13dc..a2a12c8797a76 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -54,7 +54,7 @@ set_target_properties(check-sycl-deploy PROPERTIES FOLDER "SYCL tests") add_lit_testsuite(check-sycl-spirv "Running device-agnostic SYCL regression tests for SPIR-V" ${CMAKE_CURRENT_BINARY_DIR} ARGS ${RT_TEST_ARGS} - PARAMS "SYCL_TRIPLE=spir64-unknown-linux-sycldevice" + PARAMS "SYCL_TRIPLE=spir64-unknown-unknown-sycldevice" DEPENDS ${SYCL_TEST_DEPS} EXCLUDE_FROM_CHECK_ALL ) diff --git a/sycl/test/extensions/sub_group_as.cpp b/sycl/test/extensions/sub_group_as.cpp new file mode 100644 index 0000000000000..086762dc56c12 --- /dev/null +++ b/sycl/test/extensions/sub_group_as.cpp @@ -0,0 +1,90 @@ +// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -O3 -S -emit-llvm -x c++ %s -o - | FileCheck %s + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + cl::sycl::queue queue; + printf("Device Name = %s\n", + queue.get_device().get_info().c_str()); + + // Initialize some host memory + constexpr int N = 64; + int host_mem[N]; + for (int i = 0; i < N; ++i) { + host_mem[i] = i * 100; + } + + // Use the device to transform each value + { + cl::sycl::buffer buf(host_mem, N); + queue.submit([&](cl::sycl::handler &cgh) { + auto global = + buf.get_access(cgh); + sycl::accessor + local(N, cgh); + + // Check that load/store functions for raw pointer was called + // + // CHECK: spir_kernel void @"_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE4test" + // CHECK-COUNT-3: call spir_func i32 + // {{.*}}loadIiEENSt9enable_ifIXsr3std7is_sameINS0_6detail9remove_ASIT_E4typeES7_EE5valueES7_E4typeEPS7_{{.*}}i32 + // addrspace(4)* + // CHECK: call spir_func void {{.*}}storeIiEENSt9enable_ifIXsr3std7is_sameINS0_6detail9remove_ASIT_E4typeES7_EE5valueEvE4typeEPS7_RKS9_({{.*}} addrspace(4)*{{.*}}, + cgh.parallel_for( + cl::sycl::nd_range<1>(N, 32), [=](cl::sycl::nd_item<1> it) { + int v[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + cl::sycl::ONEAPI::sub_group sg = it.get_sub_group(); + if (!it.get_local_id(0)) { + int end = it.get_global_id(0) + it.get_local_range()[0]; + for (int i = it.get_global_id(0); i < end; i++) { + local[i] = i; + } + } + it.barrier(); + + int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) * + sg.get_max_local_range()[0]; + + // CHECK: spir_func i32{{.*}}loadIiEENSt9enable_ifIXsr3std7is_sameINS0_6detail9remove_ASIT_E4typeES7_EE5valueES7_E4typeEPS7_ + // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv + // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* + // CHECK: call spir_func i8* @_Z42__spirv_GenericCastToPtrExplicit_ToPrivatePKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func void @__assert_fail + // CHECK: call spir_func i8 addrspace(4)* @_Z24__spirv_PtrCastToGenericPKv(i8 addrspace(4)* + // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* + // Global address space + auto x = sg.load(&global[i]); + + // Local address space + auto y = sg.load(&local[i]); + + auto z = sg.load(v + i); + + // CHECK: spir_func void {{.*}}storeIiEENSt9enable_ifIXsr3std7is_sameINS0_6detail9remove_ASIT_E4typeES7_EE5valueEvE4typeEPS7_RKS9_ + // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv + // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)* + // CHECK: call spir_func i8* @_Z42__spirv_GenericCastToPtrExplicit_ToPrivatePKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func void @__assert_fail + // CHECK: call spir_func i8 addrspace(4)* @_Z24__spirv_PtrCastToGenericPKv(i8 addrspace(4)* + // CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)* + sg.store(&global[i], x + y + z); + }); + }); + } + + return 0; +} diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py index e1d2982a2b13e..d8dd5611d9f8a 100644 --- a/sycl/test/lit.cfg.py +++ b/sycl/test/lit.cfg.py @@ -84,7 +84,7 @@ # Every SYCL implementation provides a host implementation. config.available_features.add('host') -triple=lit_config.params.get('SYCL_TRIPLE', 'spir64-unknown-linux-sycldevice') +triple=lit_config.params.get('SYCL_TRIPLE', 'spir64-unknown-unknown-sycldevice') lit_config.note("Triple: {}".format(triple)) config.substitutions.append( ('%sycl_triple', triple ) ) diff --git a/sycl/test/on-device/extensions/subgroup_as.cpp b/sycl/test/on-device/extensions/sub_group_as.cpp similarity index 68% rename from sycl/test/on-device/extensions/subgroup_as.cpp rename to sycl/test/on-device/extensions/sub_group_as.cpp index 9516de45c8e6e..bea285f2a2e16 100644 --- a/sycl/test/on-device/extensions/subgroup_as.cpp +++ b/sycl/test/on-device/extensions/sub_group_as.cpp @@ -1,9 +1,7 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSYCL_USE_DECORATED_REF %s -o %t_dr.out // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // Sub-groups are not suported on Host // RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t_dr.out -// Execution on CPU and FPGA takes 10000 times longer +// Execution on CPU and FPGA takes 100000 times longer // RUNx: %CPU_RUN_PLACEHOLDER %t.out // RUNx: %ACC_RUN_PLACEHOLDER %t.out @@ -22,7 +20,7 @@ int main(int argc, char *argv[]) { constexpr int N = 64; int host_mem[N]; for (int i = 0; i < N; ++i) { - host_mem[i] = i*10000; + host_mem[i] = i * 100; } // Use the device to transform each value @@ -38,18 +36,11 @@ int main(int argc, char *argv[]) { cgh.parallel_for( cl::sycl::nd_range<1>(N, 32), [=](cl::sycl::nd_item<1> it) { - int v[N] = {0,1,2,3,4,5,6,7,8,9, - 10,11,12,13,14,15,16,17,18,19, - 20,21,22,23,24,25,26,27,28,29, - 30,31,32,33,34,35,36,37,38,39, - 40,41,42,43,44,45,46,47,48,49, - 50,51,52,53,54,55,56,57,58,59, - 60,61,62,63}; cl::sycl::ONEAPI::sub_group sg = it.get_sub_group(); if (!it.get_local_id(0)) { - int end = it.get_global_id(0)+it.get_local_range()[0]; + int end = it.get_global_id(0) + it.get_local_range()[0]; for (int i = it.get_global_id(0); i < end; i++) { - local[i] = i * 100; + local[i] = i; } } it.barrier(); @@ -62,21 +53,16 @@ int main(int argc, char *argv[]) { // Local address space auto y = sg.load(&local[i]); -#if SYCL_USE_DECORATED_REF - int z = v[it.get_global_id(0)]; -#else - auto z = sg.load(v+i); -#endif - sg.store(&global[i], x + y + z); + sg.store(&global[i], x + y); }); }); } // Print results and tidy up for (int i = 0; i < N; ++i) { - if(i*10101 != host_mem[i]) { - printf("Unexpected result %06d vs %06d\n", i*10101, host_mem[i]); - return 1; + if (i * 101 != host_mem[i]) { + printf("Unexpected result %04d vs %04d\n", i * 101, host_mem[i]); + return 1; } } printf("Success!\n"); diff --git a/sycl/test/on-device/extensions/sub_group_as_private.cpp b/sycl/test/on-device/extensions/sub_group_as_private.cpp new file mode 100644 index 0000000000000..07e33b21a7b35 --- /dev/null +++ b/sycl/test/on-device/extensions/sub_group_as_private.cpp @@ -0,0 +1,78 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// Sub-groups are not suported on Host +// RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER +// Execution on CPU and FPGA takes 100000 times longer +// RUNx: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER +// RUNx: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + cl::sycl::queue queue; + printf("Device Name = %s\n", + queue.get_device().get_info().c_str()); + + // Initialize some host memory + constexpr int N = 64; + int host_mem[N]; + for (int i = 0; i < N; ++i) { + host_mem[i] = i * 100; + } + + // Use the device to transform each value + { + cl::sycl::buffer buf(host_mem, N); + queue.submit([&](cl::sycl::handler &cgh) { + auto global = + buf.get_access(cgh); + sycl::accessor + local(N, cgh); + + cgh.parallel_for( + cl::sycl::nd_range<1>(N, 32), [=](cl::sycl::nd_item<1> it) { + int v[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; + cl::sycl::ONEAPI::sub_group sg = it.get_sub_group(); + if (!it.get_local_id(0)) { + int end = it.get_global_id(0) + it.get_local_range()[0]; + for (int i = it.get_global_id(0); i < end; i++) { + local[i] = i; + } + } + it.barrier(); + + int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) * + sg.get_max_local_range()[0]; + // Global address space + auto x = sg.load(&global[i]); + + // Local address space + auto y = sg.load(&local[i]); + + // CHECK: Sub-group load() is not supported for private pointers. + auto z = sg.load(v + i); + + sg.store(&global[i], x + y); + }); + }); + } + + // Print results and tidy up + for (int i = 0; i < N; ++i) { + if (i * 101 != host_mem[i]) { + printf("Unexpected result %04d vs %04d\n", i * 101, host_mem[i]); + return 1; + } + } + printf("Success!\n"); + return 0; +} diff --git a/sycl/test/on-device/extensions/sub_group_as_vec.cpp b/sycl/test/on-device/extensions/sub_group_as_vec.cpp new file mode 100644 index 0000000000000..8f2e82e20e402 --- /dev/null +++ b/sycl/test/on-device/extensions/sub_group_as_vec.cpp @@ -0,0 +1,72 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// Sub-groups are not suported on Host +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// Execution on CPU and FPGA takes 100000 times longer +// RUNx: %CPU_RUN_PLACEHOLDER %t.out +// RUNx: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + cl::sycl::queue queue; + printf("Device Name = %s\n", + queue.get_device().get_info().c_str()); + + // Initialize some host memory + constexpr int N = 64; + sycl::vec host_mem[N]; + for (int i = 0; i < N; ++i) { + host_mem[i].s0() = i; + host_mem[i].s1() = 0; + } + + // Use the device to transform each value + { + cl::sycl::buffer, 1> buf(host_mem, N); + queue.submit([&](cl::sycl::handler &cgh) { + auto global = + buf.get_access(cgh); + sycl::accessor, 1, sycl::access::mode::read_write, + sycl::access::target::local> + local(N, cgh); + cgh.parallel_for( + cl::sycl::nd_range<1>(N, 32), [=](cl::sycl::nd_item<1> it) { + cl::sycl::ONEAPI::sub_group sg = it.get_sub_group(); + if (!it.get_local_id(0)) { + int end = it.get_global_id(0) + it.get_local_range()[0]; + for (int i = it.get_global_id(0); i < end; i++) { + local[i].s0() = 0; + local[i].s1() = i; + } + } + it.barrier(); + + int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) * + sg.get_max_local_range()[0]; + // Global address space + auto x = sg.load(&global[i]); + + // Local address space + auto y = sg.load(&local[i]); + + sg.store(&global[i], x + y); + }); + }); + } + + // Print results and tidy up + for (int i = 0; i < N; ++i) { + if (i != host_mem[i].s0() || i != host_mem[i].s1()) { + printf("Unexpected result [%02d,%02d] vs [%02d,%02d]\n", i, i, + host_mem[i].s0(), host_mem[i].s1()); + return 1; + } + } + printf("Success!\n"); + return 0; +} diff --git a/sycl/test/on-device/lit.cfg.py b/sycl/test/on-device/lit.cfg.py index e96b00c53a50a..a1af1000bacac 100644 --- a/sycl/test/on-device/lit.cfg.py +++ b/sycl/test/on-device/lit.cfg.py @@ -211,7 +211,7 @@ def getDeviceCount(device_type): if cuda: config.substitutions.append( ('%sycl_triple', "nvptx64-nvidia-cuda-sycldevice" ) ) else: - config.substitutions.append( ('%sycl_triple', "spir64-unknown-linux-sycldevice" ) ) + config.substitutions.append( ('%sycl_triple', "spir64-unknown-unknown-sycldevice" ) ) if "opencl-aot" in config.llvm_enable_projects: lit_config.note("Using opencl-aot version which is built as part of the project") From ee479d8b0b51d96f1f9a12151e1be2c7d7a4076a Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Fri, 26 Feb 2021 16:33:48 +0300 Subject: [PATCH 04/11] Fix bugs exposed in CI - implement fallback for CUDA BE; - fix assert function name on Windows. --- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 8 ++++++++ sycl/test/extensions/sub_group_as.cpp | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index 0ce464b6137d7..13d77e3e6c572 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -241,6 +241,9 @@ struct sub_group { std::is_same::type, T>::value, T> load(T *src) const { +#ifdef __NVPTX__ + return src[get_local_id()[0]]; +#else // __NVPTX__ auto l = __spirv_GenericCastToPtrExplicit_ToLocal( src, __spv::StorageClass::Workgroup); if (l) @@ -258,6 +261,7 @@ struct sub_group { // Fallback for other address spaces to be mapped to global return load(__spirv_PtrCastToGeneric(src)); +#endif // __NVPTX__ } #else //__SYCL_DEVICE_ONLY__ template T load(T *src) const { @@ -375,6 +379,9 @@ struct sub_group { std::is_same::type, T>::value> store(T *dst, const typename detail::remove_AS::type &x) const { +#ifdef __NVPTX__ + dst[get_local_id()[0]] = x; +#else // __NVPTX__ auto l = __spirv_GenericCastToPtrExplicit_ToLocal( dst, __spv::StorageClass::Workgroup); if (l) { @@ -396,6 +403,7 @@ struct sub_group { // Fallback for other address spaces to be mapped to global store(__spirv_PtrCastToGeneric(dst), x); +#endif // __NVPTX__ } #else //__SYCL_DEVICE_ONLY__ template void store(T *dst, const T &x) const { diff --git a/sycl/test/extensions/sub_group_as.cpp b/sycl/test/extensions/sub_group_as.cpp index 086762dc56c12..b7b226445e7c7 100644 --- a/sycl/test/extensions/sub_group_as.cpp +++ b/sycl/test/extensions/sub_group_as.cpp @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) { // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* // CHECK: call spir_func i8* @_Z42__spirv_GenericCastToPtrExplicit_ToPrivatePKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func void @__assert_fail + // CHECK: call spir_func void {{.*}}assert // CHECK: call spir_func i8 addrspace(4)* @_Z24__spirv_PtrCastToGenericPKv(i8 addrspace(4)* // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* // Global address space @@ -78,7 +78,7 @@ int main(int argc, char *argv[]) { // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* // CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)* // CHECK: call spir_func i8* @_Z42__spirv_GenericCastToPtrExplicit_ToPrivatePKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func void @__assert_fail + // CHECK: call spir_func void {{.*}}assert // CHECK: call spir_func i8 addrspace(4)* @_Z24__spirv_PtrCastToGenericPKv(i8 addrspace(4)* // CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)* sg.store(&global[i], x + y + z); From 31dc1d7b28639f0c71fa058e8cc9c6b98cf7012f Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Fri, 26 Feb 2021 17:40:15 +0300 Subject: [PATCH 05/11] Fix clang-format issue --- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index 13d77e3e6c572..427398e28f55f 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -243,7 +243,7 @@ struct sub_group { #ifdef __NVPTX__ return src[get_local_id()[0]]; -#else // __NVPTX__ +#else // __NVPTX__ auto l = __spirv_GenericCastToPtrExplicit_ToLocal( src, __spv::StorageClass::Workgroup); if (l) @@ -381,7 +381,7 @@ struct sub_group { #ifdef __NVPTX__ dst[get_local_id()[0]] = x; -#else // __NVPTX__ +#else // __NVPTX__ auto l = __spirv_GenericCastToPtrExplicit_ToLocal( dst, __spv::StorageClass::Workgroup); if (l) { From e0bb91ef4f61ccc1ca0efc9d1cb1578a00e232df Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Fri, 26 Feb 2021 18:19:41 +0300 Subject: [PATCH 06/11] Fix failures --- sycl/test/on-device/extensions/sub_group_as_private.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sycl/test/on-device/extensions/sub_group_as_private.cpp b/sycl/test/on-device/extensions/sub_group_as_private.cpp index 07e33b21a7b35..a852065dc9272 100644 --- a/sycl/test/on-device/extensions/sub_group_as_private.cpp +++ b/sycl/test/on-device/extensions/sub_group_as_private.cpp @@ -5,6 +5,10 @@ // RUNx: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER // RUNx: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER +// Assertion is not raised on CUDA because fallback mechanism is used. +// UNSUPORTED: cuda +// Assertion information is not displayed properly on windows +// XFAIL: windows #include #include #include From 1aeccc41bc41d1482a729f6e7ec800e082c7ae6e Mon Sep 17 00:00:00 2001 From: vladimirlaz Date: Fri, 26 Feb 2021 18:22:11 +0300 Subject: [PATCH 07/11] Apply suggestions from code review Co-authored-by: John Pennycook --- .../SYCL_INTEL_sub_group_algorithms.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc b/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc index d7063afc706e8..addbdf7d16162 100755 --- a/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc +++ b/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc @@ -135,7 +135,7 @@ The load and store sub-group functions enable developers to assert that all work |Function|Description |+template T load(sub_group sg, const T *src)+ -|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. The address space information is deduced autmaticaly. Only pointers to global and local address spaces are fully valid. Passing pointer in private address space will cause assertion. Other address spaces are casted to global with potentially undefined behaviour. +|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to private address space will cause an assertion. Other address spaces are cast to global with potentially undefined behavior. |+template T load(sub_group sg, const multi_ptr src)+ |Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. @@ -144,7 +144,7 @@ The load and store sub-group functions enable developers to assert that all work |Load contiguous data from _src_. Returns _N_ elements per work-item, corresponding to the _N_ memory locations at _src_ + +i+ * +get_max_local_range()+ + +get_local_id()+ for +i+ between 0 and _N_. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. |+template void store(sub_group sg, T *dst, const T& x)+ -|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. The address space information is deduced autmaticaly. Only pointers to global and local address spaces are fully valid. Passing pointer in private address space will cause assertion. Other address spaces are casted to global with potentially undefined behaviour. +|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to private address space will cause an assertion. Other address spaces are cast to global with potentially undefined behavior. |+template void store(sub_group sg, multi_ptr dst, const T& x)+ |Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. From 72c8a8ff41e8bda9daf87dfb8aebf808bc41dc36 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Fri, 26 Feb 2021 20:23:59 +0300 Subject: [PATCH 08/11] Fixed misprint --- sycl/test/on-device/extensions/sub_group_as_private.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/on-device/extensions/sub_group_as_private.cpp b/sycl/test/on-device/extensions/sub_group_as_private.cpp index a852065dc9272..809769263f51a 100644 --- a/sycl/test/on-device/extensions/sub_group_as_private.cpp +++ b/sycl/test/on-device/extensions/sub_group_as_private.cpp @@ -6,7 +6,7 @@ // RUNx: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER // Assertion is not raised on CUDA because fallback mechanism is used. -// UNSUPORTED: cuda +// UNSUPPORTED: cuda // Assertion information is not displayed properly on windows // XFAIL: windows #include From 828659c7ab0c79be9cd10e560f67f02a83356747 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 8 Mar 2021 16:03:53 +0300 Subject: [PATCH 09/11] Allow load/store only for pointers from local and global AS --- sycl/include/CL/__spirv/spirv_ops.hpp | 20 ----------- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 19 +++------- sycl/include/CL/sycl/access/access.hpp | 11 ------ sycl/test/extensions/sub_group_as.cpp | 36 +++++++++---------- .../extensions/sub_group_as_private.cpp | 2 +- 5 files changed, 23 insertions(+), 65 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 6506152039a42..8de17670f1e21 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -169,8 +169,6 @@ __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long) __SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long long) __SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Min) __SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max) -extern SYCL_EXTERNAL __attribute__((opencl_generic)) void * -__spirv_PtrCastToGeneric(const void *Ptr) noexcept; extern SYCL_EXTERNAL __attribute__((opencl_global)) void * __spirv_GenericCastToPtrExplicit_ToGlobal(const void *Ptr, @@ -180,16 +178,6 @@ extern SYCL_EXTERNAL __attribute__((opencl_local)) void * __spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr, __spv::StorageClass::Flag S) noexcept; -extern SYCL_EXTERNAL __attribute__((opencl_private)) void * -__spirv_GenericCastToPtrExplicit_ToPrivate( - const void *Ptr, __spv::StorageClass::Flag S) noexcept; - -template -extern __attribute__((opencl_generic)) dataT * -__spirv_PtrCastToGeneric(const void *Ptr) noexcept { - return (__attribute__((opencl_generic)) dataT *)__spirv_PtrCastToGeneric(Ptr); -} - template extern __attribute__((opencl_global)) dataT * __spirv_GenericCastToPtrExplicit_ToGlobal( @@ -206,14 +194,6 @@ __spirv_GenericCastToPtrExplicit_ToLocal(const void *Ptr, dataT *)__spirv_GenericCastToPtrExplicit_ToLocal(Ptr, S); } -template -extern __attribute__((opencl_private)) dataT * -__spirv_GenericCastToPtrExplicit_ToPrivate( - const void *Ptr, __spv::StorageClass::Flag S) noexcept { - return (__attribute__((opencl_private)) - dataT *)__spirv_GenericCastToPtrExplicit_ToPrivate(Ptr, S); -} - template __SYCL_CONVERGENT__ extern SYCL_EXTERNAL dataT __spirv_SubgroupShuffleINTEL(dataT Data, uint32_t InvocationId) noexcept; diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index 427398e28f55f..b51098b8c78e4 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -254,13 +254,8 @@ struct sub_group { if (g) return load(g); - auto p = __spirv_GenericCastToPtrExplicit_ToPrivate( - src, __spv::StorageClass::Function); - assert((p == nullptr) && - "Sub-group load() is not supported for private pointers."); - - // Fallback for other address spaces to be mapped to global - return load(__spirv_PtrCastToGeneric(src)); + assert(!"Sub-group load() is supported for local or global pointers only."); + return 0; #endif // __NVPTX__ } #else //__SYCL_DEVICE_ONLY__ @@ -396,13 +391,9 @@ struct sub_group { return; } - auto p = __spirv_GenericCastToPtrExplicit_ToPrivate( - dst, __spv::StorageClass::Function); - assert((p == nullptr) && - "Sub-group store() is not supported for private pointers."); - - // Fallback for other address spaces to be mapped to global - store(__spirv_PtrCastToGeneric(dst), x); + assert( + !"Sub-group store() is supported for local or global pointers only."); + return; #endif // __NVPTX__ } #else //__SYCL_DEVICE_ONLY__ diff --git a/sycl/include/CL/sycl/access/access.hpp b/sycl/include/CL/sycl/access/access.hpp index 23b351b23ba63..da8c1569b459b 100644 --- a/sycl/include/CL/sycl/access/access.hpp +++ b/sycl/include/CL/sycl/access/access.hpp @@ -115,7 +115,6 @@ constexpr bool modeWritesNewData(access::mode m) { #define __OPENCL_LOCAL_AS__ __attribute__((opencl_local)) #define __OPENCL_CONSTANT_AS__ __attribute__((opencl_constant)) #define __OPENCL_PRIVATE_AS__ __attribute__((opencl_private)) -#define __OPENCL_GENERIC_AS__ __attribute__((opencl_generic)) #else #define __OPENCL_GLOBAL_AS__ #define __OPENCL_GLOBAL_DEVICE_AS__ @@ -123,7 +122,6 @@ constexpr bool modeWritesNewData(access::mode m) { #define __OPENCL_LOCAL_AS__ #define __OPENCL_CONSTANT_AS__ #define __OPENCL_PRIVATE_AS__ -#define __OPENCL_GENERIC_AS__ #endif template struct TargetToAS { @@ -229,10 +227,6 @@ template struct remove_AS<__OPENCL_CONSTANT_AS__ T> { typedef T type; }; -template struct remove_AS<__OPENCL_GENERIC_AS__ T> { - typedef T type; -}; - template struct deduce_AS<__OPENCL_PRIVATE_AS__ T> { static const access::address_space value = access::address_space::private_space; @@ -246,11 +240,6 @@ template struct deduce_AS<__OPENCL_CONSTANT_AS__ T> { static const access::address_space value = access::address_space::constant_space; }; - -template struct deduce_AS<__OPENCL_GENERIC_AS__ T> { - static const access::address_space value = - access::address_space::global_space; -}; #endif #undef __OPENCL_GLOBAL_AS__ diff --git a/sycl/test/extensions/sub_group_as.cpp b/sycl/test/extensions/sub_group_as.cpp index b7b226445e7c7..ba395b852ba55 100644 --- a/sycl/test/extensions/sub_group_as.cpp +++ b/sycl/test/extensions/sub_group_as.cpp @@ -29,13 +29,6 @@ int main(int argc, char *argv[]) { sycl::access::target::local> local(N, cgh); - // Check that load/store functions for raw pointer was called - // - // CHECK: spir_kernel void @"_ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEE4test" - // CHECK-COUNT-3: call spir_func i32 - // {{.*}}loadIiEENSt9enable_ifIXsr3std7is_sameINS0_6detail9remove_ASIT_E4typeES7_EE5valueES7_E4typeEPS7_{{.*}}i32 - // addrspace(4)* - // CHECK: call spir_func void {{.*}}storeIiEENSt9enable_ifIXsr3std7is_sameINS0_6detail9remove_ASIT_E4typeES7_EE5valueEvE4typeEPS7_RKS9_({{.*}} addrspace(4)*{{.*}}, cgh.parallel_for( cl::sycl::nd_range<1>(N, 32), [=](cl::sycl::nd_item<1> it) { int v[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, @@ -50,37 +43,42 @@ int main(int argc, char *argv[]) { local[i] = i; } } + // CHECK: call void @_Z22__spirv_ControlBarrierjjj it.barrier(); int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) * sg.get_max_local_range()[0]; - // CHECK: spir_func i32{{.*}}loadIiEENSt9enable_ifIXsr3std7is_sameINS0_6detail9remove_ASIT_E4typeES7_EE5valueES7_E4typeEPS7_ + // load for global address space // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv + // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* - // CHECK: call spir_func i8* @_Z42__spirv_GenericCastToPtrExplicit_ToPrivatePKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* // CHECK: call spir_func void {{.*}}assert - // CHECK: call spir_func i8 addrspace(4)* @_Z24__spirv_PtrCastToGenericPKv(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* - // Global address space auto x = sg.load(&global[i]); - // Local address space + // load() for local address space + // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() + // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* + // CHECK: call spir_func void {{.*}}assert auto y = sg.load(&local[i]); + // load() for private address space + // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() + // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* + // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* + // CHECK: call spir_func void {{.*}}assert auto z = sg.load(v + i); - // CHECK: spir_func void {{.*}}storeIiEENSt9enable_ifIXsr3std7is_sameINS0_6detail9remove_ASIT_E4typeES7_EE5valueEvE4typeEPS7_RKS9_ + // store() for global address space // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv + // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() #7, !noalias !29 // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* // CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)* - // CHECK: call spir_func i8* @_Z42__spirv_GenericCastToPtrExplicit_ToPrivatePKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* // CHECK: call spir_func void {{.*}}assert - // CHECK: call spir_func i8 addrspace(4)* @_Z24__spirv_PtrCastToGenericPKv(i8 addrspace(4)* - // CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)* sg.store(&global[i], x + y + z); }); }); diff --git a/sycl/test/on-device/extensions/sub_group_as_private.cpp b/sycl/test/on-device/extensions/sub_group_as_private.cpp index 809769263f51a..a809cf53d81c6 100644 --- a/sycl/test/on-device/extensions/sub_group_as_private.cpp +++ b/sycl/test/on-device/extensions/sub_group_as_private.cpp @@ -62,7 +62,7 @@ int main(int argc, char *argv[]) { // Local address space auto y = sg.load(&local[i]); - // CHECK: Sub-group load() is not supported for private pointers. + // CHECK: Sub-group load() is supported for local or global pointers only auto z = sg.load(v + i); sg.store(&global[i], x + y); From 92ea392379c40242f114b6ede4ce845c14c6fb34 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 8 Mar 2021 20:40:26 +0300 Subject: [PATCH 10/11] Fix lit failure --- sycl/test/extensions/sub_group_as.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/extensions/sub_group_as.cpp b/sycl/test/extensions/sub_group_as.cpp index ba395b852ba55..1dfa35b5d8949 100644 --- a/sycl/test/extensions/sub_group_as.cpp +++ b/sycl/test/extensions/sub_group_as.cpp @@ -75,7 +75,7 @@ int main(int argc, char *argv[]) { // store() for global address space // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() #7, !noalias !29 + // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* // CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)* // CHECK: call spir_func void {{.*}}assert From 66551d9382913019cc7765b7d80153bb51c83f63 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Tue, 9 Mar 2021 16:57:12 +0300 Subject: [PATCH 11/11] Apply review comments --- .../SYCL_INTEL_sub_group_algorithms.asciidoc | 4 +- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 2 +- sycl/include/CL/sycl/access/access.hpp | 12 ++- .../test/basic_tests/address_space_traits.cpp | 38 +++++++++ sycl/test/extensions/sub_group_as.cpp | 36 ++++---- .../extensions/sub_group_as_private.cpp | 82 ------------------- 6 files changed, 67 insertions(+), 107 deletions(-) create mode 100644 sycl/test/basic_tests/address_space_traits.cpp delete mode 100644 sycl/test/on-device/extensions/sub_group_as_private.cpp diff --git a/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc b/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc index addbdf7d16162..e414430bda03f 100755 --- a/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc +++ b/sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc @@ -135,7 +135,7 @@ The load and store sub-group functions enable developers to assert that all work |Function|Description |+template T load(sub_group sg, const T *src)+ -|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to private address space will cause an assertion. Other address spaces are cast to global with potentially undefined behavior. +|Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to other address spaces will cause the run time assertion. |+template T load(sub_group sg, const multi_ptr src)+ |Load contiguous data from _src_. Returns one element per work-item, corresponding to the memory location at _src_ + +get_local_id()+. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. @@ -144,7 +144,7 @@ The load and store sub-group functions enable developers to assert that all work |Load contiguous data from _src_. Returns _N_ elements per work-item, corresponding to the _N_ memory locations at _src_ + +i+ * +get_max_local_range()+ + +get_local_id()+ for +i+ between 0 and _N_. The value of _src_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. |+template void store(sub_group sg, T *dst, const T& x)+ -|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to private address space will cause an assertion. Other address spaces are cast to global with potentially undefined behavior. +|Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. The address space information is deduced automatically. Only pointers to global and local address spaces are valid. Passing a pointer to other address spaces will cause the run time assertion. |+template void store(sub_group sg, multi_ptr dst, const T& x)+ |Store contiguous data to _dst_. The value of _x_ from each work-item is written to the memory location at _dst_ + +get_local_id()+. The value of _dst_ must be the same for all work-items in the sub-group. _Space_ must be +access::address_space::global_space+ or +access::address_space::local_space+. diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index b51098b8c78e4..121d94a5141a2 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -255,7 +255,7 @@ struct sub_group { return load(g); assert(!"Sub-group load() is supported for local or global pointers only."); - return 0; + return {}; #endif // __NVPTX__ } #else //__SYCL_DEVICE_ONLY__ diff --git a/sycl/include/CL/sycl/access/access.hpp b/sycl/include/CL/sycl/access/access.hpp index da8c1569b459b..8f4b9a92804e8 100644 --- a/sycl/include/CL/sycl/access/access.hpp +++ b/sycl/include/CL/sycl/access/access.hpp @@ -189,12 +189,12 @@ struct DecoratedType { }; template struct remove_AS { typedef T type; }; +#ifdef __SYCL_DEVICE_ONLY__ template struct deduce_AS { - static const access::address_space value = - access::address_space::global_space; + static_assert(!std::is_same::type, T>::value, + "Only types with address space attributes are supported"); }; -#ifdef __SYCL_DEVICE_ONLY__ template struct remove_AS<__OPENCL_GLOBAL_AS__ T> { typedef T type; }; #ifdef __ENABLE_USM_ADDR_SPACE__ @@ -227,6 +227,11 @@ template struct remove_AS<__OPENCL_CONSTANT_AS__ T> { typedef T type; }; +template struct deduce_AS<__OPENCL_GLOBAL_AS__ T> { + static const access::address_space value = + access::address_space::global_space; +}; + template struct deduce_AS<__OPENCL_PRIVATE_AS__ T> { static const access::address_space value = access::address_space::private_space; @@ -248,7 +253,6 @@ template struct deduce_AS<__OPENCL_CONSTANT_AS__ T> { #undef __OPENCL_LOCAL_AS__ #undef __OPENCL_CONSTANT_AS__ #undef __OPENCL_PRIVATE_AS__ -#undef __OPENCL_GENERIC_AS__ } // namespace detail } // namespace sycl diff --git a/sycl/test/basic_tests/address_space_traits.cpp b/sycl/test/basic_tests/address_space_traits.cpp new file mode 100644 index 0000000000000..92f345e87fd80 --- /dev/null +++ b/sycl/test/basic_tests/address_space_traits.cpp @@ -0,0 +1,38 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -D__ENABLE_USM_ADDR_SPACE__ -fsycl-targets=%sycl_triple %s -c + +#include +#include + +using namespace cl::sycl; +int main() { + + queue myQueue; + myQueue.submit([&](handler &cgh) { + cgh.single_task([=]() { + static_assert( + detail::deduce_AS<__attribute__((opencl_global)) int>::value == + access::address_space::global_space, + "Unexpected address space"); + static_assert( + detail::deduce_AS<__attribute__((opencl_local)) int>::value == + access::address_space::local_space, + "Unexpected address space"); + static_assert( + detail::deduce_AS<__attribute__((opencl_private)) int>::value == + access::address_space::private_space, + "Unexpected address space"); + static_assert( + detail::deduce_AS<__attribute__((opencl_constant)) int>::value == + access::address_space::constant_space, + "Unexpected address space"); + static_assert( + detail::deduce_AS<__attribute__((opencl_global_device)) int>::value == + access::address_space::global_device_space, + "Unexpected address space"); + static_assert( + detail::deduce_AS<__attribute__((opencl_global_host)) int>::value == + access::address_space::global_host_space, + "Unexpected address space"); + }); + }); +} diff --git a/sycl/test/extensions/sub_group_as.cpp b/sycl/test/extensions/sub_group_as.cpp index 1dfa35b5d8949..bb515a7f23042 100644 --- a/sycl/test/extensions/sub_group_as.cpp +++ b/sycl/test/extensions/sub_group_as.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -O3 -S -emit-llvm -x c++ %s -o - | FileCheck %s +// RUN: %clangxx -fsycl -fsycl-device-only -O3 -S -emit-llvm -x c++ %s -o - | FileCheck %s #include #include @@ -43,41 +43,41 @@ int main(int argc, char *argv[]) { local[i] = i; } } - // CHECK: call void @_Z22__spirv_ControlBarrierjjj + // CHECK: call void {{.*}}spirv_ControlBarrierjjj it.barrier(); int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) * sg.get_max_local_range()[0]; // load for global address space - // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() - // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* + // CHECK: call spir_func i8 addrspace(3)* {{.*}}spirv_GenericCastToPtrExplicit_ToLocal{{.*}}(i8 addrspace(4)* + // CHECK: {{.*}}SubgroupLocalInvocationId + // CHECK: call spir_func i8 addrspace(1)* {{.*}}spirv_GenericCastToPtrExplicit_ToGlobal{{.*}}(i8 addrspace(4)* + // CHECK: call spir_func i32 {{.*}}spirv_SubgroupBlockRead{{.*}}(i32 addrspace(1)* // CHECK: call spir_func void {{.*}}assert auto x = sg.load(&global[i]); // load() for local address space - // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() - // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* + // CHECK: call spir_func i8 addrspace(3)* {{.*}}spirv_GenericCastToPtrExplicit_ToLocal{{.*}}(i8 addrspace(4)* + // CHECK: {{.*}}SubgroupLocalInvocationId + // CHECK: call spir_func i8 addrspace(1)* {{.*}}spirv_GenericCastToPtrExplicit_ToGlobal{{.*}}(i8 addrspace(4)* + // CHECK: call spir_func i32 {{.*}}spirv_SubgroupBlockRead{{.*}}(i32 addrspace(1)* // CHECK: call spir_func void {{.*}}assert auto y = sg.load(&local[i]); // load() for private address space - // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() - // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(i32 addrspace(1)* + // CHECK: call spir_func i8 addrspace(3)* {{.*}}spirv_GenericCastToPtrExplicit_ToLocal{{.*}}(i8 addrspace(4)* + // CHECK: {{.*}}SubgroupLocalInvocationId + // CHECK: call spir_func i8 addrspace(1)* {{.*}}spirv_GenericCastToPtrExplicit_ToGlobal{{.*}}(i8 addrspace(4)* + // CHECK: call spir_func i32 {{.*}}spirv_SubgroupBlockRead{{.*}}(i32 addrspace(1)* // CHECK: call spir_func void {{.*}}assert auto z = sg.load(v + i); // store() for global address space - // CHECK: call spir_func i8 addrspace(3)* @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func i32 @_Z33__spirv_SubgroupLocalInvocationIdv() - // CHECK: call spir_func i8 addrspace(1)* @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvN5__spv12StorageClass4FlagE(i8 addrspace(4)* - // CHECK: call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(i32 addrspace(1)* + // CHECK: call spir_func i8 addrspace(3)* {{.*}}spirv_GenericCastToPtrExplicit_ToLocal{{.*}}(i8 addrspace(4)* + // CHECK: {{.*}}SubgroupLocalInvocationId + // CHECK: call spir_func i8 addrspace(1)* {{.*}}spirv_GenericCastToPtrExplicit_ToGlobal{{.*}}(i8 addrspace(4)* + // CHECK: call spir_func void {{.*}}spirv_SubgroupBlockWriteINTEL{{.*}}(i32 addrspace(1)* // CHECK: call spir_func void {{.*}}assert sg.store(&global[i], x + y + z); }); diff --git a/sycl/test/on-device/extensions/sub_group_as_private.cpp b/sycl/test/on-device/extensions/sub_group_as_private.cpp deleted file mode 100644 index a809cf53d81c6..0000000000000 --- a/sycl/test/on-device/extensions/sub_group_as_private.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// Sub-groups are not suported on Host -// RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER -// Execution on CPU and FPGA takes 100000 times longer -// RUNx: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER -// RUNx: %ACC_RUN_PLACEHOLDER %t.out %ACC_CHECK_PLACEHOLDER - -// Assertion is not raised on CUDA because fallback mechanism is used. -// UNSUPPORTED: cuda -// Assertion information is not displayed properly on windows -// XFAIL: windows -#include -#include -#include -#include -#include - -int main(int argc, char *argv[]) { - cl::sycl::queue queue; - printf("Device Name = %s\n", - queue.get_device().get_info().c_str()); - - // Initialize some host memory - constexpr int N = 64; - int host_mem[N]; - for (int i = 0; i < N; ++i) { - host_mem[i] = i * 100; - } - - // Use the device to transform each value - { - cl::sycl::buffer buf(host_mem, N); - queue.submit([&](cl::sycl::handler &cgh) { - auto global = - buf.get_access(cgh); - sycl::accessor - local(N, cgh); - - cgh.parallel_for( - cl::sycl::nd_range<1>(N, 32), [=](cl::sycl::nd_item<1> it) { - int v[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, - 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; - cl::sycl::ONEAPI::sub_group sg = it.get_sub_group(); - if (!it.get_local_id(0)) { - int end = it.get_global_id(0) + it.get_local_range()[0]; - for (int i = it.get_global_id(0); i < end; i++) { - local[i] = i; - } - } - it.barrier(); - - int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) * - sg.get_max_local_range()[0]; - // Global address space - auto x = sg.load(&global[i]); - - // Local address space - auto y = sg.load(&local[i]); - - // CHECK: Sub-group load() is supported for local or global pointers only - auto z = sg.load(v + i); - - sg.store(&global[i], x + y); - }); - }); - } - - // Print results and tidy up - for (int i = 0; i < N; ++i) { - if (i * 101 != host_mem[i]) { - printf("Unexpected result %04d vs %04d\n", i * 101, host_mem[i]); - return 1; - } - } - printf("Success!\n"); - return 0; -}