diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h
index 3205582e87b99..5a789441b9d35 100644
--- a/offload/DeviceRTL/include/Synchronization.h
+++ b/offload/DeviceRTL/include/Synchronization.h
@@ -28,23 +28,17 @@ enum OrderingTy {
   seq_cst = __ATOMIC_SEQ_CST,
 };
 
-enum ScopeTy {
+enum MemScopeTy {
   system = __MEMORY_SCOPE_SYSTEM,
-  device_ = __MEMORY_SCOPE_DEVICE,
+  device = __MEMORY_SCOPE_DEVICE,
   workgroup = __MEMORY_SCOPE_WRKGRP,
   wavefront = __MEMORY_SCOPE_WVFRNT,
   single = __MEMORY_SCOPE_SINGLE,
 };
 
-enum MemScopeTy {
-  all,    // All threads on all devices
-  device, // All threads on the device
-  cgroup  // All threads in the contention group, e.g. the team
-};
-
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
 uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
-             MemScopeTy MemScope = MemScopeTy::all);
+             MemScopeTy MemScope = MemScopeTy::device);
 
 /// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
 /// result is stored in \p *Addr;
@@ -52,120 +46,127 @@ uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc,
-         atomic::OrderingTy OrderingFail) {
+         atomic::OrderingTy OrderingFail,
+         MemScopeTy MemScope = MemScopeTy::device) {
   return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
-                                          OrderingSucc, OrderingFail,
-                                          __MEMORY_SCOPE_DEVICE);
+                                          OrderingSucc, OrderingFail, MemScope);
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V add(Ty *Address, V Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_add(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
+V add(Ty *Address, V Val, atomic::OrderingTy Ordering,
+      MemScopeTy MemScope = MemScopeTy::device) {
+  return __scoped_atomic_fetch_add(Address, Val, Ordering, MemScope);
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V load(Ty *Address, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_load_n(Address, Ordering, __MEMORY_SCOPE_DEVICE);
+V load(Ty *Address, atomic::OrderingTy Ordering,
+       MemScopeTy MemScope = MemScopeTy::device) {
+  return __scoped_atomic_load_n(Address, Ordering, MemScope);
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-void store(Ty *Address, V Val, atomic::OrderingTy Ordering) {
-  __scoped_atomic_store_n(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE);
+void store(Ty *Address, V Val, atomic::OrderingTy Ordering,
+           MemScopeTy MemScope = MemScopeTy::device) {
+  __scoped_atomic_store_n(Address, Val, Ordering, MemScope);
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V mul(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+V mul(Ty *Address, V Val, atomic::OrderingTy Ordering,
+      MemScopeTy MemScope = MemScopeTy::device) {
   Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
   bool Success;
   do {
     TypedCurrentVal = atomic::load(Address, Ordering);
     TypedNewVal = TypedCurrentVal * Val;
     Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
-                          atomic::relaxed);
+                          atomic::relaxed, MemScope);
   } while (!Success);
   return TypedResultVal;
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<!utils::is_floating_point_v<V>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_max(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
+max(Ty *Address, V Val, atomic::OrderingTy Ordering,
+    MemScopeTy MemScope = MemScopeTy::device) {
+  return __scoped_atomic_fetch_max(Address, Val, Ordering, MemScope);
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<utils::is_same_v<V, float>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+max(Ty *Address, V Val, atomic::OrderingTy Ordering,
+    MemScopeTy MemScope = MemScopeTy::device) {
   if (Val >= 0)
-    return utils::bitCast<float>(
-        max((int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering));
-  return utils::bitCast<float>(
-      min((uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering));
+    return utils::bitCast<float>(max(
+        (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
+  return utils::bitCast<float>(min(
+      (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<utils::is_same_v<V, double>, V>
-max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+max(Ty *Address, V Val, atomic::OrderingTy Ordering,
+    MemScopeTy MemScope = MemScopeTy::device) {
   if (Val >= 0)
-    return utils::bitCast<double>(
-        max((int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering));
-  return utils::bitCast<double>(
-      min((uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering));
+    return utils::bitCast<double>(max(
+        (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
+  return utils::bitCast<double>(min(
+      (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<!utils::is_floating_point_v<V>, V>
-min(Ty *Address, V Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_min(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
+min(Ty *Address, V Val, atomic::OrderingTy Ordering,
+    MemScopeTy MemScope = MemScopeTy::device) {
+  return __scoped_atomic_fetch_min(Address, Val, Ordering, MemScope);
 }
 
 // TODO: Implement this with __atomic_fetch_max and remove the duplication.
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<utils::is_same_v<V, float>, V>
-min(Ty *Address, V Val, atomic::OrderingTy Ordering) {
+min(Ty *Address, V Val, atomic::OrderingTy Ordering,
+    MemScopeTy MemScope = MemScopeTy::device) {
   if (Val >= 0)
-    return utils::bitCast<float>(
-        min((int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering));
-  return utils::bitCast<float>(
-      max((uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering));
+    return utils::bitCast<float>(min(
+        (int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering, MemScope));
+  return utils::bitCast<float>(max(
+      (uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering, MemScope));
 }
 
 // TODO: Implement this with __atomic_fetch_max and remove the duplication.
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 utils::enable_if_t<utils::is_same_v<V, double>, V>
-min(Ty *Address, utils::remove_addrspace_t<Ty> Val,
-    atomic::OrderingTy Ordering) {
+min(Ty *Address, utils::remove_addrspace_t<Ty> Val, atomic::OrderingTy Ordering,
+    MemScopeTy MemScope = MemScopeTy::device) {
   if (Val >= 0)
-    return utils::bitCast<double>(
-        min((int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering));
-  return utils::bitCast<double>(
-      max((uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering));
+    return utils::bitCast<double>(min(
+        (int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering, MemScope));
+  return utils::bitCast<double>(max(
+      (uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering, MemScope));
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_or(Address, Val, Ordering,
-                                  __MEMORY_SCOPE_DEVICE);
+V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering,
+         MemScopeTy MemScope = MemScopeTy::device) {
+  return __scoped_atomic_fetch_or(Address, Val, Ordering, MemScope);
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_and(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
+V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering,
+          MemScopeTy MemScope = MemScopeTy::device) {
+  return __scoped_atomic_fetch_and(Address, Val, Ordering, MemScope);
 }
 
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
-V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering) {
-  return __scoped_atomic_fetch_xor(Address, Val, Ordering,
-                                   __MEMORY_SCOPE_DEVICE);
+V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering,
+          MemScopeTy MemScope = MemScopeTy::device) {
+  return __scoped_atomic_fetch_xor(Address, Val, Ordering, MemScope);
 }
 
-static inline uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
-                                      atomic::OrderingTy Ordering) {
+static inline uint32_t
+atomicExchange(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+               MemScopeTy MemScope = MemScopeTy::device) {
   uint32_t R;
-  __scoped_atomic_exchange(Address, &Val, &R, Ordering, __MEMORY_SCOPE_DEVICE);
+  __scoped_atomic_exchange(Address, &Val, &R, Ordering, MemScope);
   return R;
 }
diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp
index 72a97ae3fcfb4..e0e277928fa91 100644
--- a/offload/DeviceRTL/src/Synchronization.cpp
+++ b/offload/DeviceRTL/src/Synchronization.cpp
@@ -64,12 +64,16 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
 
 #define ScopeSwitch(ORDER)                                                     \
   switch (MemScope) {                                                          \
-  case atomic::MemScopeTy::all:                                                \
+  case atomic::MemScopeTy::system:                                             \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "");                     \
   case atomic::MemScopeTy::device:                                             \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent");                \
-  case atomic::MemScopeTy::cgroup:                                             \
+  case atomic::MemScopeTy::workgroup:                                          \
    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup");            \
+  case atomic::MemScopeTy::wavefront:                                          \
+    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "wavefront");            \
+  case atomic::MemScopeTy::single:                                             \
+    return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "singlethread");         \
   }
 
 #define Case(ORDER)                                                            \
@@ -148,7 +152,7 @@ void fenceTeam(atomic::OrderingTy Ordering) {
 }
 
 void fenceKernel(atomic::OrderingTy Ordering) {
-  return __scoped_atomic_thread_fence(Ordering, atomic::device_);
+  return __scoped_atomic_thread_fence(Ordering, atomic::device);
 }
 
 void fenceSystem(atomic::OrderingTy Ordering) {
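
For context, this is roughly what the new surface looks like from the caller side. A minimal sketch, not code from this patch: it assumes a device-compiled translation unit that includes `Synchronization.h`, and the helper names (`arriveInTeam`, `setDeviceFlag`) are illustrative, not part of the runtime.

```cpp
#include "Synchronization.h"

using namespace ompx; // DeviceRTL interfaces live under the ompx namespace.

// Team-local arrival counter: all participants are in the same workgroup, so
// the narrower MemScopeTy::workgroup is sufficient and may be cheaper than
// the previously hard-wired __MEMORY_SCOPE_DEVICE.
uint32_t arriveInTeam(uint32_t *TeamCounter) {
  return atomic::add(TeamCounter, uint32_t(1), atomic::seq_cst,
                     atomic::MemScopeTy::workgroup);
}

// Device-wide flag: omitting the new trailing argument picks the default,
// MemScopeTy::device, so existing call sites keep their old behavior.
void setDeviceFlag(uint32_t *Flag) {
  atomic::store(Flag, uint32_t(1), atomic::seq_cst);
}
```

Making the scope a defaulted trailing parameter, rather than a parallel overload set, is what lets the rest of the runtime compile unchanged while new callers opt into narrower scopes.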