Commit 1fb2aae

AMDGPU: Expand flat atomics that may access private memory
If the runtime flat address resolves to a scratch address, 64-bit atomics do not work correctly. Insert a runtime address space check (which is quite likely to be uniform) and select between the non-atomic and real atomic cases.

Consider noalias.addrspace metadata and avoid this expansion when possible (we also need to consider it to avoid infinitely expanding after adding the predication code).
1 parent: f0cf74a

22 files changed: +16753, -4863 lines
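
To make the change concrete, here is a minimal sketch of the IR shape the private-only expansion gives a 64-bit flat atomic (the value names, the add operation, and the seq_cst ordering are illustrative assumptions, not taken from this commit's tests):

  ; Before: the flat pointer may dynamically resolve to scratch.
  %result = atomicrmw add ptr %ptr, i64 %val seq_cst

  ; After (sketch): predicate on the private check and fall back to an
  ; ordinary load/op/store for scratch; the surviving flat atomic is
  ; tagged with !noalias.addrspace so it is not expanded again.
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  %cast.priv = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded.priv = load i64, ptr addrspace(5) %cast.priv, align 8
  %new = add i64 %loaded.priv, %val
  store i64 %new, ptr addrspace(5) %cast.priv, align 8
  br label %atomicrmw.phi

atomicrmw.global:
  %loaded.global = atomicrmw add ptr %ptr, i64 %val seq_cst, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %res = phi i64 [ %loaded.priv, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]

!0 = !{i32 5, i32 6} ; i.e. the access is not in address space 5 (private)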

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 106 additions & 33 deletions
@@ -39,6 +39,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16243,12 +16244,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
              : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  // FIXME: Can this actually fail? Why is this optional?
+  if (std::optional<ConstantRange> CR =
+          getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
+    return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
+  }
+
+  llvm_unreachable("Why is getConstantRangeFromMetadata optional");
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will silently
+  // be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the expansion,
+  // which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
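
For reference, !noalias.addrspace holds pairs of i32 constants forming half-open [lo, hi) ranges of address spaces the access cannot be in. A minimal IR example (hypothetical names, not from this commit) for which flatInstrMayAccessPrivate returns false, since the range [5, 6) covers AMDGPUAS::PRIVATE_ADDRESS:

  %rmw = atomicrmw add ptr %p, i64 1 seq_cst, align 8, !noalias.addrspace !0

  !0 = !{i32 5, i32 6}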
@@ -16647,20 +16675,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16700,6 +16742,10 @@
   //
   // atomicrmw.end:
   //    [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a simpler
+  // version that only inserts the private check, and uses the flat operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
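
Two hedged examples of how the cases split (pointer and value names are hypothetical):

  ; FullFlatEmulation: f32 fadd on a subtarget with atomic fadd instructions;
  ; expands into separate shared (LDS), private (scratch), and global arms.
  %f = atomicrmw fadd ptr %p, float %x seq_cst

  ; Private-only case: a 64-bit atomic that may touch scratch; only the
  ; is.private check is inserted, and a flat atomic serves shared and global.
  %r = atomicrmw add ptr %p, i64 %v seq_cst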
@@ -16711,9 +16757,15 @@
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16726,23 +16778,26 @@
 
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
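
For the FullFlatEmulation path, the front half of the emitted control flow looks roughly like the following (a sketch consistent with the expansion comment earlier in this function; value names are illustrative):

  %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private

atomicrmw.shared:
  %cast.lds = addrspacecast ptr %addr to ptr addrspace(3)
  %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.lds, float %val seq_cst
  br label %atomicrmw.phi

atomicrmw.check.private:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global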
@@ -16759,23 +16814,41 @@
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
 
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already, make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                              RangeNotPrivate);
+  }
+
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);
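
The metadata emitted in the !FullFlatEmulation branch is what breaks the expansion loop: the retained flat atomic comes back through shouldExpandAtomicRMWInIR, but flatInstrMayAccessPrivate now sees the range and returns false. In IR terms (a sketch, names illustrative):

  %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, !noalias.addrspace !0

  !0 = !{i32 5, i32 6} ; [PRIVATE_ADDRESS, PRIVATE_ADDRESS + 1)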

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 3 additions & 2 deletions
@@ -1343,7 +1343,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }

@@ -1494,7 +1494,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }

@@ -2230,3 +2230,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 3 additions & 2 deletions
@@ -1343,7 +1343,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }

@@ -1494,7 +1494,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }

@@ -2230,3 +2230,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll

Lines changed: 11 additions & 8 deletions
@@ -1657,7 +1657,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT: s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }

@@ -1759,7 +1759,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT: s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }

@@ -1832,7 +1832,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1911,7 +1911,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1990,7 +1990,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -2118,7 +2118,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %out.gep = getelementptr i64, ptr %out, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out.gep, align 4
   ret void
 }

@@ -2217,7 +2217,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -3340,7 +3340,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void

@@ -3349,5 +3349,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 attributes #0 = { nounwind speculatable willreturn memory(none) }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind memory(none) }
+
+!0 = !{i32 5, i32 6}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
