
Commit f65ec5a

AMDGPU: Expand flat atomics that may access private memory
If the runtime flat address resolves to a scratch address, 64-bit atomics
do not work correctly. Insert a runtime address space check (which is
quite likely to be uniform) and select between the non-atomic and real
atomic cases.

Consider noalias.addrspace metadata and avoid this expansion when
possible (we also need to consider it to avoid infinitely expanding
after adding the predication code).
1 parent: 62ab68e
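A minimal sketch (mine, not taken from the commit) of the IR shape the private-check-only expansion produces for a 64-bit flat atomicrmw; value and block names are illustrative, and the real expansion in emitExpandAtomicRMW keeps separate atomicrmw.phi and atomicrmw.end blocks rather than the single merge block shown here:

; Illustrative input: a 64-bit flat atomicrmw with no noalias.addrspace info.
define i64 @example(ptr %p, i64 %v) {
  %r = atomicrmw add ptr %p, i64 %v seq_cst
  ret i64 %r
}

; Approximate result of the private-check-only expansion:
define i64 @example.expanded(ptr %p, i64 %v) {
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %p)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                      ; scratch: plain load/op/store
  %p.as5 = addrspacecast ptr %p to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %p.as5
  %new = add i64 %loaded.private, %v
  store i64 %new, ptr addrspace(5) %p.as5
  br label %atomicrmw.end

atomicrmw.global:                       ; keep the flat atomic, tagged so it
                                        ; is not expanded a second time
  %loaded.global = atomicrmw add ptr %p, i64 %v seq_cst, !noalias.addrspace !0
  br label %atomicrmw.end

atomicrmw.end:
  %res = phi i64 [ %loaded.private, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]
  ret i64 %res
}

declare i1 @llvm.amdgcn.is.private(ptr nocapture)

!0 = !{i32 5, i32 6} ; pointer never resolves to address space 5 (private)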

22 files changed: +16,753 −4,863 lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 106 additions & 33 deletions
@@ -39,6 +39,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16236,12 +16237,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
              : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  // FIXME: Can this actually fail? Why is this optional?
+  if (std::optional<ConstantRange> CR =
+          getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
+    return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
+  }
+
+  llvm_unreachable("Why is getConstantRangeFromMetadata optional");
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will
+  // silently be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the
+  // expansion, which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
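For reference, an atomicrmw already annotated so its pointer cannot be private is left alone by this hook, since flatInstrMayAccessPrivate returns false for it. A hand-written example (illustrative, not from the patch):

  %r = atomicrmw add ptr %p, i64 %v seq_cst, !noalias.addrspace !0

  !0 = !{i32 5, i32 6} ; pointer known not to be in address spaces [5, 6)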
@@ -16640,20 +16668,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private
+  // case, where we only insert a check for private and still use the flat
+  // instruction for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16693,6 +16735,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //   [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a
+  // simpler version that only inserts the private check, and uses the flat
+  // operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
@@ -16704,9 +16750,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16719,23 +16771,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16752,23 +16807,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
 
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for
+  // private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already, make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                              RangeNotPrivate);
+  }
+
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);
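The range built by MDB.createRange above is the half-open interval [PRIVATE_ADDRESS, PRIVATE_ADDRESS + 1), i.e. [5, 6) on AMDGPU, which prints as !{i32 5, i32 6}; this is the same !noalias.addrspace annotation the updated tests below carry, and it is what prevents the cloned atomic from being expanded a second time.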

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 3 additions & 2 deletions
@@ -1343,7 +1343,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1494,7 +1494,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2230,3 +2230,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}
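The f64 tests here gain !noalias.addrspace !1 so the pointer is known not to resolve to private memory; without the annotation, the new 64-bit flat atomic expansion would rewrite these functions and invalidate the checked output. The !{i32 5, i32 6} payload is the half-open range [5, 6), excluding exactly address space 5. The atomicrmw_fmin.ll update below is identical.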

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 3 additions & 2 deletions
@@ -1343,7 +1343,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1494,7 +1494,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2230,3 +2230,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll

Lines changed: 11 additions & 8 deletions
@@ -1657,7 +1657,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1759,7 +1759,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1832,7 +1832,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1911,7 +1911,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1990,7 +1990,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -2118,7 +2118,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %out.gep = getelementptr i64, ptr %out, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out.gep, align 4
   ret void
 }
@@ -2217,7 +2217,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -3340,7 +3340,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -3349,5 +3349,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 attributes #0 = { nounwind speculatable willreturn memory(none) }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind memory(none) }
+
+!0 = !{i32 5, i32 6}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
