
Commit 33f17da

AMDGPU: Expand flat atomics that may access private memory
If the runtime flat address resolves to a scratch address, 64-bit atomics do not work correctly. Insert a runtime address space check (which is quite likely to be uniform) and select between the non-atomic and real atomic cases.

Consider noalias.addrspace metadata and avoid this expansion when possible (we also need to consider it to avoid infinitely expanding after adding the predication code).
1 parent 962b996 commit 33f17da

22 files changed: +28722, -4973 lines
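
For orientation, here is a minimal sketch (not part of the commit) of the private-only expansion introduced here, written as standalone LLVM IR. The block names and the is.private intrinsic follow what emitExpandAtomicRMW creates; the function itself is hypothetical and the global path is simplified, since the real code moves the original atomicrmw, with its original operation and ordering, into atomicrmw.global.

; Illustrative sketch only -- not taken from the commit.
declare i1 @llvm.amdgcn.is.private(ptr)

define i64 @example_expansion(ptr %ptr, i64 %val) {
entry:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  ; Scratch memory is strictly per-thread, so a plain load/op/store is enough.
  %cast.private = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private
  %new = add i64 %loaded.private, %val
  store i64 %new, ptr addrspace(5) %cast.private
  br label %atomicrmw.phi

atomicrmw.global:
  ; The original flat atomic, annotated so it is not expanded a second time.
  %loaded.global = atomicrmw add ptr %ptr, i64 %val seq_cst, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi i64 [ %loaded.private, %atomicrmw.private ],
                    [ %loaded.global, %atomicrmw.global ]
  ret i64 %loaded
}

; Address spaces in [5, 6), i.e. private, are excluded.
!0 = !{i32 5, i32 6}

Because private (scratch) memory is only visible to a single thread, the private path can legally use non-atomic instructions; the check only needs to separate private from everything else, and it is quite likely to be uniform across a wavefront.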

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 112 additions & 33 deletions
@@ -39,6 +39,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16340,12 +16341,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
              : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
+       ++I) {
+    auto *Low = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 0));
+    auto *High = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+
+    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
+        High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
+      return true;
+  }
+
+  return false;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will silently
+  // be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the expansion,
+  // which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
@@ -16744,20 +16778,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16797,6 +16845,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //    [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a simpler
+  // version that only inserts the private check, and uses the flat operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
@@ -16808,9 +16860,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16823,23 +16881,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16856,23 +16917,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
 
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already, make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
                              RangeNotPrivate);
+  }
+
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);
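
As a side note on the metadata consulted by flatInstrMayAccessPrivate above: each operand pair !{lo, hi} of !noalias.addrspace declares that the accessed pointer cannot be in any address space in the half-open range [lo, hi), so !{i32 5, i32 6} asserts the pointer never refers to private (address space 5) and the 64-bit expansion is skipped entirely. A minimal sketch, with a hypothetical function name, of the annotation the updated tests below rely on:

define i64 @no_private_expansion(ptr %ptr) {
  ; With the annotation the flat atomic is left alone; without it, this 64-bit
  ; flat atomicrmw would now be expanded around an is.private check.
  %result = atomicrmw add ptr %ptr, i64 1 seq_cst, !noalias.addrspace !0
  ret i64 %result
}

!0 = !{i32 5, i32 6}

The expansion re-attaches the same [5, 6) range to the atomic it re-emits (see the MDBuilder::createRange call above), which is what keeps the instruction from being expanded again on a later legalization round.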

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 3 additions & 2 deletions
@@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 3 additions & 2 deletions
@@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll

Lines changed: 11 additions & 8 deletions
@@ -1657,7 +1657,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1759,7 +1759,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1832,7 +1832,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1911,7 +1911,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1990,7 +1990,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -2118,7 +2118,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %out.gep = getelementptr i64, ptr %out, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out.gep, align 4
   ret void
 }
@@ -2217,7 +2217,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -3340,7 +3340,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -3349,5 +3349,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 attributes #0 = { nounwind speculatable willreturn memory(none) }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind memory(none) }
+
+!0 = !{i32 5, i32 6}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
