
Commit 1d03708

AMDGPU: Expand flat atomics that may access private memory (#109407)
If the runtime flat address resolves to a scratch address, 64-bit atomics do not work correctly. Insert a runtime address space check (which is quite likely to be uniform) and select between the non-atomic and real atomic cases. Consider noalias.addrspace metadata and avoid this expansion when possible (we also need to consider it to avoid infinitely expanding after adding the predication code).
1 parent d6a0602 commit 1d03708

22 files changed: +28722 / -4973 lines
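
Editorial note: the sketch below is not part of the commit; value and block names are illustrative. It shows roughly what the expansion described above produces for a 64-bit flat atomicrmw that may address private (scratch) memory: the flat pointer is tested with llvm.amdgcn.is.private, the private side is lowered to an ordinary load/op/store through an addrspace(5) pointer, and the other side keeps a real flat atomic tagged with !noalias.addrspace so it is not expanded again.

define i64 @expanded_flat_atomic_add_sketch(ptr %p, i64 %v) {
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %p)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  ; Scratch pointer: a plain, non-atomic read-modify-write is sufficient.
  %p.scratch = addrspacecast ptr %p to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %p.scratch, align 8
  %new = add i64 %loaded.private, %v
  store i64 %new, ptr addrspace(5) %p.scratch, align 8
  br label %atomicrmw.phi

atomicrmw.global:
  ; Anything else: keep the real flat atomic, marked as never-private so the
  ; backend does not try to expand it a second time.
  %loaded.global = atomicrmw add ptr %p, i64 %v seq_cst, align 8, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %result = phi i64 [ %loaded.private, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]
  ret i64 %result
}

declare i1 @llvm.amdgcn.is.private(ptr)

; Address spaces in [5, 6), i.e. exactly the private address space, are excluded.
!0 = !{i32 5, i32 6}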

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 112 additions & 33 deletions
@@ -39,6 +39,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16310,12 +16311,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
              : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
+       ++I) {
+    auto *Low = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 0));
+    auto *High = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+
+    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
+        High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
+      return true;
+  }
+
+  return false;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will silently
+  // be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the expansion,
+  // which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
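
Editorial note on the helper above (the example below is illustrative, not part of the patch): !noalias.addrspace metadata carries (low, high) pairs of address spaces the operation is known not to access, so !{i32 5, i32 6} asserts the pointer never refers to AMDGPU's private address space 5. For such an atomic, flatInstrMayAccessPrivate() returns false and the native flat instruction is kept; a 64-bit flat atomic without the annotation takes the Expand path above.

define void @noalias_addrspace_examples(ptr %p, ptr %q) {
  ; Known never to be private (range [5, 6) excluded): kept as a native flat atomic.
  %a = atomicrmw add ptr %p, i64 1 seq_cst, align 8, !noalias.addrspace !0
  ; No annotation: may be scratch, so it is predicated and expanded as in the earlier sketch.
  %b = atomicrmw add ptr %q, i64 1 seq_cst, align 8
  ret void
}

!0 = !{i32 5, i32 6}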
@@ -16716,20 +16750,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16769,6 +16817,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //   [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a simpler
+  // version that only inserts the private check, and uses the flat operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
@@ -16780,9 +16832,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16795,23 +16853,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16828,23 +16889,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
 
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already, make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                              RangeNotPrivate);
+  }
+
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);
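
Editorial note (illustrative sketch, not from the patch): when FullFlatEmulation is true, i.e. an f32 fadd on a subtarget with atomic fadd instructions, the routine above emits the full three-way dispatch shown below, specializing the LDS, scratch, and global cases; when it is false, only the is.private branch from the earlier sketch is emitted and the flat instruction itself is kept for the shared/global side.

define float @expanded_flat_fadd_sketch(ptr %addr, float %val) {
  %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private

atomicrmw.shared:
  %addr.lds = addrspacecast ptr %addr to ptr addrspace(3)
  %loaded.shared = atomicrmw fadd ptr addrspace(3) %addr.lds, float %val seq_cst, align 4
  br label %atomicrmw.phi

atomicrmw.check.private:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  %addr.scratch = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load float, ptr addrspace(5) %addr.scratch, align 4
  %sum = fadd float %loaded.private, %val
  store float %sum, ptr addrspace(5) %addr.scratch, align 4
  br label %atomicrmw.phi

atomicrmw.global:
  %addr.global = addrspacecast ptr %addr to ptr addrspace(1)
  %loaded.global = atomicrmw fadd ptr addrspace(1) %addr.global, float %val seq_cst, align 4
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi float [ %loaded.shared, %atomicrmw.shared ],
                      [ %loaded.private, %atomicrmw.private ],
                      [ %loaded.global, %atomicrmw.global ]
  ret float %loaded
}

declare i1 @llvm.amdgcn.is.shared(ptr)
declare i1 @llvm.amdgcn.is.private(ptr)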

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 3 additions & 2 deletions
@@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 3 additions & 2 deletions
@@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll

Lines changed: 11 additions & 8 deletions
@@ -1645,7 +1645,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT: s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1747,7 +1747,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT: s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1820,7 +1820,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1899,7 +1899,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1978,7 +1978,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -2106,7 +2106,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %out.gep = getelementptr i64, ptr %out, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out.gep, align 4
   ret void
 }
@@ -2205,7 +2205,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -3312,7 +3312,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -3321,5 +3321,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 attributes #0 = { nounwind speculatable willreturn memory(none) }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind memory(none) }
+
+!0 = !{i32 5, i32 6}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
