
Commit d5a1f00

AMDGPU: Expand flat atomics that may access private memory
If the runtime flat address resolves to a scratch address, 64-bit atomics do not work correctly. Insert a runtime address space check (which is quite likely to be uniform) and select between the non-atomic and real atomic cases. Consider noalias.addrspace metadata and avoid this expansion when possible (we also need to consider it to avoid infinitely expanding after adding the predication code).
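In outline, the expansion guards the flat atomic with a runtime is.private check: if the pointer turns out to be scratch, the value is updated with an ordinary non-atomic load/op/store through an addrspace(5) pointer; otherwise the original flat atomic is kept, now tagged with !noalias.addrspace so later legalization does not expand it again. A hand-written sketch of the resulting IR for a 64-bit add (illustrative only, not the compiler's verbatim output; function and block names are made up):

declare i1 @llvm.amdgcn.is.private(ptr)

define i64 @expanded_flat_atomicrmw_add(ptr %addr, i64 %val) {
entry:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                      ; scratch: plain load/op/store
  %scratch = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %scratch
  %sum = add i64 %loaded.private, %val
  store i64 %sum, ptr addrspace(5) %scratch
  br label %atomicrmw.phi

atomicrmw.global:                       ; everything else: keep the flat atomic
  %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi i64 [ %loaded.private, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]
  ret i64 %loaded
}

!0 = !{i32 5, i32 6}                    ; the atomic cannot access address space 5 (private)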
1 parent 56dcfbe · commit d5a1f00

22 files changed: +28722, -4973 lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 112 additions & 33 deletions
@@ -39,6 +39,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16308,12 +16309,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
       : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
+       ++I) {
+    auto *Low = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 0));
+    auto *High = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+
+    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
+        High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
+      return true;
+  }
+
+  return false;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will silently
+  // be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the expansion,
+  // which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
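As a quick illustration of the new bail-out (hypothetical IR, not taken from this patch's tests): the first atomicrmw below is a 64-bit flat operation with no metadata, so shouldExpandAtomicRMWInIR now returns Expand for it, while the second already carries !noalias.addrspace excluding address space 5 and is left to the normal lowering.

define i64 @shouldexpand_example(ptr %p, i64 %v) {
  ; No metadata: may target scratch, so the runtime private check is inserted.
  %r0 = atomicrmw add ptr %p, i64 %v seq_cst

  ; Address spaces [5, 6) are excluded: no expansion is needed.
  %r1 = atomicrmw add ptr %p, i64 %v seq_cst, !noalias.addrspace !0
  ret i64 %r1
}

!0 = !{i32 5, i32 6}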
@@ -16714,20 +16748,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16767,6 +16815,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //   [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a simpler
+  // version that only inserts the private check, and uses the flat operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
@@ -16778,9 +16830,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16793,23 +16851,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16826,23 +16887,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
 
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already, make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                              RangeNotPrivate);
+  }
+
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 3 additions & 2 deletions
@@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 3 additions & 2 deletions
@@ -1332,7 +1332,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret double %result
 }
 
@@ -1482,7 +1482,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+  %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
   ret void
 }
 
@@ -2215,3 +2215,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
 }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll

Lines changed: 11 additions & 8 deletions
@@ -1645,7 +1645,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1747,7 +1747,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out, align 4
   ret void
 }
@@ -1820,7 +1820,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
-  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1899,7 +1899,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -1978,7 +1978,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 {
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    s_endpgm
   %gep = getelementptr i64, ptr %ptr, i32 4
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -2106,7 +2106,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 {
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %out.gep = getelementptr i64, ptr %out, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i64 %result, ptr %out.gep, align 4
   ret void
 }
@@ -2205,7 +2205,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, ptr %ptr, i32 %id
   %gep = getelementptr i64, ptr %gep.tid, i32 5
-  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   ret void
 }
 
@@ -3312,7 +3312,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
-  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8
+  %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0
   store i32 %idx.0, ptr addrspace(1) %add_use, align 4
   store i64 %result, ptr addrspace(1) %out, align 4
   ret void
@@ -3321,5 +3321,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 attributes #0 = { nounwind speculatable willreturn memory(none) }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind memory(none) }
+
+!0 = !{i32 5, i32 6}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}