 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16327,12 +16328,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
              : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return whether a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  // FIXME: Can this actually fail? Why is this optional?
+  if (std::optional<ConstantRange> CR =
+          getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
+    return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
+  }
+
+  llvm_unreachable("Why is getConstantRangeFromMetadata optional");
+}
+
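For illustration, this is roughly what the metadata the helper inspects looks like in IR. The function and pointer names are invented for the example; on AMDGPU, address space 5 is the private (scratch) address space, and a `!noalias.addrspace` range of `[5, 6)` excludes exactly that space:

```llvm
define i64 @example(ptr %flat.ptr) {
  ; No metadata attached: flatInstrMayAccessPrivate conservatively
  ; returns true.
  %r0 = atomicrmw add ptr %flat.ptr, i64 1 seq_cst

  ; !noalias.addrspace !0 promises this access is not to any address
  ; space in [5, 6), so !CR->contains(5) holds and the helper returns
  ; false.
  %r1 = atomicrmw add ptr %flat.ptr, i64 1 seq_cst, !noalias.addrspace !0
  ret i64 %r1
}

!0 = !{i32 5, i32 6}
```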
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will silently
+  // be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the expansion,
+  // which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
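As a concrete (made-up) example of what this check catches, a 64-bit atomic on a flat pointer with no `!noalias.addrspace` annotation now takes the Expand path and is rewritten by `emitExpandAtomicRMW` below, instead of being selected into a flat instruction that would silently drop the private case:

```llvm
define i64 @flat_max(ptr %flat.ptr, i64 %val) {
  ; Nothing rules out %flat.ptr pointing at private memory, and the
  ; operand is 64 bits wide, so this returns AtomicExpansionKind::Expand.
  %old = atomicrmw max ptr %flat.ptr, i64 %val seq_cst
  ret i64 %old
}
```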
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
@@ -16731,20 +16759,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
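In IR terms, the rewrite this block now applies only to provably-zero operands is the de-canonicalization named in the comment (sketched here with an invented pointer; `sub` and `xor` with 0 are handled identically):

```llvm
; Before: an idempotent operation spelled with or
%old = atomicrmw or ptr %flat.ptr, i64 0 seq_cst

; After: the equivalent add, which the rest of the lowering handles
%old = atomicrmw add ptr %flat.ptr, i64 0 seq_cst
```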
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16784,6 +16826,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //    [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a simpler
+  // version that only inserts the private check, and uses the flat operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
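As a rough sketch of that reduced form (block and value names invented to mirror the ones created below), the expansion for a 64-bit flat `atomicrmw add` that may touch private memory only splits out the private case and keeps a flat instruction on the other path:

```llvm
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                      ; emulate with scratch load/store
  %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private
  %val.new = add i64 %loaded.private, %val
  store i64 %val.new, ptr addrspace(5) %cast.private
  br label %atomicrmw.phi

atomicrmw.global:
  ; Still flat, but marked so it is not expanded again (see the
  ; noalias.addrspace annotation added at the end of this function).
  %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi i64 [ %loaded.private, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]
```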
@@ -16795,9 +16841,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16810,23 +16862,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16843,23 +16898,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
 
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already; make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                              RangeNotPrivate);
+  }
+
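In IR, the mark added here looks like the following (a sketch; the `!1` numbering is arbitrary). A later round of `shouldExpandAtomicRMWInIR` then finds `flatInstrMayAccessPrivate` returning false for the cloned instruction and leaves it alone:

```llvm
atomicrmw.global:
  ; Still a flat operation, but now known not to access addrspace(5).
  %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, !noalias.addrspace !1
  br label %atomicrmw.phi

!1 = !{i32 5, i32 6}
```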
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);