#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+ #include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -16243,12 +16244,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
    : TargetLowering::AtomicExpansionKind::CmpXChg;
}

+ /// Return true if a flat address space atomicrmw can access private memory.
+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
+   const MDNode *NoaliasAddrSpaceMD =
+       I->getMetadata(LLVMContext::MD_noalias_addrspace);
+   if (!NoaliasAddrSpaceMD)
+     return true;
+
+   // FIXME: Can this actually fail? Why is this optional?
+   if (std::optional<ConstantRange> CR =
+           getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
+     return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
+   }
+
+   llvm_unreachable("Why is getConstantRangeFromMetadata optional");
+ }
+
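For reference, the metadata this helper inspects looks like the following in IR. This is a minimal hand-written sketch (the `%flat.ptr` and `%val` names are illustrative): `!noalias.addrspace` carries ranges of address spaces the operation is known *not* to access, and `5` is `AMDGPUAS::PRIVATE_ADDRESS` on AMDGPU.

```llvm
; The [5, 6) range excludes address space 5 (AMDGPU private), so
; flatInstrMayAccessPrivate returns false for this instruction.
%old = atomicrmw add ptr %flat.ptr, i64 %val seq_cst, align 8, !noalias.addrspace !0

!0 = !{i32 5, i32 6}
```

Without the metadata, the helper conservatively returns true.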
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  unsigned AS = RMW->getPointerAddressSpace();
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return AtomicExpansionKind::NotAtomic;

+   // 64-bit flat atomics that dynamically reside in private memory will silently
+   // be dropped.
+   //
+   // Note that we will emit a new copy of the original atomic in the expansion,
+   // which will be incrementally relegalized.
+   const DataLayout &DL = RMW->getFunction()->getDataLayout();
+   if (AS == AMDGPUAS::FLAT_ADDRESS &&
+       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+       flatInstrMayAccessPrivate(RMW))
+     return AtomicExpansionKind::Expand;
+
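Concretely, this makes an unannotated 64-bit flat atomic like the following take the `Expand` path instead of being selected directly (a sketch; names illustrative):

```llvm
; Flat pointer, 64-bit operand, no !noalias.addrspace marker: this may
; dynamically address private memory, so it must be expanded.
%old = atomicrmw add ptr %flat.ptr, i64 %val seq_cst, align 8
```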
  auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
    OptimizationRemarkEmitter ORE(RMW->getFunction());
    ORE.emit([=]() {
@@ -16647,20 +16675,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {

  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
      Op == AtomicRMWInst::Xor) {
-     // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-     assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-            "this cannot be replaced with add");
-     AI->setOperation(AtomicRMWInst::Add);
-     return;
+     if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+         ConstVal && ConstVal->isNullValue()) {
+       // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+       AI->setOperation(AtomicRMWInst::Add);
+
+       // TODO: Turn the below private handling into a no-op for idempotent
+       // cases.
+     }
  }
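On the IR, the rewrite this branch performs is just the following (a sketch; the operand must be the zero identity value for it to be legal):

```llvm
; Before: 'or' with zero, which some paths cannot select directly.
%old = atomicrmw or ptr %flat.ptr, i64 0 seq_cst, align 8
; After AI->setOperation(AtomicRMWInst::Add): equivalent, since
; x | 0 == x + 0 == x.
%old = atomicrmw add ptr %flat.ptr, i64 0 seq_cst, align 8
```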

-   assert(Subtarget->hasAtomicFaddInsts() &&
-          "target should have atomic fadd instructions");
-   assert(AI->getType()->isFloatTy() &&
-          AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-          "generic atomicrmw expansion only supports FP32 operand in flat "
-          "address space");
-   assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+   // The non-flat expansions should only perform the de-canonicalization of
+   // identity values.
+   if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+     return;
+
+   // FullFlatEmulation is true if we need to issue the private, shared, and
+   // global cases.
+   //
+   // If this is false, we are only dealing with the flat-targeting-private case,
+   // where we only insert a check for private and still use the flat instruction
+   // for global and shared.
+
+   // TODO: Avoid the private check for the fadd case depending on
+   // noalias.addrspace.
+
+   bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                            Subtarget->hasAtomicFaddInsts() &&
+                            AI->getType()->isFloatTy();

  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
@@ -16700,6 +16742,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  //
  // atomicrmw.end:
  //    [...]
+   //
+   //
+   // For 64-bit atomics which may reside in private memory, we perform a simpler
+   // version that only inserts the private check, and uses the flat operation.
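Spelled out as IR, that simpler shape looks roughly like this for a hypothetical `i64` flat `add` (a sketch only; block and value names mirror the ones created below, and the private path's non-atomic load/op/store is built by surrounding code not shown in this hunk):

```llvm
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private
  %new = add i64 %loaded.private, %val
  store i64 %new, ptr addrspace(5) %cast.private
  br label %atomicrmw.phi

atomicrmw.global:
  ; Still the flat instruction; the metadata added below marks it as
  ; not-private so it is not expanded a second time.
  %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, align 8, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi i64 [ %loaded.private, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]
  br label %atomicrmw.end

!0 = !{i32 5, i32 6} ; [5, 6): known not to access addrspace(5)
```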

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();
@@ -16711,9 +16757,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-   BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-   BasicBlock *CheckPrivateBB =
-       BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+   BasicBlock *SharedBB = nullptr;
+
+   BasicBlock *CheckPrivateBB = BB;
+   if (FullFlatEmulation) {
+     SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+     CheckPrivateBB =
+         BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+   }
+
  BasicBlock *PrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16726,23 +16778,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {

  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
-   CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                                {Addr}, nullptr, "is.shared");
-   Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);

-   Builder.SetInsertPoint(SharedBB);
-   Value *CastToLocal = Builder.CreateAddrSpaceCast(
-       Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+   Value *LoadedShared = nullptr;
+   if (FullFlatEmulation) {
+     CallInst *IsShared = Builder.CreateIntrinsic(
+         Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+     Builder.SetInsertPoint(SharedBB);
+     Value *CastToLocal = Builder.CreateAddrSpaceCast(
+         Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));

-   Instruction *Clone = AI->clone();
-   Clone->insertInto(SharedBB, SharedBB->end());
-   Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-       .set(CastToLocal);
-   Instruction *LoadedShared = Clone;
+     Instruction *Clone = AI->clone();
+     Clone->insertInto(SharedBB, SharedBB->end());
+     Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+         .set(CastToLocal);
+     LoadedShared = Clone;

-   Builder.CreateBr(PhiBB);
+     Builder.CreateBr(PhiBB);
+     Builder.SetInsertPoint(CheckPrivateBB);
+   }

-   Builder.SetInsertPoint(CheckPrivateBB);
  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16759,23 +16814,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(GlobalBB);
-   Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-       Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-   Value *LoadedGlobal = AI;

-   AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+   // Continue using a flat instruction if we only emitted the check for private.
+   Instruction *LoadedGlobal = AI;
+   if (FullFlatEmulation) {
+     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+     AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+         .set(CastToGlobal);
+   }

  AI->removeFromParent();
  AI->insertInto(GlobalBB, GlobalBB->end());

+   // The new atomicrmw may go through another round of legalization later.
+   if (!FullFlatEmulation) {
+     // We inserted the runtime check already; make sure we do not try to
+     // re-expand this.
+     // TODO: Should union with any existing metadata.
+     MDBuilder MDB(F->getContext());
+     MDNode *RangeNotPrivate =
+         MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                         APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                               RangeNotPrivate);
+   }
+
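The `[5, 6)` range written here is exactly what `flatInstrMayAccessPrivate` tests, closing the loop: when `shouldExpandAtomicRMWInIR` sees the retained flat instruction again during relegalization, the marker makes it return before requesting another expansion. On the instruction it comes out as (a sketch):

```llvm
; The retained flat op, as annotated by the block above.
%loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, align 8, !noalias.addrspace !0

!0 = !{i32 5, i32 6} ; excludes addrspace(5), i.e. known not to be private
```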
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
    AI->replaceAllUsesWith(Loaded);
-     Loaded->addIncoming(LoadedShared, SharedBB);
+     if (FullFlatEmulation)
+       Loaded->addIncoming(LoadedShared, SharedBB);
    Loaded->addIncoming(LoadedPrivate, PrivateBB);
    Loaded->addIncoming(LoadedGlobal, GlobalBB);
    Loaded->takeName(AI);