#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+ #include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -16236,12 +16237,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
      : TargetLowering::AtomicExpansionKind::CmpXChg;
}

+ /// Return true if a flat address space atomicrmw can access private memory.
+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
+   const MDNode *NoaliasAddrSpaceMD =
+       I->getMetadata(LLVMContext::MD_noalias_addrspace);
+   if (!NoaliasAddrSpaceMD)
+     return true;
+
+   // FIXME: Can this actually fail? Why is this optional?
+   if (std::optional<ConstantRange> CR =
+           getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
+     return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
+   }
+
+   llvm_unreachable("Why is getConstantRangeFromMetadata optional");
+ }
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  unsigned AS = RMW->getPointerAddressSpace();
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return AtomicExpansionKind::NotAtomic;

+   // 64-bit flat atomics that dynamically reside in private memory will
+   // silently be dropped.
+   //
+   // Note that we will emit a new copy of the original atomic in the
+   // expansion, which will be incrementally relegalized.
+   const DataLayout &DL = RMW->getFunction()->getDataLayout();
+   if (AS == AMDGPUAS::FLAT_ADDRESS &&
+       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+       flatInstrMayAccessPrivate(RMW))
+     return AtomicExpansionKind::Expand;
+
  auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
    OptimizationRemarkEmitter ORE(RMW->getFunction());
    ORE.emit([=]() {
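
For illustration, a hand-written IR sketch (not from the commit) of what this
predicate inspects. On AMDGPU the private address space is 5, and the
!noalias.addrspace ranges list address spaces the pointer is known not to be
in:

  ; No metadata: may access private, so this 64-bit flat atomic gets
  ; AtomicExpansionKind::Expand.
  %old0 = atomicrmw add ptr %flat.ptr, i64 1 seq_cst

  ; Range [5, 6) excluded: flatInstrMayAccessPrivate returns false and the
  ; instruction is left for normal selection.
  %old1 = atomicrmw add ptr %flat.ptr, i64 1 seq_cst, !noalias.addrspace !0

  !0 = !{i32 5, i32 6}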
@@ -16640,20 +16668,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
      Op == AtomicRMWInst::Xor) {
-     // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-     assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-            "this cannot be replaced with add");
-     AI->setOperation(AtomicRMWInst::Add);
-     return;
+     if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+         ConstVal && ConstVal->isNullValue()) {
+       // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+       AI->setOperation(AtomicRMWInst::Add);
+
+       // TODO: Turn the below private handling into a no-op for idempotent
+       // cases.
+     }
  }

-   assert(Subtarget->hasAtomicFaddInsts() &&
-          "target should have atomic fadd instructions");
-   assert(AI->getType()->isFloatTy() &&
-          AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-          "generic atomicrmw expansion only supports FP32 operand in flat "
-          "address space");
-   assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+   // The non-flat expansions should only perform the de-canonicalization of
+   // identity values.
+   if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+     return;
+
+   // FullFlatEmulation is true if we need to issue the private, shared, and
+   // global cases.
+   //
+   // If this is false, we are only dealing with the flat-targeting-private
+   // case, where we only insert a check for private and still use the flat
+   // instruction for global and shared.
+
+   // TODO: Avoid the private check for the fadd case depending on
+   // noalias.addrspace.
+
+   bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                            Subtarget->hasAtomicFaddInsts() &&
+                            AI->getType()->isFloatTy();

  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
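
A hand-written illustration (not part of the commit) of which inputs take
which path under the FullFlatEmulation predicate above, assuming a subtarget
that has the fadd instructions:

  ; FullFlatEmulation == true: an f32 fadd gets the full shared/private/global
  ; split sketched below.
  %r0 = atomicrmw fadd ptr %p, float %v seq_cst

  ; FullFlatEmulation == false: a 64-bit integer atomic only gets the private
  ; check and keeps using the flat instruction for global and shared.
  %r1 = atomicrmw add ptr %p, i64 %v seq_cst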
@@ -16693,6 +16735,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  //
  // atomicrmw.end:
  //   [...]
+   //
+   //
+   // For 64-bit atomics which may reside in private memory, we perform a
+   // simpler version that only inserts the private check, and uses the flat
+   // operation.

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();
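
A hand-written sketch of that simpler form for an i64 add (assumed operand
names; the private path may use a plain load/op/store because scratch memory
is only visible to the current thread):

  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

  atomicrmw.private:
    %cast = addrspacecast ptr %addr to ptr addrspace(5)
    %loaded.private = load i64, ptr addrspace(5) %cast
    %new = add i64 %loaded.private, %val
    store i64 %new, ptr addrspace(5) %cast
    br label %atomicrmw.phi

  atomicrmw.global:
    ; still a flat atomicrmw, now annotated as never private
    %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, !noalias.addrspace !1
    br label %atomicrmw.phi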
@@ -16704,9 +16750,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-   BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-   BasicBlock *CheckPrivateBB =
-       BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+   BasicBlock *SharedBB = nullptr;
+
+   BasicBlock *CheckPrivateBB = BB;
+   if (FullFlatEmulation) {
+     SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+     CheckPrivateBB =
+         BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+   }
+
  BasicBlock *PrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16719,23 +16771,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
-   CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                                {Addr}, nullptr, "is.shared");
-   Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);

-   Builder.SetInsertPoint(SharedBB);
-   Value *CastToLocal = Builder.CreateAddrSpaceCast(
-       Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+   Value *LoadedShared = nullptr;
+   if (FullFlatEmulation) {
+     CallInst *IsShared = Builder.CreateIntrinsic(
+         Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+     Builder.SetInsertPoint(SharedBB);
+     Value *CastToLocal = Builder.CreateAddrSpaceCast(
+         Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));

-   Instruction *Clone = AI->clone();
-   Clone->insertInto(SharedBB, SharedBB->end());
-   Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-       .set(CastToLocal);
-   Instruction *LoadedShared = Clone;
+     Instruction *Clone = AI->clone();
+     Clone->insertInto(SharedBB, SharedBB->end());
+     Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+         .set(CastToLocal);
+     LoadedShared = Clone;

-   Builder.CreateBr(PhiBB);
+     Builder.CreateBr(PhiBB);
+     Builder.SetInsertPoint(CheckPrivateBB);
+   }

-   Builder.SetInsertPoint(CheckPrivateBB);
  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16752,23 +16807,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(GlobalBB);
-   Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-       Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-   Value *LoadedGlobal = AI;

-   AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+   // Continue using a flat instruction if we only emitted the check for
+   // private.
+   Instruction *LoadedGlobal = AI;
+   if (FullFlatEmulation) {
+     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+     AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+         .set(CastToGlobal);
+   }

  AI->removeFromParent();
  AI->insertInto(GlobalBB, GlobalBB->end());

+   // The new atomicrmw may go through another round of legalization later.
+   if (!FullFlatEmulation) {
+     // We inserted the runtime check already; make sure we do not try to
+     // re-expand this.
+     // TODO: Should union with any existing metadata.
+     MDBuilder MDB(F->getContext());
+     MDNode *RangeNotPrivate =
+         MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                         APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                               RangeNotPrivate);
+   }
+
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
    AI->replaceAllUsesWith(Loaded);
-     Loaded->addIncoming(LoadedShared, SharedBB);
+     if (FullFlatEmulation)
+       Loaded->addIncoming(LoadedShared, SharedBB);
    Loaded->addIncoming(LoadedPrivate, PrivateBB);
    Loaded->addIncoming(LoadedGlobal, GlobalBB);
    Loaded->takeName(AI);
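
The net effect on the re-emitted instruction in the !FullFlatEmulation case,
sketched by hand under the same assumptions as above: the flat atomicrmw left
in atomicrmw.global now carries a range excluding the private address space,
so flatInstrMayAccessPrivate returns false on the next round and
shouldExpandAtomicRMWInIR does not request the expansion again.

  %loaded.global = atomicrmw add ptr %addr, i64 %val seq_cst, !noalias.addrspace !1
  !1 = !{i32 5, i32 6} ; [PRIVATE_ADDRESS, PRIVATE_ADDRESS + 1)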