 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
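MDBuilder.h is newly included for the MDB.createRange() call further down, which builds the i32 range operands of the !noalias.addrspace metadata.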
@@ -16308,12 +16309,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
              : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return true if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
+       ++I) {
+    auto *Low = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 0));
+    auto *High = mdconst::extract<ConstantInt>(
+        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+
+    if (Low->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS) &&
+        High->getValue().ugt(AMDGPUAS::PRIVATE_ADDRESS))
+      return false;
+  }
+
+  return true;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will silently
+  // be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the expansion,
+  // which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
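A note on the metadata this check consumes: !noalias.addrspace lists pairs of i32 constants, each pair an excluded half-open range [Low, High) of address spaces, so flatInstrMayAccessPrivate() returns false only when some pair covers AMDGPUAS::PRIVATE_ADDRESS. As a sketch of the producer side, a pass that can prove a flat pointer never points to private memory could annotate the atomicrmw as below and avoid this expansion entirely; the helper name markNotPrivate is hypothetical, but the node it builds mirrors the RangeNotPrivate node constructed later in this patch.

    // Hypothetical helper (not part of this patch): mark a flat atomicrmw
    // as never touching the private address space, so the Expand path in
    // shouldExpandAtomicRMWInIR() above is skipped.
    static void markNotPrivate(AtomicRMWInst *RMW) {
      MDBuilder MDB(RMW->getContext());
      // Yields !noalias.addrspace !{i32 5, i32 6}: address spaces in the
      // half-open range [5, 6), i.e. private, are excluded.
      MDNode *NotPrivate =
          MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
                          APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
      RMW->setMetadata(LLVMContext::MD_noalias_addrspace, NotPrivate);
    }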
@@ -16714,20 +16748,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16767,6 +16815,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //    [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a simpler
+  // version that only inserts the private check, and uses the flat operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
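For orientation, the simpler check-only variant mentioned above has roughly the following shape; this is an assumed sketch in the style of the diagram, with the non-atomic private-side sequence abbreviated:

    //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
    //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
    //
    // atomicrmw.private:
    //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
    //   ; non-atomic load/op/store through %cast.private
    //   br label %atomicrmw.phi
    //
    // atomicrmw.global:
    //   ; the original flat atomicrmw, annotated with
    //   ; !noalias.addrspace !{i32 5, i32 6} so it is not expanded again
    //   br label %atomicrmw.phi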
@@ -16778,9 +16830,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16793,23 +16851,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
 
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16826,23 +16887,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
 
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already; make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+    MDBuilder MDB(F->getContext());
+    MDNode *RangeNotPrivate =
+        MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                        APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+    LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                              RangeNotPrivate);
+  }
+
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
     PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
     AI->replaceAllUsesWith(Loaded);
-    Loaded->addIncoming(LoadedShared, SharedBB);
+    if (FullFlatEmulation)
+      Loaded->addIncoming(LoadedShared, SharedBB);
     Loaded->addIncoming(LoadedPrivate, PrivateBB);
     Loaded->addIncoming(LoadedGlobal, GlobalBB);
     Loaded->takeName(AI);
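Tying the two ends of the patch together: the RangeNotPrivate node attached in the !FullFlatEmulation path is exactly what flatInstrMayAccessPrivate() reads, which is what keeps the relegalization of the moved atomicrmw from expanding it a second time. A minimal sketch of that invariant, as a hypothetical standalone check rather than code from the patch:

    // After the check-only expansion, the flat atomicrmw left in
    // %atomicrmw.global carries the not-private range metadata, so the
    // 64-bit Expand test in shouldExpandAtomicRMWInIR() no longer fires.
    const DataLayout &DL = AI->getFunction()->getDataLayout();
    bool WouldExpandAgain =
        AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
        DL.getTypeSizeInBits(AI->getType()) == 64 &&
        flatInstrMayAccessPrivate(AI); // false once the metadata is attached
    assert(!WouldExpandAgain && "check-only expansion must not recurse");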