39
39
#include "llvm/IR/IntrinsicInst.h"
40
40
#include "llvm/IR/IntrinsicsAMDGPU.h"
41
41
#include "llvm/IR/IntrinsicsR600.h"
42
+ #include "llvm/IR/MDBuilder.h"
42
43
#include "llvm/Support/CommandLine.h"
43
44
#include "llvm/Support/KnownBits.h"
44
45
#include "llvm/Support/ModRef.h"
@@ -16310,12 +16311,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
16310
16311
: TargetLowering::AtomicExpansionKind::CmpXChg;
16311
16312
}
16312
16313
16314
+ /// Return if a flat address space atomicrmw can access private memory.
16315
+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
16316
+ const MDNode *NoaliasAddrSpaceMD =
16317
+ I->getMetadata(LLVMContext::MD_noalias_addrspace);
16318
+ if (!NoaliasAddrSpaceMD)
16319
+ return true;
16320
+
16321
+ for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
16322
+ ++I) {
16323
+ auto *Low = mdconst::extract<ConstantInt>(
16324
+ NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16325
+ auto *High = mdconst::extract<ConstantInt>(
16326
+ NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16327
+
16328
+ if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
16329
+ High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
16330
+ return true;
16331
+ }
16332
+
16333
+ return false;
16334
+ }
16335
+
16313
16336
TargetLowering::AtomicExpansionKind
16314
16337
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16315
16338
unsigned AS = RMW->getPointerAddressSpace();
16316
16339
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16317
16340
return AtomicExpansionKind::NotAtomic;
16318
16341
16342
+ // 64-bit flat atomics that dynamically reside in private memory will silently
16343
+ // be dropped.
16344
+ //
16345
+ // Note that we will emit a new copy of the original atomic in the expansion,
16346
+ // which will be incrementally relegalized.
16347
+ const DataLayout &DL = RMW->getFunction()->getDataLayout();
16348
+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
16349
+ DL.getTypeSizeInBits(RMW->getType()) == 64 &&
16350
+ flatInstrMayAccessPrivate(RMW))
16351
+ return AtomicExpansionKind::Expand;
16352
+
16319
16353
auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16320
16354
OptimizationRemarkEmitter ORE(RMW->getFunction());
16321
16355
ORE.emit([=]() {
@@ -16716,20 +16750,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16716
16750
16717
16751
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16718
16752
Op == AtomicRMWInst::Xor) {
16719
- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16720
- assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16721
- "this cannot be replaced with add");
16722
- AI->setOperation(AtomicRMWInst::Add);
16723
- return;
16753
+ if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16754
+ ConstVal && ConstVal->isNullValue()) {
16755
+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16756
+ AI->setOperation(AtomicRMWInst::Add);
16757
+
16758
+ // TODO: Turn the below private handling into a no-op for idempotent
16759
+ // cases.
16760
+ }
16724
16761
}
16725
16762
16726
- assert(Subtarget->hasAtomicFaddInsts() &&
16727
- "target should have atomic fadd instructions");
16728
- assert(AI->getType()->isFloatTy() &&
16729
- AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16730
- "generic atomicrmw expansion only supports FP32 operand in flat "
16731
- "address space");
16732
- assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16763
+ // The non-flat expansions should only perform the de-canonicalization of
16764
+ // identity values.
16765
+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16766
+ return;
16767
+
16768
+ // FullFlatEmulation is true if we need to issue the private, shared, and
16769
+ // global cases.
16770
+ //
16771
+ // If this is false, we are only dealing with the flat-targeting-private case,
16772
+ // where we only insert a check for private and still use the flat instruction
16773
+ // for global and shared.
16774
+
16775
+ // TODO: Avoid the private check for the fadd case depending on
16776
+ // noalias.addrspace.
16777
+
16778
+ bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16779
+ Subtarget->hasAtomicFaddInsts() &&
16780
+ AI->getType()->isFloatTy();
16733
16781
16734
16782
// Given: atomicrmw fadd ptr %addr, float %val ordering
16735
16783
//
@@ -16769,6 +16817,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16769
16817
//
16770
16818
// atomicrmw.end:
16771
16819
// [...]
16820
+ //
16821
+ //
16822
+ // For 64-bit atomics which may reside in private memory, we perform a simpler
16823
+ // version that only inserts the private check, and uses the flat operation.
16772
16824
16773
16825
IRBuilder<> Builder(AI);
16774
16826
LLVMContext &Ctx = Builder.getContext();
@@ -16780,9 +16832,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16780
16832
Function *F = BB->getParent();
16781
16833
BasicBlock *ExitBB =
16782
16834
BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16783
- BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16784
- BasicBlock *CheckPrivateBB =
16785
- BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16835
+ BasicBlock *SharedBB = nullptr;
16836
+
16837
+ BasicBlock *CheckPrivateBB = BB;
16838
+ if (FullFlatEmulation) {
16839
+ SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16840
+ CheckPrivateBB =
16841
+ BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16842
+ }
16843
+
16786
16844
BasicBlock *PrivateBB =
16787
16845
BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16788
16846
BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16795,23 +16853,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16795
16853
16796
16854
std::prev(BB->end())->eraseFromParent();
16797
16855
Builder.SetInsertPoint(BB);
16798
- CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16799
- {Addr}, nullptr, "is.shared");
16800
- Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16801
16856
16802
- Builder.SetInsertPoint(SharedBB);
16803
- Value *CastToLocal = Builder.CreateAddrSpaceCast(
16804
- Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16857
+ Value *LoadedShared = nullptr;
16858
+ if (FullFlatEmulation) {
16859
+ CallInst *IsShared = Builder.CreateIntrinsic(
16860
+ Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
16861
+ Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16862
+ Builder.SetInsertPoint(SharedBB);
16863
+ Value *CastToLocal = Builder.CreateAddrSpaceCast(
16864
+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
16805
16865
16806
- Instruction *Clone = AI->clone();
16807
- Clone->insertInto(SharedBB, SharedBB->end());
16808
- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16809
- .set(CastToLocal);
16810
- Instruction * LoadedShared = Clone;
16866
+ Instruction *Clone = AI->clone();
16867
+ Clone->insertInto(SharedBB, SharedBB->end());
16868
+ Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16869
+ .set(CastToLocal);
16870
+ LoadedShared = Clone;
16811
16871
16812
- Builder.CreateBr(PhiBB);
16872
+ Builder.CreateBr(PhiBB);
16873
+ Builder.SetInsertPoint(CheckPrivateBB);
16874
+ }
16813
16875
16814
- Builder.SetInsertPoint(CheckPrivateBB);
16815
16876
CallInst *IsPrivate = Builder.CreateIntrinsic(
16816
16877
Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16817
16878
Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16828,23 +16889,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16828
16889
Builder.CreateBr(PhiBB);
16829
16890
16830
16891
Builder.SetInsertPoint(GlobalBB);
16831
- Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16832
- Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16833
- Value *LoadedGlobal = AI;
16834
16892
16835
- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
16893
+ // Continue using a flat instruction if we only emitted the check for private.
16894
+ Instruction *LoadedGlobal = AI;
16895
+ if (FullFlatEmulation) {
16896
+ Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16897
+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16898
+ AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16899
+ .set(CastToGlobal);
16900
+ }
16836
16901
16837
16902
AI->removeFromParent();
16838
16903
AI->insertInto(GlobalBB, GlobalBB->end());
16839
16904
16905
+ // The new atomicrmw may go through another round of legalization later.
16906
+ if (!FullFlatEmulation) {
16907
+ // We inserted the runtime check already, make sure we do not try to
16908
+ // re-expand this.
16909
+ // TODO: Should union with any existing metadata.
16910
+ MDBuilder MDB(F->getContext());
16911
+ MDNode *RangeNotPrivate =
16912
+ MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
16913
+ APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
16914
+ LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
16915
+ RangeNotPrivate);
16916
+ }
16917
+
16840
16918
Builder.CreateBr(PhiBB);
16841
16919
16842
16920
Builder.SetInsertPoint(PhiBB);
16843
16921
16844
16922
if (ReturnValueIsUsed) {
16845
16923
PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
16846
16924
AI->replaceAllUsesWith(Loaded);
16847
- Loaded->addIncoming(LoadedShared, SharedBB);
16925
+ if (FullFlatEmulation)
16926
+ Loaded->addIncoming(LoadedShared, SharedBB);
16848
16927
Loaded->addIncoming(LoadedPrivate, PrivateBB);
16849
16928
Loaded->addIncoming(LoadedGlobal, GlobalBB);
16850
16929
Loaded->takeName(AI);
0 commit comments