#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+ #include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
@@ -16340,12 +16341,45 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
             : TargetLowering::AtomicExpansionKind::CmpXChg;
}

+ /// Return true if a flat address space atomicrmw may access private memory.
+ static bool flatInstrMayAccessPrivate(const Instruction *I) {
+   const MDNode *NoaliasAddrSpaceMD =
+       I->getMetadata(LLVMContext::MD_noalias_addrspace);
+   if (!NoaliasAddrSpaceMD)
+     return true;
+
+   // !noalias.addrspace encodes [Low, High) address space ranges the pointer
+   // is known not to be in. Private memory is ruled out only if one of the
+   // ranges covers AMDGPUAS::PRIVATE_ADDRESS.
+   for (unsigned Idx = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2;
+        Idx != E; ++Idx) {
+     auto *Low = mdconst::extract<ConstantInt>(
+         NoaliasAddrSpaceMD->getOperand(2 * Idx + 0));
+     auto *High = mdconst::extract<ConstantInt>(
+         NoaliasAddrSpaceMD->getOperand(2 * Idx + 1));
+
+     if (Low->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS) &&
+         High->getValue().ugt(AMDGPUAS::PRIVATE_ADDRESS))
+       return false;
+   }
+
+   return true;
+ }
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
  unsigned AS = RMW->getPointerAddressSpace();
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return AtomicExpansionKind::NotAtomic;

+   // 64-bit flat atomics whose address dynamically falls in private memory
+   // will silently be dropped by the hardware, so they must be expanded here.
+   //
+   // Note that we will emit a new copy of the original atomic in the
+   // expansion, which will be incrementally relegalized.
+   const DataLayout &DL = RMW->getFunction()->getDataLayout();
+   if (AS == AMDGPUAS::FLAT_ADDRESS &&
+       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+       flatInstrMayAccessPrivate(RMW))
+     return AtomicExpansionKind::Expand;
+
  auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
    OptimizationRemarkEmitter ORE(RMW->getFunction());
    ORE.emit([=]() {
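As an illustrative aside (example IR with invented names, not part of the patch): the new check requests expansion for any 64-bit flat atomicrmw unless !noalias.addrspace metadata rules out the private address space, which is AS 5 on AMDGPU.

    ; Expanded: the address may dynamically fall in private (AS 5) memory.
    %r0 = atomicrmw add ptr %p0, i64 1 seq_cst

    ; Not expanded: !0 promises the pointer is never in the AS range [5, 6).
    %r1 = atomicrmw add ptr %p1, i64 1 seq_cst, !noalias.addrspace !0

    !0 = !{i32 5, i32 6}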
@@ -16744,20 +16778,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {

  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
      Op == AtomicRMWInst::Xor) {
-     // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-     assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-            "this cannot be replaced with add");
-     AI->setOperation(AtomicRMWInst::Add);
-     return;
+     if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+         ConstVal && ConstVal->isNullValue()) {
+       // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+       AI->setOperation(AtomicRMWInst::Add);
+
+       // TODO: Turn the below private handling into a no-op for idempotent
+       // cases.
+     }
  }

-   assert(Subtarget->hasAtomicFaddInsts() &&
-          "target should have atomic fadd instructions");
-   assert(AI->getType()->isFloatTy() &&
-          AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-          "generic atomicrmw expansion only supports FP32 operand in flat "
-          "address space");
-   assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+   // The non-flat expansions should only perform the de-canonicalization of
+   // identity values.
+   if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+     return;
+
+   // FullFlatEmulation is true if we need to issue the private, shared, and
+   // global cases.
+   //
+   // If this is false, we are only dealing with the flat-targeting-private
+   // case, where we only insert a check for private and still use the flat
+   // instruction for global and shared.
+
+   // TODO: Avoid the private check for the fadd case depending on
+   // noalias.addrspace.
+
+   bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                            Subtarget->hasAtomicFaddInsts() &&
+                            AI->getType()->isFloatTy();

  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
@@ -16797,6 +16845,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  //
  // atomicrmw.end:
  //   [...]
+   //
+   // For 64-bit atomics that may access private memory, we emit a simpler
+   // expansion that only inserts the private check and keeps using the flat
+   // operation.

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();
@@ -16808,9 +16860,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-   BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-   BasicBlock *CheckPrivateBB =
-       BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+   BasicBlock *SharedBB = nullptr;
+
+   BasicBlock *CheckPrivateBB = BB;
+   if (FullFlatEmulation) {
+     SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+     CheckPrivateBB =
+         BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+   }
+
  BasicBlock *PrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
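For orientation, the block layout created here differs by mode (a sketch using the labels above):

    ; FullFlatEmulation:
    ;   entry -> atomicrmw.shared | atomicrmw.check.private
    ;   atomicrmw.check.private -> atomicrmw.private | atomicrmw.global
    ;   all three rejoin at the phi block before atomicrmw.end
    ;
    ; Private-check-only mode (FullFlatEmulation == false):
    ;   entry -> atomicrmw.private | atomicrmw.global
    ;   (the private test is emitted directly in the original block)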
@@ -16823,23 +16881,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {

  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
-   CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                                {Addr}, nullptr, "is.shared");
-   Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);

-   Builder.SetInsertPoint(SharedBB);
-   Value *CastToLocal = Builder.CreateAddrSpaceCast(
-       Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+   Value *LoadedShared = nullptr;
+   if (FullFlatEmulation) {
+     CallInst *IsShared = Builder.CreateIntrinsic(
+         Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+     Builder.SetInsertPoint(SharedBB);
+     Value *CastToLocal = Builder.CreateAddrSpaceCast(
+         Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));

-   Instruction *Clone = AI->clone();
-   Clone->insertInto(SharedBB, SharedBB->end());
-   Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-       .set(CastToLocal);
-   Instruction *LoadedShared = Clone;
+     Instruction *Clone = AI->clone();
+     Clone->insertInto(SharedBB, SharedBB->end());
+     Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+         .set(CastToLocal);
+     LoadedShared = Clone;

-   Builder.CreateBr(PhiBB);
+     Builder.CreateBr(PhiBB);
+     Builder.SetInsertPoint(CheckPrivateBB);
+   }

-   Builder.SetInsertPoint(CheckPrivateBB);
  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
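For reference, the two aperture tests used here are target intrinsics that classify a flat pointer at run time; a minimal sketch, assuming a flat pointer %addr:

    %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)   ; LDS aperture
    %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr) ; scratch aperture

Only the is.private test is emitted in the private-check-only mode, since the retained flat instruction already handles the shared and global cases.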
@@ -16856,23 +16917,41 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(GlobalBB);
-   Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-       Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-   Value *LoadedGlobal = AI;

-   AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+   // Continue using a flat instruction if we only emitted the check for
+   // private.
+   Instruction *LoadedGlobal = AI;
+   if (FullFlatEmulation) {
+     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+     AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+         .set(CastToGlobal);
+   }

  AI->removeFromParent();
  AI->insertInto(GlobalBB, GlobalBB->end());

+   // The new atomicrmw may go through another round of legalization later.
+   if (!FullFlatEmulation) {
+     // We already inserted the runtime check; make sure we do not try to
+     // re-expand this instruction.
+     // TODO: Should union with any existing metadata.
+     MDBuilder MDB(F->getContext());
+     MDNode *RangeNotPrivate =
+         MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
+                         APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
+     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
+                               RangeNotPrivate);
+   }
+
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
    AI->replaceAllUsesWith(Loaded);
-     Loaded->addIncoming(LoadedShared, SharedBB);
+     if (FullFlatEmulation)
+       Loaded->addIncoming(LoadedShared, SharedBB);
    Loaded->addIncoming(LoadedPrivate, PrivateBB);
    Loaded->addIncoming(LoadedGlobal, GlobalBB);
    Loaded->takeName(AI);
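Putting the pieces together, a hedged before/after sketch of the private-check-only expansion (abbreviated, with invented value names; alignment and ordering details are elided):

    ; Before
    %old = atomicrmw add ptr %p, i64 %v seq_cst

    ; After (simplified)
      %is.private = call i1 @llvm.amdgcn.is.private(ptr %p)
      br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

    atomicrmw.private:
      %p.as5 = addrspacecast ptr %p to ptr addrspace(5)
      %loaded.private = load i64, ptr addrspace(5) %p.as5
      %new = add i64 %loaded.private, %v
      store i64 %new, ptr addrspace(5) %p.as5
      br label %atomicrmw.phi

    atomicrmw.global:
      %loaded.global = atomicrmw add ptr %p, i64 %v seq_cst, !noalias.addrspace !0
      br label %atomicrmw.phi

    atomicrmw.phi:
      %old = phi i64 [ %loaded.private, %atomicrmw.private ],
                     [ %loaded.global, %atomicrmw.global ]
      br label %atomicrmw.end

    !0 = !{i32 5, i32 6} ; the retained flat atomic can no longer be private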