@@ -16497,9 +16497,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
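Note: cmpxchg now follows the same policy as atomicrmw: private is lowered to the non-atomic form, and a 64-bit cmpxchg through a flat pointer that may also reach private memory is expanded so the private case can avoid the atomic. A rough IR sketch of the decision (not from the patch; assumes flatInstrMayAccessPrivate keys off the !noalias.addrspace metadata, with address space 5 being AMDGPU private):

    ; may access private, 64-bit -> AtomicExpansionKind::Expand
    %pair0 = cmpxchg ptr %p, i64 %cmp, i64 %new seq_cst seq_cst

    ; annotated as never touching addrspace(5) -> AtomicExpansionKind::None
    %pair1 = cmpxchg ptr %p, i64 %cmp, i64 %new seq_cst seq_cst, !noalias.addrspace !0

    !0 = !{i32 5, i32 6} ; excluded address-space range [5, 6)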
@@ -16663,40 +16675,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
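Note: the comment this hunk ends on continues past the shown context; the expansion it describes produces, roughly, the following control flow (a sketch; the shared arm and the global cast are only emitted when FullFlatEmulation is set):

    %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
    br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private

  atomicrmw.shared:
    ; original atomic on an addrspacecast of %addr to addrspace(3)
    br label %atomicrmw.phi

  atomicrmw.check.private:
    %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
    br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

  atomicrmw.private:
    ; non-atomic load/op/store on an addrspacecast to addrspace(5)
    br label %atomicrmw.phi

  atomicrmw.global:
    ; original atomic on an addrspacecast to addrspace(1)
    br label %atomicrmw.phi

  atomicrmw.phi:
    %loaded.phi = phi float [ ... ], [ ... ], [ ... ]
    br label %atomicrmw.end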
@@ -16743,6 +16723,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
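Note: when FullFlatEmulation is false (any cmpxchg, and any atomicrmw other than the f32 fadd case with hardware fadd support), only the private check from the sketch above is emitted; the flat instruction itself still serves both the global and shared cases:

    %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
    br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

  atomicrmw.private:
    ; non-atomic emulation
  atomicrmw.global:
    ; original flat atomic, pointer operand unchanged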
@@ -16764,11 +16772,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16783,8 +16786,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   Instruction *Clone = AI->clone();
   Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
+  Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
   LoadedShared = Clone;
 
   Builder.CreateBr(PhiBB);
@@ -16796,14 +16798,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
 
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                                CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
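Note: in the private arm a cmpxchg is emulated without atomics (scratch is private to the thread, which is also why shouldExpandAtomicCmpXchgInIR returns NotAtomic for it), and the { value, success } aggregate a cmpxchg returns is rebuilt with insertvalue. Assuming buildAtomicCmpXchgValue emits the usual load/compare/select/store sequence, the result looks roughly like:

    %loaded.private = load i64, ptr addrspace(5) %addr.private, align 8
    %eq = icmp eq i64 %loaded.private, %cmp
    %res = select i1 %eq, i64 %new, i64 %loaded.private
    store i64 %res, ptr addrspace(5) %addr.private, align 8
    %agg0 = insertvalue { i64, i1 } poison, i64 %loaded.private, 0
    %ret = insertvalue { i64, i1 } %agg0, i1 %eq, 1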
@@ -16813,8 +16830,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16838,7 +16854,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16850,6 +16866,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
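Note: the new emitExpandAtomicRMW wrapper retains the identity-value rewrite from the old function, e.g.

    %old = atomicrmw or ptr %p, i32 0 seq_cst
    ; becomes
    %old = atomicrmw add ptr %p, i32 0 seq_cst

and, per the updated comment, the private-alias-flat predication below may still be needed for such operations, so the rewrite no longer suggests turning it into a no-op.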