@@ -16504,9 +16504,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
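
Note: flatInstrMayAccessPrivate is called above but defined outside this hunk. Below is a minimal sketch, assuming the helper conservatively answers true unless !noalias.addrspace metadata rules out the private address space; the body is an illustration written against the usual SIISelLowering.cpp headers, not the patch's actual definition.

// Sketch (assumption): return true if a flat-address-space atomic might
// access private (scratch) memory. !noalias.addrspace metadata encodes
// [Lo, Hi) ranges of address spaces the pointer is known *not* to be in,
// so if private falls inside any range, private access is impossible.
static bool flatInstrMayAccessPrivate(const Instruction *I) {
  const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
  if (!MD)
    return true; // No metadata: the pointer may alias any address space.

  for (unsigned Idx = 0, E = MD->getNumOperands() / 2; Idx != E; ++Idx) {
    auto *Lo = mdconst::extract<ConstantInt>(MD->getOperand(2 * Idx));
    auto *Hi = mdconst::extract<ConstantInt>(MD->getOperand(2 * Idx + 1));
    if (Lo->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS) &&
        Hi->getValue().ugt(AMDGPUAS::PRIVATE_ADDRESS))
      return false; // Private is provably excluded by this range.
  }
  return true;
}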
@@ -16670,40 +16682,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
@@ -16750,6 +16730,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
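
Note: the RMW/CX dispatch above exists because both expansion hooks funnel into this one helper. For orientation, a paraphrased sketch of the driver side that routes a cmpxchg here, assuming the usual AtomicExpand-pass dispatch and the lowerAtomicCmpXchgInst utility from llvm/Transforms/Utils/LowerAtomic.h; this control flow is illustrative, not copied from the pass.

// Sketch (assumption): how a driver acts on the AtomicExpansionKind
// returned by shouldExpandAtomicCmpXchgInIR in the first hunk.
static bool expandCmpXchgSketch(const TargetLowering &TLI,
                                AtomicCmpXchgInst *CI) {
  switch (TLI.shouldExpandAtomicCmpXchgInIR(CI)) {
  case TargetLowering::AtomicExpansionKind::NotAtomic:
    // Private (scratch) memory is per-lane on AMDGPU, so atomicity is not
    // required; lower to a plain load/compare/store sequence.
    return lowerAtomicCmpXchgInst(CI);
  case TargetLowering::AtomicExpansionKind::Expand:
    // New in this patch: route to the address-space-predicate expansion.
    TLI.emitExpandAtomicCmpXchg(CI);
    return true;
  default:
    return false; // None: leave the cmpxchg for instruction selection.
  }
}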
@@ -16771,11 +16779,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16790,8 +16793,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   Instruction *Clone = AI->clone();
   Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
+  Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
   LoadedShared = Clone;
 
   Builder.CreateBr(PhiBB);
@@ -16803,14 +16805,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
-
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                                CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
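
Note: in the private branch above, the cmpxchg case calls buildAtomicCmpXchgValue, which lives in LLVM's lower-atomic utilities rather than in this diff. A minimal sketch of the non-atomic equivalent it is assumed to emit (valid because private memory is per-lane), returning the loaded value and success flag that the two CreateInsertValue calls then pack into cmpxchg's { value, i1 } result; the helper name here is hypothetical.

// Sketch (assumption): a non-atomic cmpxchg, as is safe for AMDGPU private
// (scratch) memory where no other lane can race on the location.
static std::pair<Value *, Value *>
nonAtomicCmpXchgSketch(IRBuilderBase &Builder, Value *Ptr, Value *Cmp,
                       Value *NewVal, Align Alignment) {
  Value *Orig =
      Builder.CreateAlignedLoad(NewVal->getType(), Ptr, Alignment, "loaded");
  Value *Equal = Builder.CreateICmpEQ(Orig, Cmp, "success");
  // Store the new value on success and the old value otherwise; storing
  // unconditionally avoids introducing extra control flow.
  Value *ToStore = Builder.CreateSelect(Equal, NewVal, Orig);
  Builder.CreateAlignedStore(ToStore, Ptr, Alignment);
  return {Orig, Equal};
}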
@@ -16820,8 +16837,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16845,7 +16861,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16857,6 +16873,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
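
Note: the companion declarations this change implies in SIISelLowering.h are not shown in the diff. An assumed sketch follows; the override status of the two emit hooks is inferred from TargetLowering's atomic-expansion callbacks, not confirmed by these hunks.

// Assumed declarations in SIISelLowering.h (illustrative only):
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;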