@@ -16577,9 +16577,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
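
With this change, a 64-bit cmpxchg through a flat pointer that may also address a private allocation is reported as `Expand` instead of being selected directly. An input that would now take the expansion path might look like the following sketch (illustrative names; `flatInstrMayAccessPrivate(CmpX)` must return true, e.g. because no metadata rules out the private address space):

```llvm
; Illustrative: a flat (addrspace 0) 64-bit cmpxchg with nothing excluding
; private, so shouldExpandAtomicCmpXchgInIR now returns Expand.
define i64 @flat_cmpxchg(ptr %p, i64 %cmp, i64 %new) {
  %pair = cmpxchg ptr %p, i64 %cmp, i64 %new seq_cst seq_cst, align 8
  %old = extractvalue { i64, i1 } %pair, 0
  ret i64 %old
}
```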
@@ -16745,40 +16757,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
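
Condensed, the full emulation turns the flat access into a run-time address-space dispatch. A sketch of the resulting IR, assuming the `llvm.amdgcn.is.shared` / `llvm.amdgcn.is.private` predicate intrinsics and with illustrative block and value names:

```llvm
; Sketch of the fully emulated form of: atomicrmw fadd ptr %addr, float %val
  %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private

atomicrmw.shared:
  %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared, float %val seq_cst
  br label %atomicrmw.phi

atomicrmw.check.private:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load float, ptr addrspace(5) %cast.private
  %val.new = fadd float %loaded.private, %val
  store float %val.new, ptr addrspace(5) %cast.private
  br label %atomicrmw.phi

atomicrmw.global:
  %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global, float %val seq_cst
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi float [ %loaded.shared, %atomicrmw.shared ],
                      [ %loaded.private, %atomicrmw.private ],
                      [ %loaded.global, %atomicrmw.global ]
```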
@@ -16825,6 +16805,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
@@ -16846,11 +16854,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16865,8 +16868,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   Instruction *Clone = AI->clone();
   Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
+  Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
   LoadedShared = Clone;
 
   Builder.CreateBr(PhiBB);
@@ -16878,14 +16880,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
 
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                                CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
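
In the cmpxchg case, the private block degrades to an ordinary non-atomic sequence, with the `{ value, success }` result aggregate rebuilt by the two `insertvalue`s above. Assuming `buildAtomicCmpXchgValue` emits the usual load/compare/select/store pattern, the private path for a 64-bit cmpxchg comes out roughly as this sketch (illustrative names):

```llvm
; Sketch: private-case lowering of cmpxchg ptr %p, i64 %cmp, i64 %new
  %cast.private = addrspacecast ptr %p to ptr addrspace(5)
  %loaded = load i64, ptr addrspace(5) %cast.private, align 8
  %success = icmp eq i64 %loaded, %cmp
  %store.val = select i1 %success, i64 %new, i64 %loaded
  store i64 %store.val, ptr addrspace(5) %cast.private, align 8
  %agg.0 = insertvalue { i64, i1 } poison, i64 %loaded, 0
  %agg = insertvalue { i64, i1 } %agg.0, i1 %success, 1
```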
@@ -16895,8 +16912,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16920,7 +16936,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16932,6 +16948,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
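
The identity-value rewrite at the top of the new `emitExpandAtomicRMW` is unchanged in effect; as a before/after sketch:

```llvm
; Before: an idempotent RMW spelled with 'or'.
%old = atomicrmw or ptr %p, i32 0 seq_cst
; After AI->setOperation(AtomicRMWInst::Add), it is handled as:
%old = atomicrmw add ptr %p, i32 0 seq_cst
; If %p is a flat pointer, it still goes through
; emitExpandAtomicAddrSpacePredicate, since the private-aliasing
; question is independent of the opcode.
```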