@@ -16588,9 +16588,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }
 
 const TargetRegisterClass *
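
For context, the three kinds returned above are consumed by the generic AtomicExpandPass. The sketch below is a simplified illustration of that dispatch, not the pass's actual source; it assumes the standard hooks, with lowerAtomicCmpXchgInst being LLVM's existing non-atomic lowering utility and emitExpandAtomicCmpXchg the target hook overridden at the end of this patch.

    // Illustrative sketch only: roughly how AtomicExpandPass might act on the
    // expansion kinds returned by shouldExpandAtomicCmpXchgInIR above.
    #include "llvm/CodeGen/TargetLowering.h"
    #include "llvm/Support/ErrorHandling.h"
    #include "llvm/Transforms/Utils/LowerAtomic.h"
    using namespace llvm;

    static bool expandCmpXchgSketch(const TargetLowering *TLI,
                                    AtomicCmpXchgInst *CI) {
      switch (TLI->shouldExpandAtomicCmpXchgInIR(CI)) {
      case TargetLowering::AtomicExpansionKind::None:
        return false; // Keep the atomic; selection emits a real instruction.
      case TargetLowering::AtomicExpansionKind::NotAtomic:
        // Private (scratch) memory is per-thread, so plain IR suffices.
        return lowerAtomicCmpXchgInst(CI);
      case TargetLowering::AtomicExpansionKind::Expand:
        // 64-bit flat cmpxchg that may alias private: branch on the address
        // space at run time (see emitExpandAtomicAddrSpacePredicate below).
        TLI->emitExpandAtomicCmpXchg(CI);
        return true;
      default:
        llvm_unreachable("other kinds are not produced by the hook above");
      }
    }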
@@ -16754,40 +16766,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }
 
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
@@ -16834,6 +16814,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();
 
@@ -16855,11 +16863,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
 
-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
@@ -16874,8 +16877,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
 
   Instruction *Clone = AI->clone();
   Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
+  Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
   LoadedShared = Clone;
 
   Builder.CreateBr(PhiBB);
@@ -16887,14 +16889,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
 
   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");
 
-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                                CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }
 
-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
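
buildAtomicCmpXchgValue itself is not part of this excerpt. Presumably it is the cmpxchg counterpart of buildAtomicRMWValue from LowerAtomic, emitting a plain load/compare/select/store and returning the loaded value together with the success flag; the helper below is a sketch of that assumed shape (the real signature may differ). The non-atomic sequence is sound on this path because private (scratch) memory is visible only to the issuing thread.

    // Sketch of the assumed behavior of buildAtomicCmpXchgValue; not taken
    // from this diff.
    #include "llvm/IR/IRBuilder.h"
    #include <utility>
    using namespace llvm;

    static std::pair<Value *, Value *>
    buildAtomicCmpXchgValueSketch(IRBuilderBase &Builder, Value *Ptr,
                                  Value *Cmp, Value *Val, Align Alignment) {
      // Load the current value, compare with the expected one, and store the
      // new value only when they match; return {original value, success bit}.
      LoadInst *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment);
      Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
      Value *Res = Builder.CreateSelect(Equal, Val, Orig);
      Builder.CreateAlignedStore(Res, Ptr, Alignment);
      return {Orig, Equal};
    }

The returned pair feeds the insertvalue sequence above, which rebuilds the {value, i1} struct a real cmpxchg would have produced.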
@@ -16904,8 +16921,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }
 
   AI->removeFromParent();
@@ -16929,7 +16945,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);
 
   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16941,6 +16957,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }
 
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
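
One definition this excerpt also leaves out is flatInstrMayAccessPrivate, the guard used by the new shouldExpandAtomicCmpXchgInIR logic in the first hunk. A plausible reading, stated here as an assumption: a flat access is presumed able to touch private memory unless !noalias.addrspace metadata on the instruction excludes the private address space.

    // Sketch of the assumed logic of flatInstrMayAccessPrivate; not shown in
    // this diff.
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // PrivateAS would be AMDGPUAS::PRIVATE_ADDRESS (5) on AMDGPU.
    static bool flatInstrMayAccessPrivateSketch(const Instruction *I,
                                                unsigned PrivateAS) {
      const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
      if (!MD)
        return true; // No metadata: conservatively assume private is reachable.
      // Operands form [Low, High) pairs of address spaces the location is
      // known *not* to be in; private is ruled out if some pair covers it.
      for (unsigned Idx = 0, E = MD->getNumOperands(); Idx + 1 < E; Idx += 2) {
        uint64_t Lo =
            mdconst::extract<ConstantInt>(MD->getOperand(Idx))->getZExtValue();
        uint64_t Hi =
            mdconst::extract<ConstantInt>(MD->getOperand(Idx + 1))->getZExtValue();
        if (Lo <= PrivateAS && PrivateAS < Hi)
          return false;
      }
      return true;
    }

With such a guard, only flat atomics that might genuinely address scratch pay for the predicated expansion; code annotated with !noalias.addrspace keeps the plain atomic instruction.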