@@ -16318,15 +16318,14 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
        ++I) {
     auto *Low = mdconst::extract<ConstantInt>(
         NoaliasAddrSpaceMD->getOperand(2 * I + 0));
-    auto *High = mdconst::extract<ConstantInt>(
-        NoaliasAddrSpaceMD->getOperand(2 * I + 1));
-
-    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
-        High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
-      return true;
+    if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
+      auto *High = mdconst::extract<ConstantInt>(
+          NoaliasAddrSpaceMD->getOperand(2 * I + 1));
+      return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
+    }
   }

-  return false;
+  return true;
 }

 TargetLowering::AtomicExpansionKind
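
For context: `flatInstrMayAccessPrivate` consults the `!noalias.addrspace` metadata, whose operands come in `(Low, High)` pairs describing half-open ranges `[Low, High)` of address spaces the pointer is known not to access. With the corrected check above, the query returns false only when a range covers AMDGPU's private address space (addrspace 5). A small illustrative IR sample (not from the patch) that the new code classifies as unable to access private:

```llvm
; The range [5, 6) rules out the private address space, so a flat atomic
; carrying this metadata is known not to touch private memory.
%old = atomicrmw add ptr %p, i64 1 seq_cst, !noalias.addrspace !0

!0 = !{i32 5, i32 6}
```
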
@@ -16573,9 +16572,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {

 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
-  return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
-             ? AtomicExpansionKind::NotAtomic
-             : AtomicExpansionKind::None;
+  unsigned AddrSpace = CmpX->getPointerAddressSpace();
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+    return AtomicExpansionKind::NotAtomic;
+
+  if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+    return AtomicExpansionKind::None;
+
+  const DataLayout &DL = CmpX->getDataLayout();
+
+  Type *ValTy = CmpX->getNewValOperand()->getType();
+
+  // If a 64-bit flat atomic may alias private, we need to avoid using the
+  // atomic in the private case.
+  return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+                                           : AtomicExpansionKind::None;
 }

 const TargetRegisterClass *
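
The reworked hook keeps the old behavior for private pointers (`NotAtomic`) and for non-flat address spaces (`None`), and only requests an IR-level expansion for a 64-bit flat `cmpxchg` that may alias private memory, since per the comment the 64-bit atomic must be avoided if the address turns out to be private. Illustrative IR, assuming no `!noalias.addrspace` metadata is present (so `flatInstrMayAccessPrivate` conservatively returns true):

```llvm
; Expanded in IR: 64-bit, flat address space, may alias private.
%r64 = cmpxchg ptr %p, i64 %cmp, i64 %new seq_cst seq_cst

; Left alone (AtomicExpansionKind::None): 32-bit flat cmpxchg.
%r32 = cmpxchg ptr %p, i32 %cmp, i32 %new seq_cst seq_cst
```
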
@@ -16741,40 +16752,8 @@ bool SITargetLowering::checkForPhysRegDependency(
   return false;
 }

-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
-  AtomicRMWInst::BinOp Op = AI->getOperation();
-
-  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
-      Op == AtomicRMWInst::Xor) {
-    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
-        ConstVal && ConstVal->isNullValue()) {
-      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-      AI->setOperation(AtomicRMWInst::Add);
-
-      // TODO: Turn the below private handling into a no-op for idempotent
-      // cases.
-    }
-  }
-
-  // The non-flat expansions should only perform the de-canonicalization of
-  // identity values.
-  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
-    return;
-
-  // FullFlatEmulation is true if we need to issue the private, shared, and
-  // global cases.
-  //
-  // If this is false, we are only dealing with the flat-targeting-private case,
-  // where we only insert a check for private and still use the flat instruction
-  // for global and shared.
-
-  // TODO: Avoid the private check for the fadd case depending on
-  // noalias.addrspace.
-
-  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
-                           Subtarget->hasAtomicFaddInsts() &&
-                           AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+    Instruction *AI) const {
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
   // With this expansion we produce the following code:
@@ -16821,6 +16800,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();

+  auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+                                : AtomicCmpXchgInst::getPointerOperandIndex();
+  Value *Addr = AI->getOperand(PtrOpIdx);
+
+  /// TODO: Only need to check private, then emit flat-known-not private (no
+  /// need for shared block, or cast to global).
+  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+  Align Alignment;
+  if (RMW)
+    Alignment = RMW->getAlign();
+  else if (CX)
+    Alignment = CX->getAlign();
+  else
+    llvm_unreachable("unhandled atomic operation");
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           RMW->getType()->isFloatTy();
+
   // If the return value isn't used, do not introduce a false use in the phi.
   bool ReturnValueIsUsed = !AI->use_empty();

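
Note that `FullFlatEmulation` can only be true for an `atomicrmw fadd` on `float` when the subtarget has the global fadd instructions; for a `cmpxchg` (`RMW` is null) this function always takes the private-check-only path, keeping the flat instruction for the global and shared cases. Illustrative inputs (hypothetical; which operations actually reach this function is decided by the `shouldExpand*InIR` hooks):

```llvm
; May take the full shared/private/global emulation on subtargets with
; global fadd instructions:
%f = atomicrmw fadd ptr %p, float %v syncscope("agent") monotonic

; Always takes the private-check-only path:
%c = cmpxchg ptr %p, i64 %cmp, i64 %new seq_cst seq_cst
```
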
@@ -16842,11 +16849,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
   BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);

-  Value *Val = AI->getValOperand();
-  Type *ValTy = Val->getType();
-  Value *Addr = AI->getPointerOperand();
-  Align Alignment = AI->getAlign();
-
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);

@@ -16861,8 +16863,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {

   Instruction *Clone = AI->clone();
   Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
+  Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
   LoadedShared = Clone;

   Builder.CreateBr(PhiBB);
@@ -16874,14 +16875,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);

   Builder.SetInsertPoint(PrivateBB);
+
   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
       Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
-  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
-                                                   Alignment, "loaded.private");

-  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+  Value *LoadedPrivate;
+  if (RMW) {
+    LoadedPrivate = Builder.CreateAlignedLoad(
+        RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+    Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+                                        LoadedPrivate, RMW->getValOperand());
+
+    Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+  } else {
+    auto [ResultLoad, Equal] =
+        buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+                          CX->getNewValOperand(), CX->getAlign());
+
+    Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+                                              ResultLoad, 0);
+    LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+  }

-  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
   Builder.CreateBr(PhiBB);

   Builder.SetInsertPoint(GlobalBB);
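
In the `cmpxchg` branch, the scalar result of `buildCmpXchgValue` is packed back into `cmpxchg`'s `{ValTy, i1}` aggregate return type; this is also why the phi created later switches from `ValTy` to `AI->getType()`. Assuming `buildCmpXchgValue` emits the usual non-atomic load/compare/select/store sequence, the private block would look roughly like this (names illustrative):

```llvm
atomicrmw.private:
  %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private, align 8
  %eq = icmp eq i64 %loaded.private, %cmp
  %store.val = select i1 %eq, i64 %new, i64 %loaded.private
  store i64 %store.val, ptr addrspace(5) %cast.private, align 8
  %r.0 = insertvalue { i64, i1 } poison, i64 %loaded.private, 0
  %r.1 = insertvalue { i64, i1 } %r.0, i1 %eq, 1
  br label %atomicrmw.phi
```
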
@@ -16891,8 +16907,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (FullFlatEmulation) {
     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
         Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-        .set(CastToGlobal);
+    AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
   }

   AI->removeFromParent();
@@ -16916,7 +16931,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.SetInsertPoint(PhiBB);

   if (ReturnValueIsUsed) {
-    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
     AI->replaceAllUsesWith(Loaded);
     if (FullFlatEmulation)
       Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16928,6 +16943,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(ExitBB);
 }

+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+  AtomicRMWInst::BinOp Op = AI->getOperation();
+
+  if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+      Op == AtomicRMWInst::Xor) {
+    if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // We may still need the private-alias-flat handling below.
+
+      // TODO: Skip this for cases where we cannot access remote memory.
+    }
+  }
+
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+  emitExpandAtomicAddrSpacePredicate(CI);
+}
+
 LoadInst *
 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
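
Putting it together: when `shouldExpandAtomicCmpXchgInIR` returns `Expand`, AtomicExpandPass hands the instruction to the new `emitExpandAtomicCmpXchg` hook, and the predicate expansion produces control flow along these lines for a flat 64-bit `cmpxchg` (a rough sketch with illustrative names, assuming the private check is emitted via the target's is-private test, e.g. `llvm.amdgcn.is.private`):

```llvm
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  ; non-atomic load/compare/store sequence from the earlier sketch,
  ; with the final insertvalue named %r.private here
  br label %atomicrmw.phi

atomicrmw.global:
  ; the original cmpxchg, now known not to access a private address
  %r.global = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
  br label %atomicrmw.phi

atomicrmw.phi:
  %r = phi { i64, i1 } [ %r.private, %atomicrmw.private ], [ %r.global, %atomicrmw.global ]
  br label %atomicrmw.end
```
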