Skip to content

Commit 229594f

Browse files
arsenm and PhilippRados
authored and committed
AMDGPU: Custom expand flat cmpxchg which may access private (llvm#109410)
64-bit flat cmpxchg instructions do not work correctly for scratch addresses, and need to be expanded as non-atomic. Allow custom expansion of cmpxchg in AtomicExpand, as is already the case for atomicrmw.
1 parent 2bcd160 commit 229594f

File tree

10 files changed

+1233
-172
lines changed

10 files changed

+1233
-172
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2204,6 +2204,11 @@ class TargetLoweringBase {
22042204
"Generic atomicrmw expansion unimplemented on this target");
22052205
}
22062206

2207+
/// Perform a cmpxchg expansion using a target-specific method.
2208+
virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
2209+
llvm_unreachable("Generic cmpxchg expansion unimplemented on this target");
2210+
}
2211+
22072212
/// Perform a bit test atomicrmw using a target-specific intrinsic. This
22082213
/// represents the combined bit test intrinsic which will be lowered at a late
22092214
/// stage by the backend.

llvm/include/llvm/Transforms/Utils/LowerAtomic.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ class IRBuilderBase;
2323
/// Convert the given Cmpxchg into primitive load and compare.
2424
bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI);
2525

26+
/// Emit IR to implement the given cmpxchg operation on values in registers,
27+
/// returning the new value.
28+
std::pair<Value *, Value *> buildCmpXchgValue(IRBuilderBase &Builder,
29+
Value *Ptr, Value *Cmp,
30+
Value *Val, Align Alignment);
31+
2632
/// Convert the given RMWI into primitive load and stores,
2733
/// assuming that doing so is legal. Return true if the lowering
2834
/// succeeds.

llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1674,6 +1674,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
16741674
return true;
16751675
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
16761676
return lowerAtomicCmpXchgInst(CI);
1677+
case TargetLoweringBase::AtomicExpansionKind::Expand: {
1678+
TLI->emitExpandAtomicCmpXchg(CI);
1679+
return true;
1680+
}
16771681
}
16781682
}
16791683

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 101 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -16318,15 +16318,14 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
1631816318
++I) {
1631916319
auto *Low = mdconst::extract<ConstantInt>(
1632016320
NoaliasAddrSpaceMD->getOperand(2 * I + 0));
16321-
auto *High = mdconst::extract<ConstantInt>(
16322-
NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16323-
16324-
if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS) &&
16325-
High->getValue().ult(AMDGPUAS::PRIVATE_ADDRESS))
16326-
return true;
16321+
if (Low->getValue().uge(AMDGPUAS::PRIVATE_ADDRESS)) {
16322+
auto *High = mdconst::extract<ConstantInt>(
16323+
NoaliasAddrSpaceMD->getOperand(2 * I + 1));
16324+
return High->getValue().ule(AMDGPUAS::PRIVATE_ADDRESS);
16325+
}
1632716326
}
1632816327

16329-
return false;
16328+
return true;
1633016329
}
1633116330

1633216331
TargetLowering::AtomicExpansionKind
@@ -16573,9 +16572,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
1657316572

1657416573
TargetLowering::AtomicExpansionKind
1657516574
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16576-
return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16577-
? AtomicExpansionKind::NotAtomic
16578-
: AtomicExpansionKind::None;
16575+
unsigned AddrSpace = CmpX->getPointerAddressSpace();
16576+
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16577+
return AtomicExpansionKind::NotAtomic;
16578+
16579+
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16580+
return AtomicExpansionKind::None;
16581+
16582+
const DataLayout &DL = CmpX->getDataLayout();
16583+
16584+
Type *ValTy = CmpX->getNewValOperand()->getType();
16585+
16586+
// If a 64-bit flat atomic may alias private, we need to avoid using the
16587+
// atomic in the private case.
16588+
return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16589+
: AtomicExpansionKind::None;
1657916590
}
1658016591

1658116592
const TargetRegisterClass *
@@ -16741,40 +16752,8 @@ bool SITargetLowering::checkForPhysRegDependency(
1674116752
return false;
1674216753
}
1674316754

16744-
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16745-
AtomicRMWInst::BinOp Op = AI->getOperation();
16746-
16747-
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16748-
Op == AtomicRMWInst::Xor) {
16749-
if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16750-
ConstVal && ConstVal->isNullValue()) {
16751-
// atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16752-
AI->setOperation(AtomicRMWInst::Add);
16753-
16754-
// TODO: Turn the below private handling into a no-op for idempotent
16755-
// cases.
16756-
}
16757-
}
16758-
16759-
// The non-flat expansions should only perform the de-canonicalization of
16760-
// identity values.
16761-
if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16762-
return;
16763-
16764-
// FullFlatEmulation is true if we need to issue the private, shared, and
16765-
// global cases.
16766-
//
16767-
// If this is false, we are only dealing with the flat-targeting-private case,
16768-
// where we only insert a check for private and still use the flat instruction
16769-
// for global and shared.
16770-
16771-
// TODO: Avoid the private check for the fadd case depending on
16772-
// noalias.addrspace.
16773-
16774-
bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16775-
Subtarget->hasAtomicFaddInsts() &&
16776-
AI->getType()->isFloatTy();
16777-
16755+
void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
16756+
Instruction *AI) const {
1677816757
// Given: atomicrmw fadd ptr %addr, float %val ordering
1677916758
//
1678016759
// With this expansion we produce the following code:
@@ -16821,6 +16800,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1682116800
IRBuilder<> Builder(AI);
1682216801
LLVMContext &Ctx = Builder.getContext();
1682316802

16803+
auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16804+
const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16805+
: AtomicCmpXchgInst::getPointerOperandIndex();
16806+
Value *Addr = AI->getOperand(PtrOpIdx);
16807+
16808+
/// TODO: Only need to check private, then emit flat-known-not private (no
16809+
/// need for shared block, or cast to global).
16810+
AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
16811+
16812+
Align Alignment;
16813+
if (RMW)
16814+
Alignment = RMW->getAlign();
16815+
else if (CX)
16816+
Alignment = CX->getAlign();
16817+
else
16818+
llvm_unreachable("unhandled atomic operation");
16819+
16820+
// FullFlatEmulation is true if we need to issue the private, shared, and
16821+
// global cases.
16822+
//
16823+
// If this is false, we are only dealing with the flat-targeting-private case,
16824+
// where we only insert a check for private and still use the flat instruction
16825+
// for global and shared.
16826+
16827+
bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
16828+
Subtarget->hasAtomicFaddInsts() &&
16829+
RMW->getType()->isFloatTy();
16830+
1682416831
// If the return value isn't used, do not introduce a false use in the phi.
1682516832
bool ReturnValueIsUsed = !AI->use_empty();
1682616833

@@ -16842,11 +16849,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1684216849
BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
1684316850
BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
1684416851

16845-
Value *Val = AI->getValOperand();
16846-
Type *ValTy = Val->getType();
16847-
Value *Addr = AI->getPointerOperand();
16848-
Align Alignment = AI->getAlign();
16849-
1685016852
std::prev(BB->end())->eraseFromParent();
1685116853
Builder.SetInsertPoint(BB);
1685216854

@@ -16861,8 +16863,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1686116863

1686216864
Instruction *Clone = AI->clone();
1686316865
Clone->insertInto(SharedBB, SharedBB->end());
16864-
Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16865-
.set(CastToLocal);
16866+
Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
1686616867
LoadedShared = Clone;
1686716868

1686816869
Builder.CreateBr(PhiBB);
@@ -16874,14 +16875,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1687416875
Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
1687516876

1687616877
Builder.SetInsertPoint(PrivateBB);
16878+
1687716879
Value *CastToPrivate = Builder.CreateAddrSpaceCast(
1687816880
Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16879-
Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
16880-
Alignment, "loaded.private");
1688116881

16882-
Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
16882+
Value *LoadedPrivate;
16883+
if (RMW) {
16884+
LoadedPrivate = Builder.CreateAlignedLoad(
16885+
RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
16886+
16887+
Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
16888+
LoadedPrivate, RMW->getValOperand());
16889+
16890+
Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
16891+
} else {
16892+
auto [ResultLoad, Equal] =
16893+
buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
16894+
CX->getNewValOperand(), CX->getAlign());
16895+
16896+
Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
16897+
ResultLoad, 0);
16898+
LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
16899+
}
1688316900

16884-
Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
1688516901
Builder.CreateBr(PhiBB);
1688616902

1688716903
Builder.SetInsertPoint(GlobalBB);
@@ -16891,8 +16907,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1689116907
if (FullFlatEmulation) {
1689216908
Value *CastToGlobal = Builder.CreateAddrSpaceCast(
1689316909
Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16894-
AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16895-
.set(CastToGlobal);
16910+
AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
1689616911
}
1689716912

1689816913
AI->removeFromParent();
@@ -16916,7 +16931,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1691616931
Builder.SetInsertPoint(PhiBB);
1691716932

1691816933
if (ReturnValueIsUsed) {
16919-
PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
16934+
PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
1692016935
AI->replaceAllUsesWith(Loaded);
1692116936
if (FullFlatEmulation)
1692216937
Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16928,6 +16943,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1692816943
Builder.CreateBr(ExitBB);
1692916944
}
1693016945

16946+
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16947+
AtomicRMWInst::BinOp Op = AI->getOperation();
16948+
16949+
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16950+
Op == AtomicRMWInst::Xor) {
16951+
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16952+
ConstVal && ConstVal->isNullValue()) {
16953+
// atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16954+
AI->setOperation(AtomicRMWInst::Add);
16955+
16956+
// We may still need the private-alias-flat handling below.
16957+
16958+
// TODO: Skip this for cases where we cannot access remote memory.
16959+
}
16960+
}
16961+
16962+
// The non-flat expansions should only perform the de-canonicalization of
16963+
// identity values.
16964+
if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16965+
return;
16966+
16967+
emitExpandAtomicAddrSpacePredicate(AI);
16968+
}
16969+
16970+
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
16971+
emitExpandAtomicAddrSpacePredicate(CI);
16972+
}
16973+
1693116974
LoadInst *
1693216975
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
1693316976
IRBuilder<> Builder(AI);

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
544544
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
545545
AtomicExpansionKind
546546
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
547+
548+
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
547549
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
550+
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
548551

549552
LoadInst *
550553
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

llvm/lib/Transforms/Utils/LowerAtomic.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,20 +25,30 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
2525
Value *Cmp = CXI->getCompareOperand();
2626
Value *Val = CXI->getNewValOperand();
2727

28-
LoadInst *Orig =
29-
Builder.CreateAlignedLoad(Val->getType(), Ptr, CXI->getAlign());
30-
Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
31-
Value *Res = Builder.CreateSelect(Equal, Val, Orig);
32-
Builder.CreateAlignedStore(Res, Ptr, CXI->getAlign());
28+
auto [Orig, Equal] =
29+
buildCmpXchgValue(Builder, Ptr, Cmp, Val, CXI->getAlign());
3330

34-
Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
31+
Value *Res =
32+
Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
3533
Res = Builder.CreateInsertValue(Res, Equal, 1);
3634

3735
CXI->replaceAllUsesWith(Res);
3836
CXI->eraseFromParent();
3937
return true;
4038
}
4139

40+
std::pair<Value *, Value *> llvm::buildCmpXchgValue(IRBuilderBase &Builder,
41+
Value *Ptr, Value *Cmp,
42+
Value *Val,
43+
Align Alignment) {
44+
LoadInst *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment);
45+
Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
46+
Value *Res = Builder.CreateSelect(Equal, Val, Orig);
47+
Builder.CreateAlignedStore(Res, Ptr, Alignment);
48+
49+
return {Orig, Equal};
50+
}
51+
4252
Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op,
4353
IRBuilderBase &Builder, Value *Loaded,
4454
Value *Val) {

0 commit comments

Comments
 (0)