Skip to content

Commit 71c1435

Browse files
committed
AMDGPU: Custom expand flat cmpxchg which may access private
64-bit flat cmpxchg instructions do not work correctly for scratch addresses, and need to be expanded as non-atomic. Allow custom expansion of cmpxchg in AtomicExpand, as is already the case for atomicrmw.
1 parent: 500dc8a · commit: 71c1435

File tree

10 files changed

+1161
-168
lines changed

10 files changed

+1161
-168
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2204,6 +2204,11 @@ class TargetLoweringBase {
22042204
"Generic atomicrmw expansion unimplemented on this target");
22052205
}
22062206

2207+
/// Perform a cmpxchg expansion using a target-specific method.
2208+
virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
2209+
llvm_unreachable("Generic cmpxchg expansion unimplemented on this target");
2210+
}
2211+
22072212
/// Perform a bit test atomicrmw using a target-specific intrinsic. This
22082213
/// represents the combined bit test intrinsic which will be lowered at a late
22092214
/// stage by the backend.

llvm/include/llvm/Transforms/Utils/LowerAtomic.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,13 @@ class IRBuilderBase;
2323
/// Convert the given Cmpxchg into primitive load and compare.
2424
bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI);
2525

26+
/// Emit IR to implement the given cmpxchg operation on values in registers,
27+
/// returning the new value.
28+
std::pair<Value *, Value *> buildAtomicCmpXchgValue(IRBuilderBase &Builder,
29+
Value *Ptr, Value *Cmp,
30+
Value *Val,
31+
Align Alignment);
32+
2633
/// Convert the given RMWI into primitive load and stores,
2734
/// assuming that doing so is legal. Return true if the lowering
2835
/// succeeds.

llvm/lib/CodeGen/AtomicExpandPass.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1671,6 +1671,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
16711671
return true;
16721672
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
16731673
return lowerAtomicCmpXchgInst(CI);
1674+
case TargetLoweringBase::AtomicExpansionKind::Expand: {
1675+
TLI->emitExpandAtomicCmpXchg(CI);
1676+
return true;
1677+
}
16741678
}
16751679
}
16761680

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 95 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -16497,9 +16497,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
1649716497

1649816498
TargetLowering::AtomicExpansionKind
1649916499
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16500-
return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16501-
? AtomicExpansionKind::NotAtomic
16502-
: AtomicExpansionKind::None;
16500+
unsigned AddrSpace = CmpX->getPointerAddressSpace();
16501+
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
16502+
return AtomicExpansionKind::NotAtomic;
16503+
16504+
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
16505+
return AtomicExpansionKind::None;
16506+
16507+
const DataLayout &DL = CmpX->getDataLayout();
16508+
16509+
Type *ValTy = CmpX->getNewValOperand()->getType();
16510+
16511+
// If a 64-bit flat atomic may alias private, we need to avoid using the
16512+
// atomic in the private case.
16513+
return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
16514+
: AtomicExpansionKind::None;
1650316515
}
1650416516

1650516517
const TargetRegisterClass *
@@ -16663,40 +16675,8 @@ bool SITargetLowering::checkForPhysRegDependency(
1666316675
return false;
1666416676
}
1666516677

16666-
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16667-
AtomicRMWInst::BinOp Op = AI->getOperation();
16668-
16669-
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16670-
Op == AtomicRMWInst::Xor) {
16671-
if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16672-
ConstVal && ConstVal->isNullValue()) {
16673-
// atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16674-
AI->setOperation(AtomicRMWInst::Add);
16675-
16676-
// TODO: Turn the below private handling into a no-op for idempotent
16677-
// cases.
16678-
}
16679-
}
16680-
16681-
// The non-flat expansions should only perform the de-canonicalization of
16682-
// identity values.
16683-
if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16684-
return;
16685-
16686-
// FullFlatEmulation is true if we need to issue the private, shared, and
16687-
// global cases.
16688-
//
16689-
// If this is false, we are only dealing with the flat-targeting-private case,
16690-
// where we only insert a check for private and still use the flat instruction
16691-
// for global and shared.
16692-
16693-
// TODO: Avoid the private check for the fadd case depending on
16694-
// noalias.addrspace.
16695-
16696-
bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
16697-
Subtarget->hasAtomicFaddInsts() &&
16698-
AI->getType()->isFloatTy();
16699-
16678+
void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
16679+
Instruction *AI) const {
1670016680
// Given: atomicrmw fadd ptr %addr, float %val ordering
1670116681
//
1670216682
// With this expansion we produce the following code:
@@ -16743,6 +16723,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1674316723
IRBuilder<> Builder(AI);
1674416724
LLVMContext &Ctx = Builder.getContext();
1674516725

16726+
auto *RMW = dyn_cast<AtomicRMWInst>(AI);
16727+
const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
16728+
: AtomicCmpXchgInst::getPointerOperandIndex();
16729+
Value *Addr = AI->getOperand(PtrOpIdx);
16730+
16731+
/// TODO: Only need to check private, then emit flat-known-not private (no
16732+
/// need for shared block, or cast to global).
16733+
AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
16734+
16735+
Align Alignment;
16736+
if (RMW)
16737+
Alignment = RMW->getAlign();
16738+
else if (CX)
16739+
Alignment = CX->getAlign();
16740+
else
16741+
llvm_unreachable("unhandled atomic operation");
16742+
16743+
// FullFlatEmulation is true if we need to issue the private, shared, and
16744+
// global cases.
16745+
//
16746+
// If this is false, we are only dealing with the flat-targeting-private case,
16747+
// where we only insert a check for private and still use the flat instruction
16748+
// for global and shared.
16749+
16750+
bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
16751+
Subtarget->hasAtomicFaddInsts() &&
16752+
RMW->getType()->isFloatTy();
16753+
1674616754
// If the return value isn't used, do not introduce a false use in the phi.
1674716755
bool ReturnValueIsUsed = !AI->use_empty();
1674816756

@@ -16764,11 +16772,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1676416772
BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
1676516773
BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
1676616774

16767-
Value *Val = AI->getValOperand();
16768-
Type *ValTy = Val->getType();
16769-
Value *Addr = AI->getPointerOperand();
16770-
Align Alignment = AI->getAlign();
16771-
1677216775
std::prev(BB->end())->eraseFromParent();
1677316776
Builder.SetInsertPoint(BB);
1677416777

@@ -16783,8 +16786,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1678316786

1678416787
Instruction *Clone = AI->clone();
1678516788
Clone->insertInto(SharedBB, SharedBB->end());
16786-
Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16787-
.set(CastToLocal);
16789+
Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
1678816790
LoadedShared = Clone;
1678916791

1679016792
Builder.CreateBr(PhiBB);
@@ -16796,14 +16798,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1679616798
Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
1679716799

1679816800
Builder.SetInsertPoint(PrivateBB);
16801+
1679916802
Value *CastToPrivate = Builder.CreateAddrSpaceCast(
1680016803
Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
16801-
Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
16802-
Alignment, "loaded.private");
1680316804

16804-
Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
16805+
Value *LoadedPrivate;
16806+
if (RMW) {
16807+
LoadedPrivate = Builder.CreateAlignedLoad(
16808+
RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
16809+
16810+
Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
16811+
LoadedPrivate, RMW->getValOperand());
16812+
16813+
Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
16814+
} else {
16815+
auto [ResultLoad, Equal] =
16816+
buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
16817+
CX->getNewValOperand(), CX->getAlign());
16818+
16819+
Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
16820+
ResultLoad, 0);
16821+
LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
16822+
}
1680516823

16806-
Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
1680716824
Builder.CreateBr(PhiBB);
1680816825

1680916826
Builder.SetInsertPoint(GlobalBB);
@@ -16813,8 +16830,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1681316830
if (FullFlatEmulation) {
1681416831
Value *CastToGlobal = Builder.CreateAddrSpaceCast(
1681516832
Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
16816-
AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
16817-
.set(CastToGlobal);
16833+
AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
1681816834
}
1681916835

1682016836
AI->removeFromParent();
@@ -16838,7 +16854,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1683816854
Builder.SetInsertPoint(PhiBB);
1683916855

1684016856
if (ReturnValueIsUsed) {
16841-
PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
16857+
PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
1684216858
AI->replaceAllUsesWith(Loaded);
1684316859
if (FullFlatEmulation)
1684416860
Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16850,6 +16866,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
1685016866
Builder.CreateBr(ExitBB);
1685116867
}
1685216868

16869+
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16870+
AtomicRMWInst::BinOp Op = AI->getOperation();
16871+
16872+
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16873+
Op == AtomicRMWInst::Xor) {
16874+
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
16875+
ConstVal && ConstVal->isNullValue()) {
16876+
// atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16877+
AI->setOperation(AtomicRMWInst::Add);
16878+
16879+
// We may still need the private-alias-flat handling below.
16880+
16881+
// TODO: Skip this for cases where we cannot access remote memory.
16882+
}
16883+
}
16884+
16885+
// The non-flat expansions should only perform the de-canonicalization of
16886+
// identity values.
16887+
if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
16888+
return;
16889+
16890+
emitExpandAtomicAddrSpacePredicate(AI);
16891+
}
16892+
16893+
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
16894+
emitExpandAtomicAddrSpacePredicate(CI);
16895+
}
16896+
1685316897
LoadInst *
1685416898
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
1685516899
IRBuilder<> Builder(AI);

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
544544
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
545545
AtomicExpansionKind
546546
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
547+
548+
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
547549
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
550+
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
548551

549552
LoadInst *
550553
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

llvm/lib/Transforms/Utils/LowerAtomic.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,20 +25,29 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
2525
Value *Cmp = CXI->getCompareOperand();
2626
Value *Val = CXI->getNewValOperand();
2727

28-
LoadInst *Orig =
29-
Builder.CreateAlignedLoad(Val->getType(), Ptr, CXI->getAlign());
30-
Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
31-
Value *Res = Builder.CreateSelect(Equal, Val, Orig);
32-
Builder.CreateAlignedStore(Res, Ptr, CXI->getAlign());
28+
auto [Orig, Equal] =
29+
buildAtomicCmpXchgValue(Builder, Ptr, Cmp, Val, CXI->getAlign());
3330

34-
Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
31+
Value *Res =
32+
Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
3533
Res = Builder.CreateInsertValue(Res, Equal, 1);
3634

3735
CXI->replaceAllUsesWith(Res);
3836
CXI->eraseFromParent();
3937
return true;
4038
}
4139

40+
std::pair<Value *, Value *>
41+
llvm::buildAtomicCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp,
42+
Value *Val, Align Alignment) {
43+
LoadInst *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment);
44+
Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
45+
Value *Res = Builder.CreateSelect(Equal, Val, Orig);
46+
Builder.CreateAlignedStore(Res, Ptr, Alignment);
47+
48+
return {Orig, Equal};
49+
}
50+
4251
Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op,
4352
IRBuilderBase &Builder, Value *Loaded,
4453
Value *Val) {

0 commit comments

Comments (0)