AMDGPU: Custom expand flat cmpxchg which may access private #109410
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

64-bit flat cmpxchg instructions do not work correctly for scratch addresses, and need to be expanded as non-atomic.

Allow custom expansion of cmpxchg in AtomicExpand, as is already the case for atomicrmw.

Patch is 90.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109410.diff

10 Files Affected:
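For context, here is a minimal hand-written IR example (not taken from the patch or its tests; the function name is made up) of the pattern being addressed: a 64-bit cmpxchg through a flat pointer that may alias a scratch address.

; A 64-bit cmpxchg on a flat (address space 0) pointer. If %ptr can point at
; private (scratch) memory, the 64-bit flat cmpxchg instruction cannot be used
; directly, so the target now asks AtomicExpand to expand the operation.
define i64 @flat_cmpxchg_may_alias_scratch(ptr %ptr, i64 %old, i64 %new) {
  %pair = cmpxchg ptr %ptr, i64 %old, i64 %new seq_cst seq_cst, align 8
  %loaded = extractvalue { i64, i1 } %pair, 0
  ret i64 %loaded
}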
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3842af56e6b3d7..678b169568afcf 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2204,6 +2204,11 @@ class TargetLoweringBase {
"Generic atomicrmw expansion unimplemented on this target");
}
+ /// Perform a cmpxchg expansion using a target-specific method.
+ virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ llvm_unreachable("Generic cmpxchg expansion unimplemented on this target");
+ }
+
/// Perform a bit test atomicrmw using a target-specific intrinsic. This
/// represents the combined bit test intrinsic which will be lowered at a late
/// stage by the backend.
diff --git a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
index b25b281667f9cb..295c2bd2b4b47e 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerAtomic.h
@@ -23,6 +23,13 @@ class IRBuilderBase;
/// Convert the given Cmpxchg into primitive load and compare.
bool lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI);
+/// Emit IR to implement the given cmpxchg operation on values in registers,
+/// returning the new value.
+std::pair<Value *, Value *> buildAtomicCmpXchgValue(IRBuilderBase &Builder,
+ Value *Ptr, Value *Cmp,
+ Value *Val,
+ Align Alignment);
+
/// Convert the given RMWI into primitive load and stores,
/// assuming that doing so is legal. Return true if the lowering
/// succeeds.
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 5a3e529e5ebd02..37242d16001ca3 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -1671,6 +1671,10 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
return true;
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
return lowerAtomicCmpXchgInst(CI);
+ case TargetLoweringBase::AtomicExpansionKind::Expand: {
+ TLI->emitExpandAtomicCmpXchg(CI);
+ return true;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index febd741f947ee1..889c174eec07c6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16497,9 +16497,21 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
- return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
- ? AtomicExpansionKind::NotAtomic
- : AtomicExpansionKind::None;
+ unsigned AddrSpace = CmpX->getPointerAddressSpace();
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
+ return AtomicExpansionKind::NotAtomic;
+
+ if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
+ return AtomicExpansionKind::None;
+
+ const DataLayout &DL = CmpX->getDataLayout();
+
+ Type *ValTy = CmpX->getNewValOperand()->getType();
+
+ // If a 64-bit flat atomic may alias private, we need to avoid using the
+ // atomic in the private case.
+ return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
+ : AtomicExpansionKind::None;
}
const TargetRegisterClass *
@@ -16663,40 +16675,8 @@ bool SITargetLowering::checkForPhysRegDependency(
return false;
}
-void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
- AtomicRMWInst::BinOp Op = AI->getOperation();
-
- if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
- Op == AtomicRMWInst::Xor) {
- if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
- ConstVal && ConstVal->isNullValue()) {
- // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
- AI->setOperation(AtomicRMWInst::Add);
-
- // TODO: Turn the below private handling into a no-op for idempotent
- // cases.
- }
- }
-
- // The non-flat expansions should only perform the de-canonicalization of
- // identity values.
- if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
- return;
-
- // FullFlatEmulation is true if we need to issue the private, shared, and
- // global cases.
- //
- // If this is false, we are only dealing with the flat-targeting-private case,
- // where we only insert a check for private and still use the flat instruction
- // for global and shared.
-
- // TODO: Avoid the private check for the fadd case depending on
- // noalias.addrspace.
-
- bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
- Subtarget->hasAtomicFaddInsts() &&
- AI->getType()->isFloatTy();
-
+void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
+ Instruction *AI) const {
// Given: atomicrmw fadd ptr %addr, float %val ordering
//
// With this expansion we produce the following code:
@@ -16743,6 +16723,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
LLVMContext &Ctx = Builder.getContext();
+ auto *RMW = dyn_cast<AtomicRMWInst>(AI);
+ const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
+ : AtomicCmpXchgInst::getPointerOperandIndex();
+ Value *Addr = AI->getOperand(PtrOpIdx);
+
+ /// TODO: Only need to check private, then emit flat-known-not private (no
+ /// need for shared block, or cast to global).
+ AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
+
+ Align Alignment;
+ if (RMW)
+ Alignment = RMW->getAlign();
+ else if (CX)
+ Alignment = CX->getAlign();
+ else
+ llvm_unreachable("unhandled atomic operation");
+
+ // FullFlatEmulation is true if we need to issue the private, shared, and
+ // global cases.
+ //
+ // If this is false, we are only dealing with the flat-targeting-private case,
+ // where we only insert a check for private and still use the flat instruction
+ // for global and shared.
+
+ bool FullFlatEmulation = RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
+ Subtarget->hasAtomicFaddInsts() &&
+ RMW->getType()->isFloatTy();
+
// If the return value isn't used, do not introduce a false use in the phi.
bool ReturnValueIsUsed = !AI->use_empty();
@@ -16764,11 +16772,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
- Value *Val = AI->getValOperand();
- Type *ValTy = Val->getType();
- Value *Addr = AI->getPointerOperand();
- Align Alignment = AI->getAlign();
-
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
@@ -16783,8 +16786,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Instruction *Clone = AI->clone();
Clone->insertInto(SharedBB, SharedBB->end());
- Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
- .set(CastToLocal);
+ Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
LoadedShared = Clone;
Builder.CreateBr(PhiBB);
@@ -16796,14 +16798,29 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
Builder.SetInsertPoint(PrivateBB);
+
Value *CastToPrivate = Builder.CreateAddrSpaceCast(
Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
- Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
- Alignment, "loaded.private");
- Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);
+ Value *LoadedPrivate;
+ if (RMW) {
+ LoadedPrivate = Builder.CreateAlignedLoad(
+ RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
+
+ Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
+ LoadedPrivate, RMW->getValOperand());
+
+ Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
+ } else {
+ auto [ResultLoad, Equal] =
+ buildAtomicCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
+ CX->getNewValOperand(), CX->getAlign());
+
+ Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
+ ResultLoad, 0);
+ LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
+ }
- Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
Builder.CreateBr(PhiBB);
Builder.SetInsertPoint(GlobalBB);
@@ -16813,8 +16830,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
if (FullFlatEmulation) {
Value *CastToGlobal = Builder.CreateAddrSpaceCast(
Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
- AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
- .set(CastToGlobal);
+ AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
}
AI->removeFromParent();
@@ -16838,7 +16854,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Builder.SetInsertPoint(PhiBB);
if (ReturnValueIsUsed) {
- PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+ PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
AI->replaceAllUsesWith(Loaded);
if (FullFlatEmulation)
Loaded->addIncoming(LoadedShared, SharedBB);
@@ -16850,6 +16866,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Builder.CreateBr(ExitBB);
}
+void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+ AtomicRMWInst::BinOp Op = AI->getOperation();
+
+ if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
+ Op == AtomicRMWInst::Xor) {
+ if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+ ConstVal && ConstVal->isNullValue()) {
+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+ AI->setOperation(AtomicRMWInst::Add);
+
+ // We may still need the private-alias-flat handling below.
+
+ // TODO: Skip this for cases where we cannot access remote memory.
+ }
+ }
+
+ // The non-flat expansions should only perform the de-canonicalization of
+ // identity values.
+ if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+ return;
+
+ emitExpandAtomicAddrSpacePredicate(AI);
+}
+
+void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
+ emitExpandAtomicAddrSpacePredicate(CI);
+}
+
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 6c3edf37945e24..32e110fdfa84d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -544,7 +544,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
AtomicExpansionKind
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
+
+ void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
+ void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp
index 8b3a0ce338e577..89c49d4a0732f6 100644
--- a/llvm/lib/Transforms/Utils/LowerAtomic.cpp
+++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp
@@ -25,13 +25,11 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
Value *Cmp = CXI->getCompareOperand();
Value *Val = CXI->getNewValOperand();
- LoadInst *Orig =
- Builder.CreateAlignedLoad(Val->getType(), Ptr, CXI->getAlign());
- Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
- Value *Res = Builder.CreateSelect(Equal, Val, Orig);
- Builder.CreateAlignedStore(Res, Ptr, CXI->getAlign());
+ auto [Orig, Equal] =
+ buildAtomicCmpXchgValue(Builder, Ptr, Cmp, Val, CXI->getAlign());
- Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
+ Value *Res =
+ Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
Res = Builder.CreateInsertValue(Res, Equal, 1);
CXI->replaceAllUsesWith(Res);
@@ -39,6 +37,17 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
return true;
}
+std::pair<Value *, Value *>
+llvm::buildAtomicCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp,
+ Value *Val, Align Alignment) {
+ LoadInst *Orig = Builder.CreateAlignedLoad(Val->getType(), Ptr, Alignment);
+ Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+ Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+ Builder.CreateAlignedStore(Res, Ptr, Alignment);
+
+ return {Orig, Equal};
+}
+
Value *llvm::buildAtomicRMWValue(AtomicRMWInst::BinOp Op,
IRBuilderBase &Builder, Value *Loaded,
Value *Val) {
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 7b5797d4139a19..53d63c297d8098 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -5220,11 +5220,29 @@ entry:
define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) {
; GCN1-LABEL: atomic_cmpxchg_i64_offset:
; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN1-NEXT: s_mov_b32 s14, -1
+; GCN1-NEXT: s_mov_b32 s15, 0xe8f000
; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dword s8, s[2:3], 0x3f
; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_add_u32 s12, s12, s9
+; GCN1-NEXT: s_addc_u32 s13, s13, 0
; GCN1-NEXT: s_waitcnt lgkmcnt(0)
; GCN1-NEXT: s_add_u32 s2, s4, 32
; GCN1-NEXT: s_addc_u32 s3, s5, 0
+; GCN1-NEXT: s_cmp_eq_u32 s3, s8
+; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN1-NEXT: s_mov_b64 s[4:5], -1
+; GCN1-NEXT: s_cbranch_vccnz .LBB90_3
+; GCN1-NEXT: ; %bb.1: ; %Flow
+; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN1-NEXT: s_cbranch_vccz .LBB90_4
+; GCN1-NEXT: .LBB90_2: ; %atomicrmw.phi
+; GCN1-NEXT: s_endpgm
+; GCN1-NEXT: .LBB90_3: ; %atomicrmw.global
; GCN1-NEXT: v_mov_b32_e32 v5, s3
; GCN1-NEXT: v_mov_b32_e32 v0, s6
; GCN1-NEXT: v_mov_b32_e32 v1, s7
@@ -5234,15 +5252,51 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_cbranch_execnz .LBB90_2
+; GCN1-NEXT: .LBB90_4: ; %atomicrmw.private
+; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0
+; GCN1-NEXT: v_mov_b32_e32 v5, s6
+; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec
+; GCN1-NEXT: s_cselect_b32 s2, s2, -1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: s_add_i32 s2, s2, 4
+; GCN1-NEXT: v_mov_b32_e32 v3, s2
+; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen
+; GCN1-NEXT: v_mov_b32_e32 v4, s7
+; GCN1-NEXT: s_waitcnt vmcnt(0)
+; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1]
+; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
+; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen
; GCN1-NEXT: s_endpgm
;
; GCN2-LABEL: atomic_cmpxchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GCN2-NEXT: s_mov_b32 s90, -1
+; GCN2-NEXT: s_mov_b32 s91, 0xe80000
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dword s8, s[2:3], 0xfc
; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_add_u32 s88, s88, s9
+; GCN2-NEXT: s_addc_u32 s89, s89, 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_add_u32 s2, s4, 32
; GCN2-NEXT: s_addc_u32 s3, s5, 0
+; GCN2-NEXT: s_cmp_eq_u32 s3, s8
+; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN2-NEXT: s_mov_b64 s[4:5], -1
+; GCN2-NEXT: s_cbranch_vccnz .LBB90_3
+; GCN2-NEXT: ; %bb.1: ; %Flow
+; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GCN2-NEXT: s_cbranch_vccz .LBB90_4
+; GCN2-NEXT: .LBB90_2: ; %atomicrmw.phi
+; GCN2-NEXT: s_endpgm
+; GCN2-NEXT: .LBB90_3: ; %atomicrmw.global
; GCN2-NEXT: v_mov_b32_e32 v5, s3
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
@@ -5252,6 +5306,23 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_cbranch_execnz .LBB90_2
+; GCN2-NEXT: .LBB90_4: ; %atomicrmw.private
+; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GCN2-NEXT: s_cselect_b32 s2, s2, -1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: s_add_i32 s2, s2, 4
+; GCN2-NEXT: v_mov_b32_e32 v3, s2
+; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen
+; GCN2-NEXT: v_mov_b32_e32 v5, s6
+; GCN2-NEXT: v_mov_b32_e32 v4, s7
+; GCN2-NEXT: s_waitcnt vmcnt(0)
+; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1]
+; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
+; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_offset:
@@ -5259,14 +5330,39 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_mov_b64 s[8:9], src_private_base
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], 32
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cmp_eq_u32 s3, s9
+; GFX12-NEXT: s_cselect_b32 s4, -1, 0
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_mov_b32 s4, -1
+; GFX12-NEXT: s_cbranch_vccnz .LBB90_3
+; GFX12-NEXT: ; %bb.1: ; %Flow
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_vccz .LBB90_4
+; GFX12-NEXT: .LBB90_2: ; %atomicrmw.phi
+; GFX12-NEXT: s_endpgm
+; GFX12-NEXT: .LBB90_3: ; %atomicrmw.global
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
; GFX12-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_cbranch_execnz .LBB90_2
+; GFX12-NEXT: .LBB90_4: ; %atomicrmw.private
+; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX12-NEXT: s_cselect_b32 s2, s2, -1
+; GFX12-NEXT: scratch_load_b64 v[0:1], off, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s6, vcc_lo
+; GFX12-NEXT: scratch_store_b64 off, v[0:1], s2
; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, p...
[truncated]
@@ -43,7 +43,7 @@ define i64 @test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) {
 ; ALL: [[ATOMICRMW_PRIVATE]]:
 ; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
 ; ALL-NEXT: [[LOADED_PRIVATE:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 8
-; ALL-NEXT: [[NEW:%.*]] = sub i64 [[LOADED_PRIVATE]], 0
+; ALL-NEXT: [[NEW:%.*]] = add i64 [[LOADED_PRIVATE]], 0
Why does this transform happen more often now?
Because it would require more work to avoid doing it, but there's not much reason to.
All of the 64-bit cases now go to expand. emitExpandAtomicRMW isn't bothering to restrict this to the specific cases where it's needed.
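To make the answer concrete, here is a hand-written example (a hypothetical function, not one of the files in the test update): the idempotent sub/or/xor-with-zero forms are canonicalized to add-with-zero before the address-space predication runs, so the private branch of the expansion ends up containing the add seen in the updated CHECK line.

; Input: an idempotent RMW through a flat pointer.
define i64 @sub_zero(ptr %p) {
  %res = atomicrmw sub ptr %p, i64 0 syncscope("agent") seq_cst, align 8
  ret i64 %res
}
; emitExpandAtomicRMW first rewrites the operation:
;   atomicrmw sub ptr %p, i64 0  -->  atomicrmw add ptr %p, i64 0
; and the private branch of the flat expansion then lowers it to a plain
; load/add/store through the addrspace(5) pointer:
;   %loaded.private = load i64, ptr addrspace(5) %cast, align 8
;   %new = add i64 %loaded.private, 0
;   store i64 %new, ptr addrspace(5) %cast, align 8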
64-bit flat cmpxchg instructions do not work correctly for scratch addresses, and need to be expanded as non-atomic. Allow custom expansion of cmpxchg in AtomicExpand, as is already the case for atomicrmw.
Switch from using range parsing, which was off and also didn't handle the wrapped range case.
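For reference, here is a rough hand-written sketch of the IR shape this expansion produces for a 64-bit flat cmpxchg that may access private memory. The block names follow the pass's atomicrmw.* convention, but the control flow, orderings, and operand handling are simplified and should not be read as the precise output of the pass.

declare i1 @llvm.amdgcn.is.private(ptr)

define { i64, i1 } @expanded_flat_cmpxchg(ptr %addr, i64 %cmp, i64 %new) {
entry:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:                     ; non-atomic scratch path
  %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private, align 8
  %eq = icmp eq i64 %loaded.private, %cmp
  %sel = select i1 %eq, i64 %new, i64 %loaded.private
  store i64 %sel, ptr addrspace(5) %cast.private, align 8
  %priv.0 = insertvalue { i64, i1 } poison, i64 %loaded.private, 0
  %priv.1 = insertvalue { i64, i1 } %priv.0, i1 %eq, 1
  br label %atomicrmw.phi

atomicrmw.global:                      ; flat cmpxchg, known not to be private
  %global.res = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst, align 8
  br label %atomicrmw.phi

atomicrmw.phi:
  %res = phi { i64, i1 } [ %priv.1, %atomicrmw.private ], [ %global.res, %atomicrmw.global ]
  ret { i64, i1 } %res
}

The private path is the non-atomic load/compare/select/store built by buildAtomicCmpXchgValue; the global path keeps a flat cmpxchg, which is fine once the address is known not to be private.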