AtomicExpand: Copy metadata from atomicrmw to cmpxchg #109409
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

When expanding an atomicrmw with a cmpxchg, preserve any metadata attached to it. This will avoid unwanted double expansions in a future commit. The initial load should also probably receive the same metadata.

Patch is 863.20 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/109409.diff

20 Files Affected:
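The signature change is the heart of the patch: CreateCmpXchgInstFun gains a trailing Instruction *MetadataSrc parameter. As a minimal sketch of a callback written against the new signature (the function name myCreateCmpXchg is illustrative; it mirrors the in-tree createCmpXchgInstFun in the diff below):

```cpp
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Sketch of a callback matching the new CreateCmpXchgInstFun signature.
// MetadataSrc carries the original atomicrmw so its metadata can be
// propagated onto the cmpxchg that replaces it.
static void myCreateCmpXchg(IRBuilderBase &Builder, Value *Addr, Value *Loaded,
                            Value *NewVal, Align AddrAlign,
                            AtomicOrdering MemOpOrder, SyncScope::ID SSID,
                            Value *&Success, Value *&NewLoaded,
                            Instruction *MetadataSrc) {
  AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
  if (MetadataSrc)
    Pair->copyMetadata(*MetadataSrc); // e.g. !amdgpu.no.fine.grained.memory
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}
```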
diff --git a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
index 1cb410a0c31c69..feb05de20b4571 100644
--- a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
+++ b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
@@ -20,10 +20,11 @@ class Value;
/// Parameters (see the expansion example below):
/// (the builder, %addr, %loaded, %new_val, ordering,
-/// /* OUT */ %success, /* OUT */ %new_loaded)
-using CreateCmpXchgInstFun =
- function_ref<void(IRBuilderBase &, Value *, Value *, Value *, Align,
- AtomicOrdering, SyncScope::ID, Value *&, Value *&)>;
+/// /* OUT */ %success, /* OUT */ %new_loaded,
+/// %MetadataSrc)
+using CreateCmpXchgInstFun = function_ref<void(
+ IRBuilderBase &, Value *, Value *, Value *, Align, AtomicOrdering,
+ SyncScope::ID, Value *&, Value *&, Instruction *)>;
/// Expand an atomic RMW instruction into a loop utilizing
/// cmpxchg. You'll want to make sure your target machine likes cmpxchg
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 3d4e2cb196a16a..5a3e529e5ebd02 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -98,7 +98,7 @@ class AtomicExpandImpl {
IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
- CreateCmpXchgInstFun CreateCmpXchg);
+ CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc);
bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
@@ -600,7 +600,8 @@ void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Value *Loaded, Value *NewVal, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
- Value *&Success, Value *&NewLoaded) {
+ Value *&Success, Value *&NewLoaded,
+ Instruction *MetadataSrc) {
Type *OrigTy = NewVal->getType();
// This code can go away when cmpxchg supports FP and vector types.
@@ -612,9 +613,12 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
Loaded = Builder.CreateBitCast(Loaded, IntTy);
}
- Value *Pair = Builder.CreateAtomicCmpXchg(
+ AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+ if (MetadataSrc)
+ Pair->copyMetadata(*MetadataSrc);
+
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
@@ -951,9 +955,9 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
Value *OldResult;
if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
- OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr,
- PMV.AlignedAddrAlignment, MemOpOrder, SSID,
- PerformPartwordOp, createCmpXchgInstFun);
+ OldResult = insertRMWCmpXchgLoop(
+ Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment,
+ MemOpOrder, SSID, PerformPartwordOp, createCmpXchgInstFun, AI);
} else {
assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
@@ -1591,7 +1595,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
AtomicOrdering MemOpOrder, SyncScope::ID SSID,
function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
- CreateCmpXchgInstFun CreateCmpXchg) {
+ CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) {
LLVMContext &Ctx = Builder.getContext();
BasicBlock *BB = Builder.GetInsertBlock();
Function *F = BB->getParent();
@@ -1637,7 +1641,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
MemOpOrder == AtomicOrdering::Unordered
? AtomicOrdering::Monotonic
: MemOpOrder,
- SSID, Success, NewLoaded);
+ SSID, Success, NewLoaded, MetadataSrc);
assert(Success && NewLoaded);
Loaded->addIncoming(NewLoaded, LoopBB);
@@ -1686,7 +1690,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
AI->getValOperand());
},
- CreateCmpXchg);
+ CreateCmpXchg, /*MetadataSrc=*/AI);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
@@ -1838,11 +1842,15 @@ void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
expandAtomicRMWToCmpXchg(
I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
- SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) {
+ SyncScope::ID SSID, Value *&Success, Value *&NewLoaded,
+ Instruction *MetadataSrc) {
// Create the CAS instruction normally...
AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
Addr, Loaded, NewVal, Alignment, MemOpOrder,
AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+ if (MetadataSrc)
+ Pair->copyMetadata(*MetadataSrc);
+
Success = Builder.CreateExtractValue(Pair, 1, "success");
NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index d3fb9d8ee522e7..443c5d18e68949 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -187,7 +187,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -204,7 +204,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -221,7 +221,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -238,7 +238,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -260,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -292,7 +292,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -309,7 +309,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -326,7 +326,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -343,7 +343,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -365,7 +365,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -382,7 +382,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX11-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -409,7 +409,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -426,7 +426,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX906-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX906-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX906-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -443,7 +443,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -460,7 +460,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX90A-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -482,7 +482,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX10-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX10-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX10-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -514,7 +514,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
; GFX803-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
; GFX803-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
; GFX803-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -531,7 +531,7 @@ define float @tes...
[truncated]
I'm not sure we can assume all metadata which applies to an atomicrmw also makes sense on the generated cmpxchg. Most metadata probably does, but say we start allowing !align metadata on atomicrmw...

In terms of the generated code, I think we're fine; we don't rely on the load producing a value that's consistent with the atomic ordering, I think. That said, it probably should be atomic, because strictly speaking a race on a non-atomic load is UB.
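On that second point, a hedged sketch of what making the loop's initial load atomic could look like (names are illustrative; this is not the in-tree code):

```cpp
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Sketch: emit the cmpxchg loop's initial load as a monotonic atomic load,
// so a racing store is not undefined behavior the way it would be for a
// plain non-atomic load.
static LoadInst *emitInitialLoad(IRBuilderBase &Builder, Type *ResultTy,
                                 Value *Addr, Align AddrAlign) {
  LoadInst *Init =
      Builder.CreateAlignedLoad(ResultTy, Addr, AddrAlign, "init.loaded");
  Init->setAtomic(AtomicOrdering::Monotonic);
  return Init;
}
```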
ping
Did you address #109409 (review)?
I switched this over to copyMetadataForAtomic, but I guess I never uploaded the change.
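copyMetadataForAtomic itself isn't shown in this thread. Purely as a hypothetical sketch, a filtering copy in that spirit would forward only metadata kinds known to stay valid on the replacement atomic instruction (the kind list below is an assumption, not the in-tree implementation):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Hypothetical filtering copy: forward only metadata kinds believed to stay
// meaningful on the new atomic instruction; unrecognized kinds are dropped.
static void copyMetadataForAtomicSketch(Instruction &Dest,
                                        const Instruction &Src) {
  SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
  Src.getAllMetadata(MDs);
  for (const auto &[ID, MD] : MDs) {
    switch (ID) {
    case LLVMContext::MD_dbg:
    case LLVMContext::MD_tbaa:
    case LLVMContext::MD_alias_scope:
    case LLVMContext::MD_noalias:
    case LLVMContext::MD_access_group:
      Dest.setMetadata(ID, MD);
      break;
    default:
      // Target-specific kinds such as !amdgpu.no.fine.grained.memory would
      // also need forwarding; anything unknown is conservatively dropped.
      break;
    }
  }
}
```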
ping
LGTM
When expanding an atomicrmw with a cmpxchg, preserve any metadata attached to it. This will avoid unwanted double expansions in a future commit. The initial load should also probably receive the same metadata (which for some reason is not emitted as an atomic).
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/9313