[AMDGPU] Fix some cache policy checks for GFX12+ #116396
Conversation
Fix coding errors found by inspection and check that the swz bit still serves to prevent merging of buffer loads/stores on GFX12+.
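For illustration, here is a minimal standalone sketch of the selection logic this patch applies in both the GlobalISel and SelectionDAG paths: mask the intrinsic's raw aux operand with the cache-policy bits that are valid for the current generation, and test the generation-specific swizzle bit. The helper name and the constant values below are placeholders for illustration, not the real AMDGPU::CPol encodings.

```cpp
// Sketch only: pick the generation-appropriate cache-policy mask and swizzle
// bit before splitting the raw "aux" immediate into the cpol and swz operands.
#include <cstdint>
#include <utility>

namespace sketch {
constexpr uint32_t ALL_pregfx12 = 0x17; // glc | slc | dlc | scc (placeholder)
constexpr uint32_t SWZ_pregfx12 = 1u << 3;
constexpr uint32_t ALL = 0x3f;          // th | scope (placeholder)
constexpr uint32_t SWZ = 1u << 6;

// Returns {cpol, swz} as they would be added to the selected instruction.
std::pair<uint32_t, uint32_t> decodeAux(uint32_t Aux, bool IsGFX12Plus) {
  uint32_t CPolMask = IsGFX12Plus ? ALL : ALL_pregfx12;
  uint32_t SwzBit = IsGFX12Plus ? SWZ : SWZ_pregfx12;
  return {Aux & CPolMask, (Aux & SwzBit) ? 1u : 0u};
}
} // namespace sketch
```

With placeholder values chosen this way, an aux value of 64 (as used in the new tests below) would set only the GFX12+ swizzle bit, which is the case the old code dropped by always testing SWZ_pregfx12.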
@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes: Fix coding errors found by inspection and check that the swz bit still serves to prevent merging of buffer loads/stores on GFX12+.

Full diff: https://github.com/llvm/llvm-project/pull/116396.diff

5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 13de93e829fab2..412692bd532298 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3267,9 +3267,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
MIB.add(MI.getOperand(1)); // rsrc
MIB.add(MI.getOperand(5 + OpOffset)); // soffset
MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
+ bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
- MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
- MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
+ MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
+ : AMDGPU::CPol::ALL_pregfx12)); // cpol
+ MIB.addImm(
+ Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
+ ? 1
+ : 0); // swz
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 428a19c391374f..344028c4b48689 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -160,7 +160,7 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
O << " dlc";
if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
- if (Imm & ~CPol::ALL)
+ if (Imm & ~CPol::ALL_pregfx12)
O << " /* unexpected cache policy bit */";
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b186dafb4c0ded..748ee872c8e1fc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9840,11 +9840,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(Rsrc);
Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
+ bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
- Ops.push_back(
- DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
Ops.push_back(DAG.getTargetConstant(
- Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
+ Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
+ DL, MVT::i8)); // cpol
+ Ops.push_back(DAG.getTargetConstant(
+ Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
+ ? 1
+ : 0,
+ DL, MVT::i8)); // swz
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index 0c6bba2426947e..b42ba7d75094a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -1312,8 +1312,8 @@ main_body:
ret void
}
-define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
-; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc) {
+; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
; PREGFX10: ; %bb.0: ; %main_body
; PREGFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
@@ -1327,7 +1327,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
; PREGFX10-NEXT: s_endpgm
;
-; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_clause 0x5
; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
@@ -1342,7 +1342,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 offset:4
@@ -1357,7 +1357,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
; GFX11-NEXT: s_endpgm
;
-; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE
@@ -1379,6 +1379,65 @@ main_body:
ret void
}
+define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
+; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; PREGFX10: ; %bb.0: ; %main_body
+; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; PREGFX10-NEXT: s_waitcnt vmcnt(1)
+; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; PREGFX10-NEXT: s_waitcnt vmcnt(0)
+; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
+; PREGFX10-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; GFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX11: ; %bb.0: ; %main_body
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4
+; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:28
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null offset:4
+; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:8
+; GFX12-NEXT: buffer_load_b32 v2, off, s[0:3], null offset:12
+; GFX12-NEXT: buffer_load_b32 v3, off, s[0:3], null offset:16
+; GFX12-NEXT: buffer_load_b32 v4, off, s[0:3], null offset:28
+; GFX12-NEXT: buffer_load_b32 v5, off, s[0:3], null offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x2
+; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: export mrt0 v4, v5, v0, v0 done
+; GFX12-NEXT: s_endpgm
+main_body:
+ %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 64)
+ %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 64)
+ %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 64)
+ %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 64)
+ %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 64)
+ %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 64)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+ ret void
+}
+
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index fd6e354b274a42..3493de1497d11f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -2,6 +2,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
@@ -497,8 +498,8 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc,
ret void
}
-define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
-; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
; GFX68: ; %bb.0:
; GFX68-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX68-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8
@@ -508,7 +509,7 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
; GFX68-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32
; GFX68-NEXT: s_endpgm
;
-; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 offset:4
@@ -527,6 +528,26 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
ret void
}
+define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX12-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_clause 0x5
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null offset:4
+; GFX12-NEXT: buffer_store_b32 v1, off, s[0:3], null offset:8
+; GFX12-NEXT: buffer_store_b32 v2, off, s[0:3], null offset:12
+; GFX12-NEXT: buffer_store_b32 v3, off, s[0:3], null offset:16
+; GFX12-NEXT: buffer_store_b32 v4, off, s[0:3], null offset:28
+; GFX12-NEXT: buffer_store_b32 v5, off, s[0:3], null offset:32
+; GFX12-NEXT: s_endpgm
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 64)
+ call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 64)
+ ret void
+}
+
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
Review thread on llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp:

unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
I assume the DAG path already does this?
It did not; that's what the change in SIISelLowering.cpp in this patch is for.
pretty ugly
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/8125