Skip to content

Commit ade0750

Browse files
authored
[AMDGPU] Fix some cache policy checks for GFX12+ (#116396)
Fix coding errors found by inspection, and add tests checking that the swz bit still prevents merging of buffer loads/stores on GFX12+.
1 parent 6f76b2a commit ade0750

File tree

5 files changed

+104
-14
lines changed

5 files changed

+104
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3285,9 +3285,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
32853285
MIB.add(MI.getOperand(1)); // rsrc
32863286
MIB.add(MI.getOperand(5 + OpOffset)); // soffset
32873287
MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3288+
bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
32883289
unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3289-
MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3290-
MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3290+
MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL
3291+
: AMDGPU::CPol::ALL_pregfx12)); // cpol
3292+
MIB.addImm(
3293+
Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
3294+
? 1
3295+
: 0); // swz
32913296

32923297
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
32933298
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
160160
O << " dlc";
161161
if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI))
162162
O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc");
163-
if (Imm & ~CPol::ALL)
163+
if (Imm & ~CPol::ALL_pregfx12)
164164
O << " /* unexpected cache policy bit */";
165165
}
166166

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9860,11 +9860,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
98609860
Ops.push_back(Rsrc);
98619861
Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
98629862
Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9863+
bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
98639864
unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9864-
Ops.push_back(
9865-
DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
98669865
Ops.push_back(DAG.getTargetConstant(
9867-
Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9866+
Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
9867+
DL, MVT::i8)); // cpol
9868+
Ops.push_back(DAG.getTargetConstant(
9869+
Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
9870+
? 1
9871+
: 0,
9872+
DL, MVT::i8)); // swz
98689873
Ops.push_back(M0Val.getValue(0)); // Chain
98699874
Ops.push_back(M0Val.getValue(1)); // Glue
98709875

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1312,8 +1312,8 @@ main_body:
13121312
ret void
13131313
}
13141314

1315-
define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
1316-
; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1315+
define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc) {
1316+
; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
13171317
; PREGFX10: ; %bb.0: ; %main_body
13181318
; PREGFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
13191319
; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8
@@ -1327,7 +1327,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
13271327
; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
13281328
; PREGFX10-NEXT: s_endpgm
13291329
;
1330-
; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1330+
; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
13311331
; GFX10: ; %bb.0: ; %main_body
13321332
; GFX10-NEXT: s_clause 0x5
13331333
; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
@@ -1342,7 +1342,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
13421342
; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
13431343
; GFX10-NEXT: s_endpgm
13441344
;
1345-
; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1345+
; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
13461346
; GFX11: ; %bb.0: ; %main_body
13471347
; GFX11-NEXT: s_clause 0x5
13481348
; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 offset:4
@@ -1357,7 +1357,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i
13571357
; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
13581358
; GFX11-NEXT: s_endpgm
13591359
;
1360-
; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1360+
; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12:
13611361
; GFX12: ; %bb.0: ; %main_body
13621362
; GFX12-NEXT: s_clause 0x1
13631363
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE
@@ -1379,6 +1379,65 @@ main_body:
13791379
ret void
13801380
}
13811381

1382+
define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
1383+
; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1384+
; PREGFX10: ; %bb.0: ; %main_body
1385+
; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
1386+
; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
1387+
; PREGFX10-NEXT: s_waitcnt vmcnt(1)
1388+
; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
1389+
; PREGFX10-NEXT: s_waitcnt vmcnt(0)
1390+
; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
1391+
; PREGFX10-NEXT: s_endpgm
1392+
;
1393+
; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1394+
; GFX10: ; %bb.0: ; %main_body
1395+
; GFX10-NEXT: s_clause 0x1
1396+
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
1397+
; GFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
1398+
; GFX10-NEXT: s_waitcnt vmcnt(1)
1399+
; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm
1400+
; GFX10-NEXT: s_waitcnt vmcnt(0)
1401+
; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm
1402+
; GFX10-NEXT: s_endpgm
1403+
;
1404+
; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1405+
; GFX11: ; %bb.0: ; %main_body
1406+
; GFX11-NEXT: s_clause 0x1
1407+
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4
1408+
; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:28
1409+
; GFX11-NEXT: s_waitcnt vmcnt(1)
1410+
; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done
1411+
; GFX11-NEXT: s_waitcnt vmcnt(0)
1412+
; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done
1413+
; GFX11-NEXT: s_endpgm
1414+
;
1415+
; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged:
1416+
; GFX12: ; %bb.0: ; %main_body
1417+
; GFX12-NEXT: s_clause 0x5
1418+
; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null offset:4
1419+
; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:8
1420+
; GFX12-NEXT: buffer_load_b32 v2, off, s[0:3], null offset:12
1421+
; GFX12-NEXT: buffer_load_b32 v3, off, s[0:3], null offset:16
1422+
; GFX12-NEXT: buffer_load_b32 v4, off, s[0:3], null offset:28
1423+
; GFX12-NEXT: buffer_load_b32 v5, off, s[0:3], null offset:32
1424+
; GFX12-NEXT: s_wait_loadcnt 0x2
1425+
; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done
1426+
; GFX12-NEXT: s_wait_loadcnt 0x0
1427+
; GFX12-NEXT: export mrt0 v4, v5, v0, v0 done
1428+
; GFX12-NEXT: s_endpgm
1429+
main_body:
1430+
%r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 64)
1431+
%r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 64)
1432+
%r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 64)
1433+
%r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 64)
1434+
%r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 64)
1435+
%r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 64)
1436+
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
1437+
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
1438+
ret void
1439+
}
1440+
13821441
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
13831442
declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
13841443
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
33
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
44
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
5+
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s
56

67
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
78
; GFX68-LABEL: buffer_store:
@@ -497,8 +498,8 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc,
497498
ret void
498499
}
499500

500-
define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
501-
; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
501+
define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
502+
; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
502503
; GFX68: ; %bb.0:
503504
; GFX68-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
504505
; GFX68-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8
@@ -508,7 +509,7 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
508509
; GFX68-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32
509510
; GFX68-NEXT: s_endpgm
510511
;
511-
; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
512+
; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
512513
; GFX11: ; %bb.0:
513514
; GFX11-NEXT: s_clause 0x5
514515
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 offset:4
@@ -527,6 +528,26 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32>
527528
ret void
528529
}
529530

531+
define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
532+
; GFX12-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
533+
; GFX12: ; %bb.0:
534+
; GFX12-NEXT: s_clause 0x5
535+
; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null offset:4
536+
; GFX12-NEXT: buffer_store_b32 v1, off, s[0:3], null offset:8
537+
; GFX12-NEXT: buffer_store_b32 v2, off, s[0:3], null offset:12
538+
; GFX12-NEXT: buffer_store_b32 v3, off, s[0:3], null offset:16
539+
; GFX12-NEXT: buffer_store_b32 v4, off, s[0:3], null offset:28
540+
; GFX12-NEXT: buffer_store_b32 v5, off, s[0:3], null offset:32
541+
; GFX12-NEXT: s_endpgm
542+
call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 64)
543+
call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 64)
544+
call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 64)
545+
call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 64)
546+
call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 64)
547+
call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 64)
548+
ret void
549+
}
550+
530551
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
531552
declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
532553
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0

0 commit comments

Comments
 (0)