Skip to content

Commit 2ca30eb

Browse files
authored
AMDGPU/GlobalISel: Handle mubuf load/store for more types (#68268)
Fixes the MUBUF path for most vector and pointer types, which unblocks fixing the gfx6/7 RUN lines in assorted tests. Also fixes inconsistent behavior with -flat-for-global.
1 parent ea71d2d commit 2ca30eb

15 files changed

+3549
-1486
lines changed

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 34 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -574,11 +574,11 @@ class MUBUF_Store_Pseudo <string opName,
574574

575575
multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> {
576576

577-
def _OFFSET : GCNPat <
577+
def : GCNPat <
578578
(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)),
579579
(!cast<MUBUF_Pseudo>(BaseInst # _OFFSET) store_vt:$vdata, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
580580

581-
def _ADDR64 : GCNPat <
581+
def : GCNPat <
582582
(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)),
583583
(!cast<MUBUF_Pseudo>(BaseInst # _ADDR64) store_vt:$vdata, i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
584584
}
@@ -912,10 +912,22 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
912912
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>;
913913
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>;
914914
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>;
915-
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>;
916-
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>;
917-
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>;
918-
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
915+
916+
foreach vt = Reg32Types.types in {
917+
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>;
918+
}
919+
920+
foreach vt = VReg_64.RegTypes in {
921+
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", vt, load_global>;
922+
}
923+
924+
foreach vt = VReg_96.RegTypes in {
925+
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", vt, load_global>;
926+
}
927+
928+
foreach vt = VReg_128.RegTypes in {
929+
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>;
930+
}
919931

920932
defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
921933
"buffer_store_byte", i32
@@ -938,10 +950,22 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
938950

939951
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_BYTE", i32, truncstorei8_global>;
940952
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_SHORT", i32, truncstorei16_global>;
941-
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", i32, store_global>;
942-
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", v2i32, store_global>;
943-
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", v3i32, store_global>;
944-
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", v4i32, store_global>;
953+
954+
foreach vt = Reg32Types.types in {
955+
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", vt, store_global>;
956+
}
957+
958+
foreach vt = VReg_64.RegTypes in {
959+
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", vt, store_global>;
960+
}
961+
962+
foreach vt = VReg_96.RegTypes in {
963+
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", vt, store_global>;
964+
}
965+
966+
foreach vt = VReg_128.RegTypes in {
967+
defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>;
968+
}
945969

946970
defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
947971
"buffer_atomic_swap", VGPR_32, i32

llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll

Lines changed: 24 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,10 @@ define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr,
7777
;
7878
; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
7979
; GFX7: ; %bb.0:
80-
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
80+
; GFX7-NEXT: s_mov_b32 s6, 0
81+
; GFX7-NEXT: s_mov_b32 s7, 0xf000
82+
; GFX7-NEXT: s_mov_b64 s[4:5], 0
83+
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
8184
; GFX7-NEXT: s_lshr_b32 s0, s2, 1
8285
; GFX7-NEXT: s_and_b32 s1, s2, 1
8386
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
@@ -150,7 +153,10 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
150153
; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
151154
; GFX7: ; %bb.0:
152155
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153-
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
156+
; GFX7-NEXT: s_mov_b32 s6, 0
157+
; GFX7-NEXT: s_mov_b32 s7, 0xf000
158+
; GFX7-NEXT: s_mov_b64 s[4:5], 0
159+
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
154160
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2
155161
; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
156162
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
@@ -360,7 +366,10 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
360366
; GFX7-LABEL: extractelement_vgpr_v4i16_idx0:
361367
; GFX7: ; %bb.0:
362368
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363-
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
369+
; GFX7-NEXT: s_mov_b32 s6, 0
370+
; GFX7-NEXT: s_mov_b32 s7, 0xf000
371+
; GFX7-NEXT: s_mov_b64 s[4:5], 0
372+
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
364373
; GFX7-NEXT: s_waitcnt vmcnt(0)
365374
; GFX7-NEXT: s_setpc_b64 s[30:31]
366375
;
@@ -402,7 +411,10 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
402411
; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
403412
; GFX7: ; %bb.0:
404413
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405-
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
414+
; GFX7-NEXT: s_mov_b32 s6, 0
415+
; GFX7-NEXT: s_mov_b32 s7, 0xf000
416+
; GFX7-NEXT: s_mov_b64 s[4:5], 0
417+
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
406418
; GFX7-NEXT: s_waitcnt vmcnt(0)
407419
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
408420
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -447,7 +459,10 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
447459
; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
448460
; GFX7: ; %bb.0:
449461
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450-
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
462+
; GFX7-NEXT: s_mov_b32 s6, 0
463+
; GFX7-NEXT: s_mov_b32 s7, 0xf000
464+
; GFX7-NEXT: s_mov_b64 s[4:5], 0
465+
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
451466
; GFX7-NEXT: s_waitcnt vmcnt(0)
452467
; GFX7-NEXT: v_mov_b32_e32 v0, v1
453468
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -492,7 +507,10 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
492507
; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
493508
; GFX7: ; %bb.0:
494509
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
495-
; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
510+
; GFX7-NEXT: s_mov_b32 s6, 0
511+
; GFX7-NEXT: s_mov_b32 s7, 0xf000
512+
; GFX7-NEXT: s_mov_b64 s[4:5], 0
513+
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
496514
; GFX7-NEXT: s_waitcnt vmcnt(0)
497515
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
498516
; GFX7-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -318,6 +318,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
318318
; CI-NEXT: s_waitcnt lgkmcnt(0)
319319
; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
320320
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
321+
; CI-NEXT: s_mov_b32 s6, -1
322+
; CI-NEXT: s_mov_b32 s7, 0xf000
321323
; CI-NEXT: s_waitcnt lgkmcnt(0)
322324
; CI-NEXT: v_mov_b32_e32 v0, s0
323325
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -334,9 +336,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
334336
; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
335337
; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
336338
; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
337-
; CI-NEXT: v_mov_b32_e32 v2, s4
338-
; CI-NEXT: v_mov_b32_e32 v3, s5
339-
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
339+
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
340340
; CI-NEXT: s_endpgm
341341
;
342342
; VI-LABEL: frem_f64:
@@ -381,6 +381,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
381381
; CI-NEXT: s_waitcnt lgkmcnt(0)
382382
; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
383383
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
384+
; CI-NEXT: s_mov_b32 s6, -1
385+
; CI-NEXT: s_mov_b32 s7, 0xf000
384386
; CI-NEXT: s_waitcnt lgkmcnt(0)
385387
; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
386388
; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -394,9 +396,7 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
394396
; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
395397
; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
396398
; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
397-
; CI-NEXT: v_mov_b32_e32 v2, s4
398-
; CI-NEXT: v_mov_b32_e32 v3, s5
399-
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
399+
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
400400
; CI-NEXT: s_endpgm
401401
;
402402
; VI-LABEL: fast_frem_f64:
@@ -438,6 +438,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
438438
; CI-NEXT: s_waitcnt lgkmcnt(0)
439439
; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
440440
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
441+
; CI-NEXT: s_mov_b32 s6, -1
442+
; CI-NEXT: s_mov_b32 s7, 0xf000
441443
; CI-NEXT: s_waitcnt lgkmcnt(0)
442444
; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
443445
; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -451,9 +453,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
451453
; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
452454
; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
453455
; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
454-
; CI-NEXT: v_mov_b32_e32 v2, s4
455-
; CI-NEXT: v_mov_b32_e32 v3, s5
456-
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
456+
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
457457
; CI-NEXT: s_endpgm
458458
;
459459
; VI-LABEL: unsafe_frem_f64:
@@ -532,15 +532,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
532532
; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
533533
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
534534
; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
535+
; CI-NEXT: s_mov_b32 s6, -1
536+
; CI-NEXT: s_mov_b32 s7, 0xf000
535537
; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
536538
; CI-NEXT: v_trunc_f32_e32 v3, v3
537539
; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
538540
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
539541
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
540-
; CI-NEXT: v_or_b32_e32 v2, v0, v1
541-
; CI-NEXT: v_mov_b32_e32 v0, s4
542-
; CI-NEXT: v_mov_b32_e32 v1, s5
543-
; CI-NEXT: flat_store_dword v[0:1], v2
542+
; CI-NEXT: v_or_b32_e32 v0, v0, v1
543+
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
544544
; CI-NEXT: s_endpgm
545545
;
546546
; VI-LABEL: frem_v2f16:
@@ -669,15 +669,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
669669
; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
670670
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
671671
; CI-NEXT: v_or_b32_e32 v0, v0, v1
672+
; CI-NEXT: s_mov_b32 s6, -1
673+
; CI-NEXT: s_mov_b32 s7, 0xf000
672674
; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3
673675
; CI-NEXT: v_trunc_f32_e32 v5, v5
674676
; CI-NEXT: v_fma_f32 v3, -v5, v4, v3
675677
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
676678
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
677679
; CI-NEXT: v_or_b32_e32 v1, v2, v1
678-
; CI-NEXT: v_mov_b32_e32 v2, s4
679-
; CI-NEXT: v_mov_b32_e32 v3, s5
680-
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
680+
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
681681
; CI-NEXT: s_endpgm
682682
;
683683
; VI-LABEL: frem_v4f16:
@@ -1017,6 +1017,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
10171017
; CI-NEXT: v_mov_b32_e32 v1, s9
10181018
; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
10191019
; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
1020+
; CI-NEXT: s_mov_b32 s6, -1
1021+
; CI-NEXT: s_mov_b32 s7, 0xf000
10201022
; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
10211023
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
10221024
; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -1043,9 +1045,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
10431045
; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
10441046
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
10451047
; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
1046-
; CI-NEXT: v_mov_b32_e32 v4, s4
1047-
; CI-NEXT: v_mov_b32_e32 v5, s5
1048-
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1048+
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10491049
; CI-NEXT: s_endpgm
10501050
;
10511051
; VI-LABEL: frem_v2f64:

0 commit comments

Comments
 (0)