@@ -318,6 +318,8 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
318
318
; CI-NEXT: s_waitcnt lgkmcnt(0)
319
319
; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
320
320
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
321
+ ; CI-NEXT: s_mov_b32 s6, -1
322
+ ; CI-NEXT: s_mov_b32 s7, 0xf000
321
323
; CI-NEXT: s_waitcnt lgkmcnt(0)
322
324
; CI-NEXT: v_mov_b32_e32 v0, s0
323
325
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -334,9 +336,7 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
334
336
; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
335
337
; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
336
338
; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
337
- ; CI-NEXT: v_mov_b32_e32 v2, s4
338
- ; CI-NEXT: v_mov_b32_e32 v3, s5
339
- ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
339
+ ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
340
340
; CI-NEXT: s_endpgm
341
341
;
342
342
; VI-LABEL: frem_f64:
@@ -381,6 +381,8 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
381
381
; CI-NEXT: s_waitcnt lgkmcnt(0)
382
382
; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
383
383
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
384
+ ; CI-NEXT: s_mov_b32 s6, -1
385
+ ; CI-NEXT: s_mov_b32 s7, 0xf000
384
386
; CI-NEXT: s_waitcnt lgkmcnt(0)
385
387
; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
386
388
; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -394,9 +396,7 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
394
396
; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
395
397
; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
396
398
; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
397
- ; CI-NEXT: v_mov_b32_e32 v2, s4
398
- ; CI-NEXT: v_mov_b32_e32 v3, s5
399
- ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
399
+ ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
400
400
; CI-NEXT: s_endpgm
401
401
;
402
402
; VI-LABEL: fast_frem_f64:
@@ -438,6 +438,8 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
438
438
; CI-NEXT: s_waitcnt lgkmcnt(0)
439
439
; CI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
440
440
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
441
+ ; CI-NEXT: s_mov_b32 s6, -1
442
+ ; CI-NEXT: s_mov_b32 s7, 0xf000
441
443
; CI-NEXT: s_waitcnt lgkmcnt(0)
442
444
; CI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
443
445
; CI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
@@ -451,9 +453,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
451
453
; CI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
452
454
; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
453
455
; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
454
- ; CI-NEXT: v_mov_b32_e32 v2, s4
455
- ; CI-NEXT: v_mov_b32_e32 v3, s5
456
- ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
456
+ ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
457
457
; CI-NEXT: s_endpgm
458
458
;
459
459
; VI-LABEL: unsafe_frem_f64:
@@ -532,15 +532,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
532
532
; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
533
533
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
534
534
; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
535
+ ; CI-NEXT: s_mov_b32 s6, -1
536
+ ; CI-NEXT: s_mov_b32 s7, 0xf000
535
537
; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
536
538
; CI-NEXT: v_trunc_f32_e32 v3, v3
537
539
; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
538
540
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
539
541
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
540
- ; CI-NEXT: v_or_b32_e32 v2, v0, v1
541
- ; CI-NEXT: v_mov_b32_e32 v0, s4
542
- ; CI-NEXT: v_mov_b32_e32 v1, s5
543
- ; CI-NEXT: flat_store_dword v[0:1], v2
542
+ ; CI-NEXT: v_or_b32_e32 v0, v0, v1
543
+ ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
544
544
; CI-NEXT: s_endpgm
545
545
;
546
546
; VI-LABEL: frem_v2f16:
@@ -669,15 +669,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
669
669
; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
670
670
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
671
671
; CI-NEXT: v_or_b32_e32 v0, v0, v1
672
+ ; CI-NEXT: s_mov_b32 s6, -1
673
+ ; CI-NEXT: s_mov_b32 s7, 0xf000
672
674
; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3
673
675
; CI-NEXT: v_trunc_f32_e32 v5, v5
674
676
; CI-NEXT: v_fma_f32 v3, -v5, v4, v3
675
677
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
676
678
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
677
679
; CI-NEXT: v_or_b32_e32 v1, v2, v1
678
- ; CI-NEXT: v_mov_b32_e32 v2, s4
679
- ; CI-NEXT: v_mov_b32_e32 v3, s5
680
- ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
680
+ ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
681
681
; CI-NEXT: s_endpgm
682
682
;
683
683
; VI-LABEL: frem_v4f16:
@@ -1017,6 +1017,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
1017
1017
; CI-NEXT: v_mov_b32_e32 v1, s9
1018
1018
; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1]
1019
1019
; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
1020
+ ; CI-NEXT: s_mov_b32 s6, -1
1021
+ ; CI-NEXT: s_mov_b32 s7, 0xf000
1020
1022
; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
1021
1023
; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
1022
1024
; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -1043,9 +1045,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
1043
1045
; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[2:3]
1044
1046
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
1045
1047
; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[2:3]
1046
- ; CI-NEXT: v_mov_b32_e32 v4, s4
1047
- ; CI-NEXT: v_mov_b32_e32 v5, s5
1048
- ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1048
+ ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1049
1049
; CI-NEXT: s_endpgm
1050
1050
;
1051
1051
; VI-LABEL: frem_v2f64:
0 commit comments