Skip to content

Commit ca1154d

Browse files
authored
AMDGPU: Disable pattern matching "x<<32-y>>32-y" to "bfe x, 0, y" (#114279)
It is not correct to lower "x<<32-y>>32-y" to "bfe x, 0, y". When y equals 32, both shift amounts are 0, so the left-hand side is still x (unchanged); the right-hand side, however, evaluates to 0. The transformation is therefore not always correct. We could keep the pattern for an immediate y within [0, 31]. However, the immediate operands of the sub (32 - y) are easily folded, and "(x << imm) >> imm" will be lowered to "and x, (2^(32-imm))-1" anyway, so no bfe matching is needed.
1 parent 5545f76 commit ca1154d

File tree

3 files changed

+27
-22
lines changed

3 files changed

+27
-22
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3553,19 +3553,6 @@ def : AMDGPUPat <
35533553
(V_BFE_U32_e64 $src, (i32 0), $width)
35543554
>;
35553555

3556-
// x << (bitwidth - y) >> (bitwidth - y)
3557-
def : AMDGPUPat <
3558-
(DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)),
3559-
(sub 32, i32:$width)),
3560-
(V_BFE_U32_e64 $src, (i32 0), $width)
3561-
>;
3562-
3563-
def : AMDGPUPat <
3564-
(DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)),
3565-
(sub 32, i32:$width)),
3566-
(V_BFE_I32_e64 $src, (i32 0), $width)
3567-
>;
3568-
35693556
// SHA-256 Ma patterns
35703557

35713558
// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y

llvm/test/CodeGen/AMDGPU/bfe-patterns.ll

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
1717
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
1818
; SI-NEXT: s_waitcnt vmcnt(0)
1919
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
20-
; SI-NEXT: v_bfe_u32 v2, v2, 0, v3
20+
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
21+
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
22+
; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
2123
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
2224
; SI-NEXT: s_endpgm
2325
;
@@ -36,7 +38,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
3638
; VI-NEXT: v_mov_b32_e32 v1, s1
3739
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3840
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
39-
; VI-NEXT: v_bfe_u32 v2, v3, 0, v4
41+
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
42+
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
43+
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
4044
; VI-NEXT: flat_store_dword v[0:1], v2
4145
; VI-NEXT: s_endpgm
4246
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -215,7 +219,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
215219
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
216220
; SI-NEXT: s_waitcnt vmcnt(0)
217221
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
218-
; SI-NEXT: v_bfe_i32 v2, v2, 0, v3
222+
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
223+
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
224+
; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
219225
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
220226
; SI-NEXT: s_endpgm
221227
;
@@ -234,7 +240,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
234240
; VI-NEXT: v_mov_b32_e32 v1, s1
235241
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
236242
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
237-
; VI-NEXT: v_bfe_i32 v2, v3, 0, v4
243+
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
244+
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
245+
; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
238246
; VI-NEXT: flat_store_dword v[0:1], v2
239247
; VI-NEXT: s_endpgm
240248
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()

llvm/test/CodeGen/AMDGPU/extract-lowbits.ll

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -150,11 +150,21 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
150150
; ---------------------------------------------------------------------------- ;
151151

152152
define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
153-
; GCN-LABEL: bzhi32_d0:
154-
; GCN: ; %bb.0:
155-
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156-
; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
157-
; GCN-NEXT: s_setpc_b64 s[30:31]
153+
; SI-LABEL: bzhi32_d0:
154+
; SI: ; %bb.0:
155+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
156+
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
157+
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
158+
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
159+
; SI-NEXT: s_setpc_b64 s[30:31]
160+
;
161+
; VI-LABEL: bzhi32_d0:
162+
; VI: ; %bb.0:
163+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164+
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
165+
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
166+
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
167+
; VI-NEXT: s_setpc_b64 s[30:31]
158168
%numhighbits = sub i32 32, %numlowbits
159169
%highbitscleared = shl i32 %val, %numhighbits
160170
%masked = lshr i32 %highbitscleared, %numhighbits

0 commit comments

Comments
 (0)