Skip to content

Commit 5c68a1c

Browse files
arsenm authored and tstellar committed
AMDGPU: Make various vector undefs legal
Surprisingly, these were getting legalized to something zero-initialized. This fixes an infinite loop when combining some vector types. It also fixes zero-initializing some undef values.

SimplifyDemandedVectorElts / SimplifyDemandedBits are not checking for the legality of the output undefs they are replacing unused operations with. This resulted in turning vectors into undefs that were later re-legalized back into zero vectors.

(cherry picked from commit 7a84624)
1 parent 80a9fc8 commit 5c68a1c

11 files changed

+306
-458
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -249,6 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
249249
case ISD::STORE:
250250
case ISD::BUILD_VECTOR:
251251
case ISD::BITCAST:
252+
case ISD::UNDEF:
252253
case ISD::EXTRACT_VECTOR_ELT:
253254
case ISD::INSERT_VECTOR_ELT:
254255
case ISD::EXTRACT_SUBVECTOR:
@@ -516,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
516517
case ISD::STORE:
517518
case ISD::BUILD_VECTOR:
518519
case ISD::BITCAST:
520+
case ISD::UNDEF:
519521
case ISD::EXTRACT_VECTOR_ELT:
520522
case ISD::INSERT_VECTOR_ELT:
521523
case ISD::INSERT_SUBVECTOR:

llvm/test/CodeGen/AMDGPU/commute-shifts.ll

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -5,14 +5,6 @@
55
define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
66
; SI-LABEL: main:
77
; SI: ; %bb.0: ; %bb
8-
; SI-NEXT: s_mov_b32 s0, 0
9-
; SI-NEXT: s_mov_b32 s1, s0
10-
; SI-NEXT: s_mov_b32 s2, s0
11-
; SI-NEXT: s_mov_b32 s3, s0
12-
; SI-NEXT: s_mov_b32 s4, s0
13-
; SI-NEXT: s_mov_b32 s5, s0
14-
; SI-NEXT: s_mov_b32 s6, s0
15-
; SI-NEXT: s_mov_b32 s7, s0
168
; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
179
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
1810
; SI-NEXT: v_and_b32_e32 v0, 7, v0
@@ -26,14 +18,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
2618
;
2719
; VI-LABEL: main:
2820
; VI: ; %bb.0: ; %bb
29-
; VI-NEXT: s_mov_b32 s0, 0
30-
; VI-NEXT: s_mov_b32 s1, s0
31-
; VI-NEXT: s_mov_b32 s2, s0
32-
; VI-NEXT: s_mov_b32 s3, s0
33-
; VI-NEXT: s_mov_b32 s4, s0
34-
; VI-NEXT: s_mov_b32 s5, s0
35-
; VI-NEXT: s_mov_b32 s6, s0
36-
; VI-NEXT: s_mov_b32 s7, s0
3721
; VI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
3822
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
3923
; VI-NEXT: v_and_b32_e32 v0, 7, v0

llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -213,7 +213,7 @@ if.else: ; preds = %entry
213213
br label %if.end
214214

215215
if.end: ; preds = %if.else, %if.then
216-
%call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ]
216+
%call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
217217
store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef
218218
ret void
219219
}
@@ -266,7 +266,7 @@ if.else: ; preds = %entry
266266
br label %if.end
267267

268268
if.end: ; preds = %if.else, %if.then
269-
%call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ]
269+
%call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
270270
store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef
271271
ret void
272272
}

llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll

Lines changed: 1 addition & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -4,16 +4,8 @@
44
define amdgpu_ps float @_amdgpu_ps_main() #0 {
55
; GCN-LABEL: _amdgpu_ps_main:
66
; GCN: ; %bb.0: ; %.entry
7-
; GCN-NEXT: s_mov_b32 s0, 0
8-
; GCN-NEXT: v_mov_b32_e32 v4, 0
9-
; GCN-NEXT: s_mov_b32 s1, s0
10-
; GCN-NEXT: s_mov_b32 s2, s0
11-
; GCN-NEXT: s_mov_b32 s3, s0
12-
; GCN-NEXT: s_mov_b32 s4, s0
13-
; GCN-NEXT: s_mov_b32 s5, s0
14-
; GCN-NEXT: s_mov_b32 s6, s0
15-
; GCN-NEXT: s_mov_b32 s7, s0
167
; GCN-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
8+
; GCN-NEXT: v_mov_b32_e32 v4, 0
179
; GCN-NEXT: s_waitcnt vmcnt(0)
1810
; GCN-NEXT: s_clause 0x1
1911
; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D

llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll

Lines changed: 6 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -100,14 +100,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
100100
; GFX9-NEXT: s_cbranch_execz .LBB0_3
101101
; GFX9-NEXT: s_branch .LBB0_4
102102
; GFX9-NEXT: .LBB0_2:
103-
; GFX9-NEXT: s_mov_b32 s8, 0
104-
; GFX9-NEXT: s_mov_b32 s9, s8
105-
; GFX9-NEXT: s_mov_b32 s10, s8
106-
; GFX9-NEXT: s_mov_b32 s11, s8
107-
; GFX9-NEXT: v_mov_b32_e32 v2, s8
108-
; GFX9-NEXT: v_mov_b32_e32 v3, s9
109-
; GFX9-NEXT: v_mov_b32_e32 v4, s10
110-
; GFX9-NEXT: v_mov_b32_e32 v5, s11
103+
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
111104
; GFX9-NEXT: .LBB0_3: ; %T
112105
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
113106
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -244,14 +237,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i
244237
; GFX9-NEXT: s_cbranch_execz .LBB1_3
245238
; GFX9-NEXT: s_branch .LBB1_4
246239
; GFX9-NEXT: .LBB1_2:
247-
; GFX9-NEXT: s_mov_b32 s8, 0
248-
; GFX9-NEXT: s_mov_b32 s9, s8
249-
; GFX9-NEXT: s_mov_b32 s10, s8
250-
; GFX9-NEXT: s_mov_b32 s11, s8
251-
; GFX9-NEXT: v_mov_b32_e32 v2, s8
252-
; GFX9-NEXT: v_mov_b32_e32 v3, s9
253-
; GFX9-NEXT: v_mov_b32_e32 v4, s10
254-
; GFX9-NEXT: v_mov_b32_e32 v5, s11
240+
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
255241
; GFX9-NEXT: .LBB1_3: ; %T
256242
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
257243
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -386,14 +372,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h
386372
; GFX9-NEXT: s_cbranch_execz .LBB2_3
387373
; GFX9-NEXT: s_branch .LBB2_4
388374
; GFX9-NEXT: .LBB2_2:
389-
; GFX9-NEXT: s_mov_b32 s8, 0
390-
; GFX9-NEXT: s_mov_b32 s9, s8
391-
; GFX9-NEXT: s_mov_b32 s10, s8
392-
; GFX9-NEXT: s_mov_b32 s11, s8
393-
; GFX9-NEXT: v_mov_b32_e32 v2, s8
394-
; GFX9-NEXT: v_mov_b32_e32 v3, s9
395-
; GFX9-NEXT: v_mov_b32_e32 v4, s10
396-
; GFX9-NEXT: v_mov_b32_e32 v5, s11
375+
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
397376
; GFX9-NEXT: .LBB2_3: ; %T
398377
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
399378
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -567,22 +546,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
567546
; GFX9-NEXT: s_cbranch_execz .LBB3_3
568547
; GFX9-NEXT: s_branch .LBB3_4
569548
; GFX9-NEXT: .LBB3_2:
570-
; GFX9-NEXT: s_mov_b32 s8, 0
571-
; GFX9-NEXT: s_mov_b32 s9, s8
572-
; GFX9-NEXT: s_mov_b32 s10, s8
573-
; GFX9-NEXT: s_mov_b32 s11, s8
574-
; GFX9-NEXT: s_mov_b32 s12, s8
575-
; GFX9-NEXT: s_mov_b32 s13, s8
576-
; GFX9-NEXT: s_mov_b32 s14, s8
577-
; GFX9-NEXT: s_mov_b32 s15, s8
578-
; GFX9-NEXT: v_mov_b32_e32 v4, s8
579-
; GFX9-NEXT: v_mov_b32_e32 v5, s9
580-
; GFX9-NEXT: v_mov_b32_e32 v6, s10
581-
; GFX9-NEXT: v_mov_b32_e32 v7, s11
582-
; GFX9-NEXT: v_mov_b32_e32 v8, s12
583-
; GFX9-NEXT: v_mov_b32_e32 v9, s13
584-
; GFX9-NEXT: v_mov_b32_e32 v10, s14
585-
; GFX9-NEXT: v_mov_b32_e32 v11, s15
549+
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
586550
; GFX9-NEXT: .LBB3_3: ; %T
587551
; GFX9-NEXT: s_waitcnt vmcnt(0)
588552
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -759,22 +723,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
759723
; GFX9-NEXT: s_cbranch_execz .LBB4_3
760724
; GFX9-NEXT: s_branch .LBB4_4
761725
; GFX9-NEXT: .LBB4_2:
762-
; GFX9-NEXT: s_mov_b32 s8, 0
763-
; GFX9-NEXT: s_mov_b32 s9, s8
764-
; GFX9-NEXT: s_mov_b32 s10, s8
765-
; GFX9-NEXT: s_mov_b32 s11, s8
766-
; GFX9-NEXT: s_mov_b32 s12, s8
767-
; GFX9-NEXT: s_mov_b32 s13, s8
768-
; GFX9-NEXT: s_mov_b32 s14, s8
769-
; GFX9-NEXT: s_mov_b32 s15, s8
770-
; GFX9-NEXT: v_mov_b32_e32 v4, s8
771-
; GFX9-NEXT: v_mov_b32_e32 v5, s9
772-
; GFX9-NEXT: v_mov_b32_e32 v6, s10
773-
; GFX9-NEXT: v_mov_b32_e32 v7, s11
774-
; GFX9-NEXT: v_mov_b32_e32 v8, s12
775-
; GFX9-NEXT: v_mov_b32_e32 v9, s13
776-
; GFX9-NEXT: v_mov_b32_e32 v10, s14
777-
; GFX9-NEXT: v_mov_b32_e32 v11, s15
726+
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
778727
; GFX9-NEXT: .LBB4_3: ; %T
779728
; GFX9-NEXT: s_waitcnt vmcnt(0)
780729
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -949,22 +898,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
949898
; GFX9-NEXT: s_cbranch_execz .LBB5_3
950899
; GFX9-NEXT: s_branch .LBB5_4
951900
; GFX9-NEXT: .LBB5_2:
952-
; GFX9-NEXT: s_mov_b32 s8, 0
953-
; GFX9-NEXT: s_mov_b32 s9, s8
954-
; GFX9-NEXT: s_mov_b32 s10, s8
955-
; GFX9-NEXT: s_mov_b32 s11, s8
956-
; GFX9-NEXT: s_mov_b32 s12, s8
957-
; GFX9-NEXT: s_mov_b32 s13, s8
958-
; GFX9-NEXT: s_mov_b32 s14, s8
959-
; GFX9-NEXT: s_mov_b32 s15, s8
960-
; GFX9-NEXT: v_mov_b32_e32 v4, s8
961-
; GFX9-NEXT: v_mov_b32_e32 v5, s9
962-
; GFX9-NEXT: v_mov_b32_e32 v6, s10
963-
; GFX9-NEXT: v_mov_b32_e32 v7, s11
964-
; GFX9-NEXT: v_mov_b32_e32 v8, s12
965-
; GFX9-NEXT: v_mov_b32_e32 v9, s13
966-
; GFX9-NEXT: v_mov_b32_e32 v10, s14
967-
; GFX9-NEXT: v_mov_b32_e32 v11, s15
901+
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
968902
; GFX9-NEXT: .LBB5_3: ; %T
969903
; GFX9-NEXT: s_waitcnt vmcnt(0)
970904
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc

llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll

Lines changed: 3 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -374,18 +374,10 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
374374
; GCN-LABEL: insertelement_to_sgpr:
375375
; GCN: ; %bb.0:
376376
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377-
; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
377+
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
378378
; GCN-NEXT: s_waitcnt lgkmcnt(0)
379-
; GCN-NEXT: s_mov_b32 s12, 0
380-
; GCN-NEXT: s_mov_b32 s4, s12
381-
; GCN-NEXT: s_mov_b32 s5, s12
382-
; GCN-NEXT: s_mov_b32 s6, s12
383-
; GCN-NEXT: s_mov_b32 s7, s12
384-
; GCN-NEXT: s_mov_b32 s8, s12
385-
; GCN-NEXT: s_mov_b32 s9, s12
386-
; GCN-NEXT: s_mov_b32 s10, s12
387-
; GCN-NEXT: s_mov_b32 s11, s12
388-
; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
379+
; GCN-NEXT: s_mov_b32 s4, 0
380+
; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
389381
; GCN-NEXT: s_waitcnt vmcnt(0)
390382
; GCN-NEXT: s_setpc_b64 s[30:31]
391383
%tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef

0 commit comments

Comments
 (0)