Skip to content

Commit ab2c499

Browse files
[SLP] Add insertelement instructions to vectorizable tree
Add new type of tree node for `InsertElementInst` chain forming vector. These instructions could be either removed, or replaced by shuffles during vectorization and we can add this node to cost model, so naturally estimating their cost, getting rid of `CompensateCost` tricks and reducing further work for InstCombine. This fixes PR40522 and PR35732 in a natural way. Also this patch is the first step towards revectorization of partially vectorization (to fix PR42022 completely). After adding inserts to tree the next step is to add vector instructions there (for instance, to merge `store <2 x float>` and `store <2 x float>` to `store <4 x float>`). Fixes PR40522 and PR35732. Differential Revision: https://reviews.llvm.org/D98714
1 parent cd90900 commit ab2c499

File tree

67 files changed

+1917
-5717
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+1917
-5717
lines changed

llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -94,15 +94,9 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
9494
bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
9595

9696
/// Try to vectorize a list of operands.
97-
/// When \p InsertUses is provided and its entries are non-zero
98-
/// then users of \p VL are known to be InsertElement instructions
99-
/// each associated with same VL entry index. Their cost is then
100-
/// used to adjust cost of the vectorization assuming instcombine pass
101-
/// then optimizes ExtractElement-InsertElement sequence.
10297
/// \returns true if a value was vectorized.
10398
bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
104-
bool AllowReorder = false,
105-
ArrayRef<Value *> InsertUses = None);
99+
bool AllowReorder = false);
106100

107101
/// Try to vectorize a chain that may start at the operands of \p I.
108102
bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 185 additions & 113 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

Lines changed: 69 additions & 295 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

Lines changed: 69 additions & 295 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll

Lines changed: 44 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -29,40 +29,24 @@ define void @PR28330(i32 %n) {
2929
; GATHER: for.body:
3030
; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
3131
; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
32-
; GATHER-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
33-
; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i1> poison, i1 [[TMP3]], i32 0
34-
; GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
35-
; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i1> [[TMP4]], i1 [[TMP5]], i32 1
36-
; GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
37-
; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i1> [[TMP6]], i1 [[TMP7]], i32 2
38-
; GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
39-
; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i1> [[TMP8]], i1 [[TMP9]], i32 3
40-
; GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
41-
; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i1> [[TMP10]], i1 [[TMP11]], i32 4
42-
; GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
43-
; GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i1> [[TMP12]], i1 [[TMP13]], i32 5
44-
; GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
45-
; GATHER-NEXT: [[TMP16:%.*]] = insertelement <8 x i1> [[TMP14]], i1 [[TMP15]], i32 6
46-
; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP16]], i1 [[TMP2]], i32 7
47-
; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
48-
; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
49-
; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
50-
; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
51-
; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
52-
; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
53-
; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
54-
; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
55-
; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0
56-
; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
57-
; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
58-
; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
59-
; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
60-
; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
61-
; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
62-
; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
63-
; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
64-
; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
65-
; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]]
32+
; GATHER-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
33+
; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
34+
; GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
35+
; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
36+
; GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
37+
; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
38+
; GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
39+
; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
40+
; GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
41+
; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
42+
; GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
43+
; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
44+
; GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
45+
; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
46+
; GATHER-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
47+
; GATHER-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
48+
; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP18]], [[P17]]
49+
; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
6650
; GATHER-NEXT: br label [[FOR_BODY]]
6751
;
6852
; MAX-COST-LABEL: @PR28330(
@@ -165,40 +149,24 @@ define void @PR32038(i32 %n) {
165149
; GATHER: for.body:
166150
; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
167151
; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
168-
; GATHER-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
169-
; GATHER-NEXT: [[TMP4:%.*]] = insertelement <8 x i1> poison, i1 [[TMP3]], i32 0
170-
; GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
171-
; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i1> [[TMP4]], i1 [[TMP5]], i32 1
172-
; GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
173-
; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i1> [[TMP6]], i1 [[TMP7]], i32 2
174-
; GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
175-
; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i1> [[TMP8]], i1 [[TMP9]], i32 3
176-
; GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
177-
; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i1> [[TMP10]], i1 [[TMP11]], i32 4
178-
; GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
179-
; GATHER-NEXT: [[TMP14:%.*]] = insertelement <8 x i1> [[TMP12]], i1 [[TMP13]], i32 5
180-
; GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
181-
; GATHER-NEXT: [[TMP16:%.*]] = insertelement <8 x i1> [[TMP14]], i1 [[TMP15]], i32 6
182-
; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP16]], i1 [[TMP2]], i32 7
183-
; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
184-
; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
185-
; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
186-
; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
187-
; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
188-
; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
189-
; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
190-
; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
191-
; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> poison, i32 [[TMP19]], i32 0
192-
; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
193-
; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
194-
; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
195-
; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
196-
; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
197-
; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
198-
; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
199-
; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
200-
; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
201-
; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], -5
152+
; GATHER-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
153+
; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
154+
; GATHER-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
155+
; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
156+
; GATHER-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
157+
; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
158+
; GATHER-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
159+
; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
160+
; GATHER-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
161+
; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
162+
; GATHER-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
163+
; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
164+
; GATHER-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
165+
; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
166+
; GATHER-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
167+
; GATHER-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]])
168+
; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP18]], -5
169+
; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
202170
; GATHER-NEXT: br label [[FOR_BODY]]
203171
;
204172
; MAX-COST-LABEL: @PR32038(
@@ -220,16 +188,16 @@ define void @PR32038(i32 %n) {
220188
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
221189
; MAX-COST: for.body:
222190
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
223-
; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
224-
; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> poison, i1 [[TMP2]], i32 0
225-
; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
226-
; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
227-
; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2
228-
; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3
229-
; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
191+
; MAX-COST-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
192+
; MAX-COST-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> poison, <4 x i1> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
193+
; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[P5]], i32 2
194+
; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP4]], i1 [[P7]], i32 3
195+
; MAX-COST-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
196+
; MAX-COST-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
197+
; MAX-COST-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
230198
; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
231199
; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
232-
; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
200+
; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
233201
; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]]
234202
; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]]
235203
; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5

llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
77
define <2 x float> @insertelement-fixed-vector() {
88
; CHECK-LABEL: @insertelement-fixed-vector(
99
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
10-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
11-
; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
12-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
13-
; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x float> [[I0]], float [[TMP3]], i32 1
14-
; CHECK-NEXT: ret <2 x float> [[I1]]
10+
; CHECK-NEXT: ret <2 x float> [[TMP1]]
1511
;
1612
%f0 = tail call fast float @llvm.fabs.f32(float undef)
1713
%f1 = tail call fast float @llvm.fabs.f32(float undef)

llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
77
define <2 x float> @insertelement-fixed-vector() {
88
; CHECK-LABEL: @insertelement-fixed-vector(
99
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
10-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
11-
; CHECK-NEXT: [[I0:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
12-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
13-
; CHECK-NEXT: [[I1:%.*]] = insertelement <2 x float> [[I0]], float [[TMP3]], i32 1
14-
; CHECK-NEXT: ret <2 x float> [[I1]]
10+
; CHECK-NEXT: ret <2 x float> [[TMP1]]
1511
;
1612
%f0 = tail call fast float @llvm.fabs.f32(float undef)
1713
%f1 = tail call fast float @llvm.fabs.f32(float undef)

llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,8 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
123123
; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]]
124124
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> <i32 0, i32 3>
125125
; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]]
126-
; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
127-
; CHECK-NEXT: ret <4 x i32> [[TMP3_3]]
126+
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
127+
; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]]
128128
;
129129
%v0.0 = extractelement <2 x i32> %v0, i32 0
130130
%v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -152,15 +152,15 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
152152
; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
153153
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
154154
; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]]
155-
; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
156-
; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[TMP0_0]], [[TMP0_1]]
157-
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
158-
; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
159-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
160-
; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1_0]], i32 0
161-
; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP1_1]], i32 1
162-
; CHECK-NEXT: [[TMP2_3:%.*]] = shufflevector <4 x i32> [[TMP2_1]], <4 x i32> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
163-
; CHECK-NEXT: ret <4 x i32> [[TMP2_3]]
155+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i32 0
156+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i32 0
157+
; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
158+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
159+
; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]]
160+
; CHECK-NEXT: [[TMP2_11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
161+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
162+
; CHECK-NEXT: [[TMP2_32:%.*]] = shufflevector <4 x i32> [[TMP2_11]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
163+
; CHECK-NEXT: ret <4 x i32> [[TMP2_32]]
164164
;
165165
%v0.0 = extractelement <2 x i32> %v0, i32 0
166166
%v0.1 = extractelement <2 x i32> %v0, i32 1
@@ -198,11 +198,11 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
198198
; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
199199
; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
200200
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
201-
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
202201
; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2_0]], i32 0
203202
; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
204-
; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
205-
; CHECK-NEXT: ret <4 x i32> [[TMP3_3]]
203+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
204+
; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
205+
; CHECK-NEXT: ret <4 x i32> [[TMP3_31]]
206206
;
207207
%v0.0 = extractelement <2 x i32> %v0, i32 0
208208
%v0.1 = extractelement <2 x i32> %v0, i32 1

0 commit comments

Comments
 (0)