Skip to content

Commit 8a70102

Browse files
committed
[ARM] Lower i1 concat via MVETRUNC
The MVETRUNC operation can perform the same truncation of two vectors without requiring lane inserts/extracts for every vector lane. This moves the i1 concat lowering to use it for v8i1 and v16i1 result types, trading a bit of extra stack space for fewer instructions.
1 parent e494a96 commit 8a70102

8 files changed

+460
-732
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9095,13 +9095,21 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
90959095
getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
90969096
unsigned NumElts = 2 * Op1VT.getVectorNumElements();
90979097

9098+
EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9099+
if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
9100+
// Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
9101+
// ConcatVT.
9102+
SDValue ConVec =
9103+
DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
9104+
return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
9105+
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9106+
}
9107+
90989108
// Extract the vector elements from Op1 and Op2 one by one and truncate them
90999109
// to be the right size for the destination. For example, if Op1 is v4i1
91009110
// then the promoted vector is v4i32. The result of concatenation gives a
91019111
// v8i1, which when promoted is v8i16. That means each i32 element from Op1
91029112
// needs truncating to i16 and inserting in the result.
9103-
EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
9104-
SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
91059113
auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
91069114
EVT NewVT = NewV.getValueType();
91079115
EVT ConcatVT = ConVec.getValueType();
@@ -9119,6 +9127,7 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
91199127
return ConVec;
91209128
};
91219129
unsigned j = 0;
9130+
SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
91229131
ConVec = ExtractInto(NewV1, ConVec, j);
91239132
ConVec = ExtractInto(NewV2, ConVec, j);
91249133

llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -284,40 +284,33 @@ define half @fadd_select_fneg_posk_f16(i32 %arg0, half %x, half %y) {
284284
define <8 x half> @fadd_vselect_fneg_posk_v8f16(<8 x i32> %arg0, <8 x half> %x, <8 x half> %y) {
285285
; CHECK-LABEL: fadd_vselect_fneg_posk_v8f16:
286286
; CHECK: @ %bb.0:
287-
; CHECK-NEXT: push {r4, r5, r6, lr}
287+
; CHECK-NEXT: sub sp, sp, #16
288288
; CHECK-NEXT: vmov d0, r0, r1
289-
; CHECK-NEXT: vmov.i8 q1, #0xff
290-
; CHECK-NEXT: vmov d1, r2, r3
291289
; CHECK-NEXT: add r0, sp, #16
290+
; CHECK-NEXT: vmov d1, r2, r3
291+
; CHECK-NEXT: vldrw.u32 q3, [r0]
292292
; CHECK-NEXT: vcmp.i32 eq, q0, zr
293293
; CHECK-NEXT: vmov.i8 q0, #0x0
294+
; CHECK-NEXT: vmov.i8 q1, #0xff
295+
; CHECK-NEXT: mov r0, sp
294296
; CHECK-NEXT: vpsel q2, q1, q0
295-
; CHECK-NEXT: vldrw.u32 q3, [r0]
296-
; CHECK-NEXT: vmov r2, r1, d4
297-
; CHECK-NEXT: add r12, sp, #32
298-
; CHECK-NEXT: vmov r4, r5, d5
299-
; CHECK-NEXT: vmov.16 q2[0], r2
300-
; CHECK-NEXT: vmov.16 q2[1], r1
301297
; CHECK-NEXT: vcmp.i32 eq, q3, zr
302-
; CHECK-NEXT: vpsel q1, q1, q0
303-
; CHECK-NEXT: vmov.16 q2[2], r4
304-
; CHECK-NEXT: vmov r3, r0, d2
305-
; CHECK-NEXT: vmov.16 q2[3], r5
306-
; CHECK-NEXT: vmov.16 q2[4], r3
307-
; CHECK-NEXT: vmov r6, lr, d3
308-
; CHECK-NEXT: vmov.16 q2[5], r0
309-
; CHECK-NEXT: vldrw.u32 q1, [r12]
310-
; CHECK-NEXT: vmov.16 q2[6], r6
311-
; CHECK-NEXT: vmov.i16 q0, #0xc400
312-
; CHECK-NEXT: vmov.16 q2[7], lr
298+
; CHECK-NEXT: vpsel q0, q1, q0
299+
; CHECK-NEXT: vstrh.32 q2, [r0]
300+
; CHECK-NEXT: vstrh.32 q0, [r0, #8]
301+
; CHECK-NEXT: add r1, sp, #32
302+
; CHECK-NEXT: vldrw.u32 q2, [r0]
303+
; CHECK-NEXT: vldrw.u32 q0, [r1]
304+
; CHECK-NEXT: vmov.i16 q1, #0xc400
313305
; CHECK-NEXT: add r0, sp, #48
314306
; CHECK-NEXT: vcmp.i16 ne, q2, zr
315-
; CHECK-NEXT: vpsel q0, q1, q0
307+
; CHECK-NEXT: vpsel q0, q0, q1
316308
; CHECK-NEXT: vldrw.u32 q1, [r0]
317309
; CHECK-NEXT: vsub.f16 q0, q1, q0
318310
; CHECK-NEXT: vmov r0, r1, d0
319311
; CHECK-NEXT: vmov r2, r3, d1
320-
; CHECK-NEXT: pop {r4, r5, r6, pc}
312+
; CHECK-NEXT: add sp, sp, #16
313+
; CHECK-NEXT: bx lr
321314
%cmp = icmp eq <8 x i32> %arg0, zeroinitializer
322315
%neg.x = fneg <8 x half> %x
323316
%select = select <8 x i1> %cmp, <8 x half> %neg.x, <8 x half> <half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0, half 4.0>

llvm/test/CodeGen/Thumb2/active_lane_mask.ll

Lines changed: 76 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -146,54 +146,47 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
146146
define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
147147
; CHECK-LABEL: v8i16:
148148
; CHECK: @ %bb.0:
149-
; CHECK-NEXT: vpush {d8, d9}
149+
; CHECK-NEXT: push {r4, lr}
150+
; CHECK-NEXT: sub sp, #16
150151
; CHECK-NEXT: adr.w r12, .LCPI3_0
151152
; CHECK-NEXT: vdup.32 q1, r1
152153
; CHECK-NEXT: vldrw.u32 q0, [r12]
153-
; CHECK-NEXT: vmov.i8 q2, #0x0
154-
; CHECK-NEXT: vmov.i8 q3, #0xff
154+
; CHECK-NEXT: vmov.i8 q2, #0xff
155+
; CHECK-NEXT: mov r4, sp
156+
; CHECK-NEXT: adr r1, .LCPI3_1
155157
; CHECK-NEXT: vqadd.u32 q0, q0, r0
156158
; CHECK-NEXT: vcmp.u32 hi, q1, q0
157-
; CHECK-NEXT: vpsel q4, q3, q2
158-
; CHECK-NEXT: vmov r1, r12, d8
159-
; CHECK-NEXT: vmov.16 q0[0], r1
160-
; CHECK-NEXT: vmov.16 q0[1], r12
161-
; CHECK-NEXT: vmov r1, r12, d9
162-
; CHECK-NEXT: vmov.16 q0[2], r1
163-
; CHECK-NEXT: adr r1, .LCPI3_1
164-
; CHECK-NEXT: vldrw.u32 q4, [r1]
165-
; CHECK-NEXT: vmov.16 q0[3], r12
166-
; CHECK-NEXT: vqadd.u32 q4, q4, r0
167-
; CHECK-NEXT: vcmp.u32 hi, q1, q4
168-
; CHECK-NEXT: vpsel q1, q3, q2
169-
; CHECK-NEXT: vmov r0, r1, d2
170-
; CHECK-NEXT: vmov.16 q0[4], r0
171-
; CHECK-NEXT: vmov.16 q0[5], r1
172-
; CHECK-NEXT: vmov r0, r1, d3
173-
; CHECK-NEXT: vmov.16 q0[6], r0
174-
; CHECK-NEXT: add r0, sp, #24
175-
; CHECK-NEXT: vmov.16 q0[7], r1
159+
; CHECK-NEXT: vmov.i8 q0, #0x0
160+
; CHECK-NEXT: vpsel q3, q2, q0
161+
; CHECK-NEXT: vstrh.32 q3, [r4, #8]
162+
; CHECK-NEXT: vldrw.u32 q3, [r1]
163+
; CHECK-NEXT: vqadd.u32 q3, q3, r0
164+
; CHECK-NEXT: add r0, sp, #32
165+
; CHECK-NEXT: vcmp.u32 hi, q1, q3
176166
; CHECK-NEXT: vldrw.u32 q1, [r0]
177-
; CHECK-NEXT: vcmp.i16 ne, q0, zr
178-
; CHECK-NEXT: vldr d1, [sp, #16]
167+
; CHECK-NEXT: vpsel q0, q2, q0
168+
; CHECK-NEXT: vstrh.32 q0, [r4]
169+
; CHECK-NEXT: vldr d1, [sp, #24]
170+
; CHECK-NEXT: vldrw.u32 q2, [r4]
179171
; CHECK-NEXT: vmov d0, r2, r3
172+
; CHECK-NEXT: vcmp.i16 ne, q2, zr
180173
; CHECK-NEXT: vpsel q0, q0, q1
181174
; CHECK-NEXT: vmov r0, r1, d0
182175
; CHECK-NEXT: vmov r2, r3, d1
183-
; CHECK-NEXT: vpop {d8, d9}
184-
; CHECK-NEXT: bx lr
176+
; CHECK-NEXT: add sp, #16
177+
; CHECK-NEXT: pop {r4, pc}
185178
; CHECK-NEXT: .p2align 4
186179
; CHECK-NEXT: @ %bb.1:
187180
; CHECK-NEXT: .LCPI3_0:
188-
; CHECK-NEXT: .long 0 @ 0x0
189-
; CHECK-NEXT: .long 1 @ 0x1
190-
; CHECK-NEXT: .long 2 @ 0x2
191-
; CHECK-NEXT: .long 3 @ 0x3
192-
; CHECK-NEXT: .LCPI3_1:
193181
; CHECK-NEXT: .long 4 @ 0x4
194182
; CHECK-NEXT: .long 5 @ 0x5
195183
; CHECK-NEXT: .long 6 @ 0x6
196184
; CHECK-NEXT: .long 7 @ 0x7
185+
; CHECK-NEXT: .LCPI3_1:
186+
; CHECK-NEXT: .long 0 @ 0x0
187+
; CHECK-NEXT: .long 1 @ 0x1
188+
; CHECK-NEXT: .long 2 @ 0x2
189+
; CHECK-NEXT: .long 3 @ 0x3
197190
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %TC)
198191
%select = select <8 x i1> %active.lane.mask, <8 x i16> %V1, <8 x i16> %V2
199192
ret <8 x i16> %select
@@ -202,122 +195,79 @@ define <8 x i16> @v8i16(i32 %index, i32 %TC, <8 x i16> %V1, <8 x i16> %V2) {
202195
define <16 x i8> @v16i8(i32 %index, i32 %TC, <16 x i8> %V1, <16 x i8> %V2) {
203196
; CHECK-LABEL: v16i8:
204197
; CHECK: @ %bb.0:
205-
; CHECK-NEXT: vpush {d8, d9, d10, d11}
198+
; CHECK-NEXT: push {r4, r5, r7, lr}
199+
; CHECK-NEXT: sub sp, #48
206200
; CHECK-NEXT: adr.w r12, .LCPI4_0
207-
; CHECK-NEXT: vdup.32 q3, r1
201+
; CHECK-NEXT: vdup.32 q2, r1
208202
; CHECK-NEXT: vldrw.u32 q0, [r12]
209203
; CHECK-NEXT: vmov.i8 q1, #0xff
204+
; CHECK-NEXT: add r5, sp, #16
205+
; CHECK-NEXT: adr r1, .LCPI4_1
210206
; CHECK-NEXT: vqadd.u32 q0, q0, r0
211-
; CHECK-NEXT: vcmp.u32 hi, q3, q0
207+
; CHECK-NEXT: adr r4, .LCPI4_3
208+
; CHECK-NEXT: vcmp.u32 hi, q2, q0
212209
; CHECK-NEXT: vmov.i8 q0, #0x0
213-
; CHECK-NEXT: vpsel q4, q1, q0
214-
; CHECK-NEXT: vmov r1, r12, d8
215-
; CHECK-NEXT: vmov.16 q2[0], r1
216-
; CHECK-NEXT: vmov.16 q2[1], r12
217-
; CHECK-NEXT: vmov r1, r12, d9
218-
; CHECK-NEXT: vmov.16 q2[2], r1
219-
; CHECK-NEXT: adr r1, .LCPI4_1
220-
; CHECK-NEXT: vldrw.u32 q4, [r1]
221-
; CHECK-NEXT: vmov.16 q2[3], r12
222-
; CHECK-NEXT: vqadd.u32 q4, q4, r0
223-
; CHECK-NEXT: vcmp.u32 hi, q3, q4
224-
; CHECK-NEXT: vpsel q4, q1, q0
225-
; CHECK-NEXT: vmov r1, r12, d8
226-
; CHECK-NEXT: vmov.16 q2[4], r1
227-
; CHECK-NEXT: vmov.16 q2[5], r12
228-
; CHECK-NEXT: vmov r1, r12, d9
229-
; CHECK-NEXT: vmov.16 q2[6], r1
230-
; CHECK-NEXT: vmov.16 q2[7], r12
231-
; CHECK-NEXT: vcmp.i16 ne, q2, zr
232-
; CHECK-NEXT: vpsel q4, q1, q0
233-
; CHECK-NEXT: vmov.u16 r1, q4[0]
234-
; CHECK-NEXT: vmov.8 q2[0], r1
235-
; CHECK-NEXT: vmov.u16 r1, q4[1]
236-
; CHECK-NEXT: vmov.8 q2[1], r1
237-
; CHECK-NEXT: vmov.u16 r1, q4[2]
238-
; CHECK-NEXT: vmov.8 q2[2], r1
239-
; CHECK-NEXT: vmov.u16 r1, q4[3]
240-
; CHECK-NEXT: vmov.8 q2[3], r1
241-
; CHECK-NEXT: vmov.u16 r1, q4[4]
242-
; CHECK-NEXT: vmov.8 q2[4], r1
243-
; CHECK-NEXT: vmov.u16 r1, q4[5]
244-
; CHECK-NEXT: vmov.8 q2[5], r1
245-
; CHECK-NEXT: vmov.u16 r1, q4[6]
246-
; CHECK-NEXT: vmov.8 q2[6], r1
247-
; CHECK-NEXT: vmov.u16 r1, q4[7]
248-
; CHECK-NEXT: vmov.8 q2[7], r1
210+
; CHECK-NEXT: vpsel q3, q1, q0
211+
; CHECK-NEXT: vstrh.32 q3, [r5, #8]
212+
; CHECK-NEXT: vldrw.u32 q3, [r1]
249213
; CHECK-NEXT: adr r1, .LCPI4_2
250-
; CHECK-NEXT: vldrw.u32 q4, [r1]
251-
; CHECK-NEXT: vqadd.u32 q4, q4, r0
252-
; CHECK-NEXT: vcmp.u32 hi, q3, q4
253-
; CHECK-NEXT: vpsel q5, q1, q0
254-
; CHECK-NEXT: vmov r1, r12, d10
255-
; CHECK-NEXT: vmov.16 q4[0], r1
256-
; CHECK-NEXT: vmov.16 q4[1], r12
257-
; CHECK-NEXT: vmov r1, r12, d11
258-
; CHECK-NEXT: vmov.16 q4[2], r1
259-
; CHECK-NEXT: adr r1, .LCPI4_3
260-
; CHECK-NEXT: vldrw.u32 q5, [r1]
261-
; CHECK-NEXT: vmov.16 q4[3], r12
262-
; CHECK-NEXT: vqadd.u32 q5, q5, r0
263-
; CHECK-NEXT: vcmp.u32 hi, q3, q5
214+
; CHECK-NEXT: vqadd.u32 q3, q3, r0
215+
; CHECK-NEXT: vcmp.u32 hi, q2, q3
216+
; CHECK-NEXT: vpsel q3, q1, q0
217+
; CHECK-NEXT: vstrh.32 q3, [r5]
218+
; CHECK-NEXT: vldrw.u32 q3, [r1]
219+
; CHECK-NEXT: mov r1, sp
220+
; CHECK-NEXT: vqadd.u32 q3, q3, r0
221+
; CHECK-NEXT: vcmp.u32 hi, q2, q3
264222
; CHECK-NEXT: vpsel q3, q1, q0
265-
; CHECK-NEXT: vmov r0, r1, d6
266-
; CHECK-NEXT: vmov.16 q4[4], r0
267-
; CHECK-NEXT: vmov.16 q4[5], r1
268-
; CHECK-NEXT: vmov r0, r1, d7
269-
; CHECK-NEXT: vmov.16 q4[6], r0
270-
; CHECK-NEXT: vmov.16 q4[7], r1
271-
; CHECK-NEXT: vcmp.i16 ne, q4, zr
223+
; CHECK-NEXT: vstrh.32 q3, [r1, #8]
224+
; CHECK-NEXT: vldrw.u32 q3, [r4]
225+
; CHECK-NEXT: vqadd.u32 q3, q3, r0
226+
; CHECK-NEXT: add r0, sp, #32
227+
; CHECK-NEXT: vcmp.u32 hi, q2, q3
228+
; CHECK-NEXT: vpsel q2, q1, q0
229+
; CHECK-NEXT: vstrh.32 q2, [r1]
230+
; CHECK-NEXT: vldrw.u32 q2, [r5]
231+
; CHECK-NEXT: vcmp.i16 ne, q2, zr
232+
; CHECK-NEXT: vpsel q2, q1, q0
233+
; CHECK-NEXT: vstrb.16 q2, [r0, #8]
234+
; CHECK-NEXT: vldrw.u32 q2, [r1]
235+
; CHECK-NEXT: add r1, sp, #72
236+
; CHECK-NEXT: vcmp.i16 ne, q2, zr
272237
; CHECK-NEXT: vpsel q0, q1, q0
273-
; CHECK-NEXT: vmov.u16 r0, q0[0]
274-
; CHECK-NEXT: vmov.8 q2[8], r0
275-
; CHECK-NEXT: vmov.u16 r0, q0[1]
276-
; CHECK-NEXT: vmov.8 q2[9], r0
277-
; CHECK-NEXT: vmov.u16 r0, q0[2]
278-
; CHECK-NEXT: vmov.8 q2[10], r0
279-
; CHECK-NEXT: vmov.u16 r0, q0[3]
280-
; CHECK-NEXT: vmov.8 q2[11], r0
281-
; CHECK-NEXT: vmov.u16 r0, q0[4]
282-
; CHECK-NEXT: vmov.8 q2[12], r0
283-
; CHECK-NEXT: vmov.u16 r0, q0[5]
284-
; CHECK-NEXT: vmov.8 q2[13], r0
285-
; CHECK-NEXT: vmov.u16 r0, q0[6]
286-
; CHECK-NEXT: vmov.8 q2[14], r0
287-
; CHECK-NEXT: vmov.u16 r0, q0[7]
288-
; CHECK-NEXT: vmov.8 q2[15], r0
289-
; CHECK-NEXT: add r0, sp, #40
290-
; CHECK-NEXT: vldr d1, [sp, #32]
291-
; CHECK-NEXT: vldrw.u32 q1, [r0]
292-
; CHECK-NEXT: vcmp.i8 ne, q2, zr
238+
; CHECK-NEXT: vldrw.u32 q1, [r1]
239+
; CHECK-NEXT: vstrb.16 q0, [r0]
240+
; CHECK-NEXT: vldr d1, [sp, #64]
241+
; CHECK-NEXT: vldrw.u32 q2, [r0]
293242
; CHECK-NEXT: vmov d0, r2, r3
243+
; CHECK-NEXT: vcmp.i8 ne, q2, zr
294244
; CHECK-NEXT: vpsel q0, q0, q1
295245
; CHECK-NEXT: vmov r0, r1, d0
296246
; CHECK-NEXT: vmov r2, r3, d1
297-
; CHECK-NEXT: vpop {d8, d9, d10, d11}
298-
; CHECK-NEXT: bx lr
247+
; CHECK-NEXT: add sp, #48
248+
; CHECK-NEXT: pop {r4, r5, r7, pc}
299249
; CHECK-NEXT: .p2align 4
300250
; CHECK-NEXT: @ %bb.1:
301251
; CHECK-NEXT: .LCPI4_0:
302-
; CHECK-NEXT: .long 0 @ 0x0
303-
; CHECK-NEXT: .long 1 @ 0x1
304-
; CHECK-NEXT: .long 2 @ 0x2
305-
; CHECK-NEXT: .long 3 @ 0x3
252+
; CHECK-NEXT: .long 12 @ 0xc
253+
; CHECK-NEXT: .long 13 @ 0xd
254+
; CHECK-NEXT: .long 14 @ 0xe
255+
; CHECK-NEXT: .long 15 @ 0xf
306256
; CHECK-NEXT: .LCPI4_1:
307-
; CHECK-NEXT: .long 4 @ 0x4
308-
; CHECK-NEXT: .long 5 @ 0x5
309-
; CHECK-NEXT: .long 6 @ 0x6
310-
; CHECK-NEXT: .long 7 @ 0x7
311-
; CHECK-NEXT: .LCPI4_2:
312257
; CHECK-NEXT: .long 8 @ 0x8
313258
; CHECK-NEXT: .long 9 @ 0x9
314259
; CHECK-NEXT: .long 10 @ 0xa
315260
; CHECK-NEXT: .long 11 @ 0xb
261+
; CHECK-NEXT: .LCPI4_2:
262+
; CHECK-NEXT: .long 4 @ 0x4
263+
; CHECK-NEXT: .long 5 @ 0x5
264+
; CHECK-NEXT: .long 6 @ 0x6
265+
; CHECK-NEXT: .long 7 @ 0x7
316266
; CHECK-NEXT: .LCPI4_3:
317-
; CHECK-NEXT: .long 12 @ 0xc
318-
; CHECK-NEXT: .long 13 @ 0xd
319-
; CHECK-NEXT: .long 14 @ 0xe
320-
; CHECK-NEXT: .long 15 @ 0xf
267+
; CHECK-NEXT: .long 0 @ 0x0
268+
; CHECK-NEXT: .long 1 @ 0x1
269+
; CHECK-NEXT: .long 2 @ 0x2
270+
; CHECK-NEXT: .long 3 @ 0x3
321271
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %TC)
322272
%select = select <16 x i1> %active.lane.mask, <16 x i8> %V1, <16 x i8> %V2
323273
ret <16 x i8> %select

0 commit comments

Comments
 (0)