
Commit b5ff71e

[RISCV] Shrink vslideup's LMUL when lowering fixed insert_subvector (#65997)
Similar to #65598, if we're using a vslideup to insert a fixed-length vector into another vector, then we can work out the minimum number of registers it will need to slide up across given the minimum VLEN, and shrink the type operated on to reduce LMUL accordingly. This is somewhat dependent on #66211, since this change introduces a subregister copy that triggers a crash with -early-live-intervals in one of the tests. Stacked upon #66211.
1 parent 3fa5035 commit b5ff71e
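
As a rough illustration of that reasoning, the sketch below (a hypothetical standalone helper, not the getSmallestVTForIndex routine from #65598 that the patch actually calls) picks the smallest register-group size that still covers the last element the slideup writes, given the guaranteed minimum VLEN and the element width:

// Hypothetical sketch, not LLVM's implementation: return the smallest integer
// LMUL in {1, 2, 4, 8} whose register group (LMUL registers, each holding
// MinVLenBits / SEWBits elements) still contains element LastIdx.
unsigned smallestLMULCovering(unsigned LastIdx, unsigned SEWBits,
                              unsigned MinVLenBits) {
  unsigned ElemsPerReg = MinVLenBits / SEWBits; // elements in one register
  for (unsigned LMUL = 1; LMUL < 8; LMUL *= 2)
    if (LastIdx < ElemsPerReg * LMUL)
      return LMUL;
  return 8; // Needs the full LMUL=8 register group.
}

For example, inserting a <2 x i32> subvector at element index 2 touches elements 2 and 3; with a minimum VLEN of 128 bits those fit in a single e32 register, so the slideup can run at m1 instead of m4, which is the change visible in the updated tests below.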

4 files changed: +229, -244 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 18 additions & 0 deletions
@@ -8629,6 +8629,18 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     ContainerVT = getContainerForFixedLengthVector(VecVT);
     Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
   }
+
+  // Shrink down Vec so we're performing the slideup on a smaller LMUL.
+  unsigned LastIdx = OrigIdx + SubVecVT.getVectorNumElements() - 1;
+  MVT OrigContainerVT = ContainerVT;
+  SDValue OrigVec = Vec;
+  if (auto ShrunkVT =
+          getSmallestVTForIndex(ContainerVT, LastIdx, DL, DAG, Subtarget)) {
+    ContainerVT = *ShrunkVT;
+    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+                      DAG.getVectorIdxConstant(0, DL));
+  }
+
   SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
                        DAG.getUNDEF(ContainerVT), SubVec,
                        DAG.getConstant(0, DL, XLenVT));
@@ -8659,6 +8671,12 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                        SlideupAmt, Mask, VL, Policy);
   }

+  // If we performed the slideup on a smaller LMUL, insert the result back
+  // into the rest of the vector.
+  if (ContainerVT != OrigContainerVT)
+    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
+                         SubVec, DAG.getVectorIdxConstant(0, DL));
+
   if (VecVT.isFixedLengthVector())
     SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
   return DAG.getBitcast(Op.getValueType(), SubVec);
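
Working the same arithmetic through the tests that follow (which appear to assume a minimum VLEN of 128 bits): a <2 x i32> insert whose last touched element is index 3 now performs its slideup at m1 instead of m4, one ending at index 7 drops to m2, and the <2 x i64> insert ending at element index 5 drops from m8 to m4.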

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll

Lines changed: 21 additions & 24 deletions
@@ -14,7 +14,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.v.v v8, v12
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
@@ -27,7 +27,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 2
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
@@ -40,7 +40,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v12, 6
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
@@ -51,22 +51,19 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
 define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
 ; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-NEXT:    vle32.v v12, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX2-NEXT:    vmv.v.v v8, v12
+; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX2-NEXT:    vle32.v v8, (a0)
 ; LMULMAX2-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi a1, a0, 16
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vle32.v v12, (a0)
-; LMULMAX1-NEXT:    addi a0, a0, 16
-; LMULMAX1-NEXT:    vle32.v v16, (a0)
-; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vmv.v.v v8, v12
-; LMULMAX1-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vslideup.vi v8, v16, 4
+; LMULMAX1-NEXT:    vle32.v v12, (a1)
+; LMULMAX1-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
+; LMULMAX1-NEXT:    vle32.v v8, (a0)
+; LMULMAX1-NEXT:    vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX1-NEXT:    vslideup.vi v8, v12, 4
 ; LMULMAX1-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -84,14 +81,14 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    addi a1, a0, 16
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vle32.v v12, (a1)
+; LMULMAX1-NEXT:    vle32.v v12, (a0)
+; LMULMAX1-NEXT:    addi a0, a0, 16
 ; LMULMAX1-NEXT:    vle32.v v16, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vslideup.vi v8, v16, 8
+; LMULMAX1-NEXT:    vslideup.vi v8, v12, 8
 ; LMULMAX1-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vslideup.vi v8, v12, 12
+; LMULMAX1-NEXT:    vslideup.vi v8, v16, 12
 ; LMULMAX1-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
@@ -166,7 +163,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; LMULMAX2-NEXT:    vle32.v v8, (a1)
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vle32.v v10, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; LMULMAX2-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; LMULMAX2-NEXT:    vmv.v.v v10, v8
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vse32.v v10, (a0)
@@ -197,7 +194,7 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; LMULMAX2-NEXT:    vle32.v v8, (a1)
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vle32.v v10, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; LMULMAX2-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
 ; LMULMAX2-NEXT:    vslideup.vi v10, v8, 2
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT:    vse32.v v10, (a0)
@@ -508,9 +505,9 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, <vscale x 16 x i64>* %o
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 4
+; CHECK-NEXT:    vle64.v v12, (a1)
+; CHECK-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    vs8r.v v8, (a2)
 ; CHECK-NEXT:    ret
   %sv0 = load <2 x i64>, ptr %psv0
@@ -539,7 +536,7 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, <vscale x 16 x i64>* %out) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v16, v8, 2
 ; CHECK-NEXT:    vs8r.v v16, (a1)
 ; CHECK-NEXT:    ret

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll

Lines changed: 40 additions & 40 deletions
@@ -27,13 +27,13 @@ define void @widen_3xv4i16(ptr %x, ptr %z) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a0, 8
-; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v9, (a2)
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vle16.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -75,17 +75,17 @@ define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
 ; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
-; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 12
 ; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
 ; CHECK-NO-MISALIGN-NEXT:    ret
 ;
@@ -188,17 +188,17 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a0, 2
-; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v9, (a2)
 ; CHECK-NEXT:    addi a2, a0, 6
-; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    addi a0, a0, 8
-; CHECK-NEXT:    vle16.v v14, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NEXT:    vslideup.vi v8, v12, 12
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i16>, ptr %x
@@ -258,17 +258,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a0)
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    vle16.v v10, (a0)
+; RV32-NEXT:    vle16.v v9, (a0)
 ; RV32-NEXT:    add a0, a0, a4
-; RV32-NEXT:    vle16.v v12, (a0)
+; RV32-NEXT:    vle16.v v10, (a0)
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    vle16.v v14, (a0)
-; RV32-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v8, v10, 4
+; RV32-NEXT:    vle16.v v12, (a0)
+; RV32-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; RV32-NEXT:    vslideup.vi v8, v9, 4
 ; RV32-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v8, v12, 8
+; RV32-NEXT:    vslideup.vi v8, v10, 8
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vslideup.vi v8, v14, 12
+; RV32-NEXT:    vslideup.vi v8, v12, 12
 ; RV32-NEXT:    vse16.v v8, (a1)
 ; RV32-NEXT:    ret
 ;
@@ -277,17 +277,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; RV64-NEXT:    vle16.v v8, (a0)
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    vle16.v v10, (a0)
+; RV64-NEXT:    vle16.v v9, (a0)
 ; RV64-NEXT:    add a0, a0, a3
-; RV64-NEXT:    vle16.v v12, (a0)
+; RV64-NEXT:    vle16.v v10, (a0)
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    vle16.v v14, (a0)
-; RV64-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v8, v10, 4
+; RV64-NEXT:    vle16.v v12, (a0)
+; RV64-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; RV64-NEXT:    vslideup.vi v8, v9, 4
 ; RV64-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v8, v12, 8
+; RV64-NEXT:    vslideup.vi v8, v10, 8
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vslideup.vi v8, v14, 12
+; RV64-NEXT:    vslideup.vi v8, v12, 12
 ; RV64-NEXT:    vse16.v v8, (a1)
 ; RV64-NEXT:    ret
 ;
@@ -296,17 +296,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; ZVE64F-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE64F-NEXT:    vle16.v v8, (a0)
 ; ZVE64F-NEXT:    add a0, a0, a2
-; ZVE64F-NEXT:    vle16.v v10, (a0)
+; ZVE64F-NEXT:    vle16.v v9, (a0)
 ; ZVE64F-NEXT:    add a0, a0, a3
-; ZVE64F-NEXT:    vle16.v v12, (a0)
+; ZVE64F-NEXT:    vle16.v v10, (a0)
 ; ZVE64F-NEXT:    add a0, a0, a2
-; ZVE64F-NEXT:    vle16.v v14, (a0)
-; ZVE64F-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v10, 4
+; ZVE64F-NEXT:    vle16.v v12, (a0)
+; ZVE64F-NEXT:    vsetivli zero, 8, e16, m1, tu, ma
+; ZVE64F-NEXT:    vslideup.vi v8, v9, 4
 ; ZVE64F-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v12, 8
+; ZVE64F-NEXT:    vslideup.vi v8, v10, 8
 ; ZVE64F-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVE64F-NEXT:    vslideup.vi v8, v14, 12
+; ZVE64F-NEXT:    vslideup.vi v8, v12, 12
 ; ZVE64F-NEXT:    vse16.v v8, (a1)
 ; ZVE64F-NEXT:    ret
   %a = load <4 x i16>, ptr %x
