
Commit 1cb8598

Revert "Revert "[RISCV] Shrink vslideup's LMUL when lowering fixed insert_subvector (llvm#65997)""
This reverts commit 3a6cc52.
1 parent ca8d02d commit 1cb8598

4 files changed: +228 -244 lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 17 additions & 0 deletions
@@ -8920,6 +8920,17 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
       return DAG.getBitcast(Op.getValueType(), SubVec);
     }
 
+    // Shrink down Vec so we're performing the slideup on a smaller LMUL.
+    unsigned LastIdx = OrigIdx + SubVecVT.getVectorNumElements() - 1;
+    MVT OrigContainerVT = ContainerVT;
+    SDValue OrigVec = Vec;
+    if (auto ShrunkVT =
+            getSmallestVTForIndex(ContainerVT, LastIdx, DL, DAG, Subtarget)) {
+      ContainerVT = *ShrunkVT;
+      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ContainerVT, Vec,
+                        DAG.getVectorIdxConstant(0, DL));
+    }
+
     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
                          DAG.getUNDEF(ContainerVT), SubVec,
                          DAG.getConstant(0, DL, XLenVT));
@@ -8946,6 +8957,12 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                            SlideupAmt, Mask, VL, Policy);
     }
 
+    // If we performed the slideup on a smaller LMUL, insert the result back
+    // into the rest of the vector.
+    if (ContainerVT != OrigContainerVT)
+      SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, OrigContainerVT, OrigVec,
+                           SubVec, DAG.getVectorIdxConstant(0, DL));
+
     if (VecVT.isFixedLengthVector())
       SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
     return DAG.getBitcast(Op.getValueType(), SubVec);
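
For readers skimming the diff, the idea behind the shrink: the slideup only has to write elements up to OrigIdx + SubVecVT.getVectorNumElements() - 1, so it can be issued on the smallest LMUL register group that is guaranteed to cover that last index, and the result is then re-inserted into the original (larger) container. The sketch below is a standalone illustration of that index-to-LMUL arithmetic only, assuming a guaranteed minimum VLEN of 128 bits; it is not LLVM's getSmallestVTForIndex, and the helper name smallestCoveringLMUL is invented for this example.

// Standalone sketch (not LLVM code): pick the smallest LMUL whose register
// group is guaranteed to hold element LastIdx, given SEW-bit elements and a
// minimum VLEN of 128 bits.
#include <cstdio>

// Returns the chosen LMUL (1, 2, 4 or 8), or 0 if even LMUL=8 cannot be
// proven to cover LastIdx at compile time.
unsigned smallestCoveringLMUL(unsigned SEW, unsigned LastIdx,
                              unsigned MinVLen = 128) {
  for (unsigned LMUL = 1; LMUL <= 8; LMUL *= 2) {
    // With VLEN >= MinVLen, an LMUL-sized group holds at least MinElts
    // elements of SEW bits each.
    unsigned MinElts = (MinVLen / SEW) * LMUL;
    if (LastIdx < MinElts)
      return LMUL;
  }
  return 0; // Fall back to the original container type.
}

int main() {
  // Inserting <2 x i32> at index 2 touches elements up to index 3: m1 is
  // enough, which is why the m4 vsetivli in the tests becomes m1.
  std::printf("e32, last index 3 -> m%u\n", smallestCoveringLMUL(32, 3));
  // Inserting <2 x i32> at index 6 touches index 7: needs m2.
  std::printf("e32, last index 7 -> m%u\n", smallestCoveringLMUL(32, 7));
  // Inserting <2 x i64> at index 4 touches index 5: needs m4.
  std::printf("e64, last index 5 -> m%u\n", smallestCoveringLMUL(64, 5));
}

These values line up with the test updates below: the e32 slideups that stop at index 3 drop from m4 to m1, the one that reaches index 7 uses m2, and the e64 slideup that reaches index 5 uses m4 instead of m8.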

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll

Lines changed: 21 additions & 24 deletions
@@ -14,7 +14,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 2, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-NEXT: vmv.v.v v8, v12
 ; CHECK-NEXT: ret
   %sv = load <2 x i32>, ptr %svp
@@ -27,7 +27,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma
 ; CHECK-NEXT: vslideup.vi v8, v12, 2
 ; CHECK-NEXT: ret
   %sv = load <2 x i32>, ptr %svp
@@ -40,7 +40,7 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT: vle32.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma
 ; CHECK-NEXT: vslideup.vi v8, v12, 6
 ; CHECK-NEXT: ret
   %sv = load <2 x i32>, ptr %svp
@@ -51,22 +51,19 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
 define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
 ; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX2: # %bb.0:
-; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-NEXT: vle32.v v12, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX2-NEXT: vmv.v.v v8, v12
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
 ; LMULMAX2-NEXT: ret
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: addi a1, a0, 16
 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v12, (a0)
-; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v16, (a0)
-; LMULMAX1-NEXT: vsetivli zero, 4, e32, m4, tu, ma
-; LMULMAX1-NEXT: vmv.v.v v8, v12
-; LMULMAX1-NEXT: vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v16, 4
+; LMULMAX1-NEXT: vle32.v v12, (a1)
+; LMULMAX1-NEXT: vsetvli zero, zero, e32, m1, tu, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v12, 4
 ; LMULMAX1-NEXT: ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -84,14 +81,14 @@ define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8:
 ; LMULMAX1: # %bb.0:
-; LMULMAX1-NEXT: addi a1, a0, 16
 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v12, (a1)
+; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
 ; LMULMAX1-NEXT: vle32.v v16, (a0)
 ; LMULMAX1-NEXT: vsetivli zero, 12, e32, m4, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v16, 8
+; LMULMAX1-NEXT: vslideup.vi v8, v12, 8
 ; LMULMAX1-NEXT: vsetivli zero, 16, e32, m4, tu, ma
-; LMULMAX1-NEXT: vslideup.vi v8, v12, 12
+; LMULMAX1-NEXT: vslideup.vi v8, v16, 12
 ; LMULMAX1-NEXT: ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
@@ -166,7 +163,7 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; LMULMAX2-NEXT: vle32.v v8, (a1)
 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT: vle32.v v10, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; LMULMAX2-NEXT: vsetivli zero, 2, e32, m1, tu, ma
 ; LMULMAX2-NEXT: vmv.v.v v10, v8
 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT: vse32.v v10, (a0)
@@ -197,7 +194,7 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 ; LMULMAX2-NEXT: vle32.v v8, (a1)
 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT: vle32.v v10, (a0)
-; LMULMAX2-NEXT: vsetivli zero, 4, e32, m2, tu, ma
+; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, tu, ma
 ; LMULMAX2-NEXT: vslideup.vi v10, v8, 2
 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; LMULMAX2-NEXT: vse32.v v10, (a0)
@@ -508,9 +505,9 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, <vscale x 16 x i64>* %o
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vle64.v v16, (a1)
-; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v16, 4
+; CHECK-NEXT: vle64.v v12, (a1)
+; CHECK-NEXT: vsetivli zero, 6, e64, m4, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 4
 ; CHECK-NEXT: vs8r.v v8, (a2)
 ; CHECK-NEXT: ret
   %sv0 = load <2 x i64>, ptr %psv0
@@ -539,7 +536,7 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, <vscale x 16 x i64>* %out) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 4, e64, m8, ta, ma
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT: vslideup.vi v16, v8, 2
 ; CHECK-NEXT: vs8r.v v16, (a1)
 ; CHECK-NEXT: ret

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll

Lines changed: 40 additions & 40 deletions
@@ -27,13 +27,13 @@ define void @widen_3xv4i16(ptr %x, ptr %z) {
 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT: vle16.v v8, (a0)
 ; CHECK-NEXT: addi a2, a0, 8
-; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: vle16.v v9, (a2)
 ; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vle16.v v12, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vle16.v v10, (a0)
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 4
 ; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: vslideup.vi v8, v10, 8
 ; CHECK-NEXT: vse16.v v8, (a1)
 ; CHECK-NEXT: ret
   %a = load <4 x i16>, ptr %x
@@ -75,17 +75,17 @@ define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
 ; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NO-MISALIGN-NEXT: vle8.v v8, (a0)
 ; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 8
-; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT: vle8.v v9, (a2)
 ; CHECK-NO-MISALIGN-NEXT: addi a2, a0, 16
-; CHECK-NO-MISALIGN-NEXT: vle8.v v12, (a2)
+; CHECK-NO-MISALIGN-NEXT: vle8.v v10, (a2)
 ; CHECK-NO-MISALIGN-NEXT: addi a0, a0, 24
-; CHECK-NO-MISALIGN-NEXT: vle8.v v14, (a0)
-; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NO-MISALIGN-NEXT: vle8.v v12, (a0)
+; CHECK-NO-MISALIGN-NEXT: vsetvli zero, zero, e16, m1, tu, ma
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v9, 4
 ; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v10, 8
 ; CHECK-NO-MISALIGN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v14, 12
+; CHECK-NO-MISALIGN-NEXT: vslideup.vi v8, v12, 12
 ; CHECK-NO-MISALIGN-NEXT: vse16.v v8, (a1)
 ; CHECK-NO-MISALIGN-NEXT: ret
 ;
@@ -188,17 +188,17 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT: vle16.v v8, (a0)
 ; CHECK-NEXT: addi a2, a0, 2
-; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: vle16.v v9, (a2)
 ; CHECK-NEXT: addi a2, a0, 6
-; CHECK-NEXT: vle16.v v12, (a2)
+; CHECK-NEXT: vle16.v v10, (a2)
 ; CHECK-NEXT: addi a0, a0, 8
-; CHECK-NEXT: vle16.v v14, (a0)
-; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vle16.v v12, (a0)
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 4
 ; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: vslideup.vi v8, v10, 8
 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v14, 12
+; CHECK-NEXT: vslideup.vi v8, v12, 12
 ; CHECK-NEXT: vse16.v v8, (a1)
 ; CHECK-NEXT: ret
   %a = load <4 x i16>, ptr %x
@@ -258,17 +258,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
 ; RV32-NEXT: vle16.v v8, (a0)
 ; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: vle16.v v10, (a0)
+; RV32-NEXT: vle16.v v9, (a0)
 ; RV32-NEXT: add a0, a0, a4
-; RV32-NEXT: vle16.v v12, (a0)
+; RV32-NEXT: vle16.v v10, (a0)
 ; RV32-NEXT: add a0, a0, a2
-; RV32-NEXT: vle16.v v14, (a0)
-; RV32-NEXT: vsetivli zero, 8, e16, m2, tu, ma
-; RV32-NEXT: vslideup.vi v8, v10, 4
+; RV32-NEXT: vle16.v v12, (a0)
+; RV32-NEXT: vsetivli zero, 8, e16, m1, tu, ma
+; RV32-NEXT: vslideup.vi v8, v9, 4
 ; RV32-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; RV32-NEXT: vslideup.vi v8, v12, 8
+; RV32-NEXT: vslideup.vi v8, v10, 8
 ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT: vslideup.vi v8, v14, 12
+; RV32-NEXT: vslideup.vi v8, v12, 12
 ; RV32-NEXT: vse16.v v8, (a1)
 ; RV32-NEXT: ret
 ;
@@ -277,17 +277,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
 ; RV64-NEXT: vle16.v v8, (a0)
 ; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: vle16.v v10, (a0)
+; RV64-NEXT: vle16.v v9, (a0)
 ; RV64-NEXT: add a0, a0, a3
-; RV64-NEXT: vle16.v v12, (a0)
+; RV64-NEXT: vle16.v v10, (a0)
 ; RV64-NEXT: add a0, a0, a2
-; RV64-NEXT: vle16.v v14, (a0)
-; RV64-NEXT: vsetivli zero, 8, e16, m2, tu, ma
-; RV64-NEXT: vslideup.vi v8, v10, 4
+; RV64-NEXT: vle16.v v12, (a0)
+; RV64-NEXT: vsetivli zero, 8, e16, m1, tu, ma
+; RV64-NEXT: vslideup.vi v8, v9, 4
 ; RV64-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; RV64-NEXT: vslideup.vi v8, v12, 8
+; RV64-NEXT: vslideup.vi v8, v10, 8
 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT: vslideup.vi v8, v14, 12
+; RV64-NEXT: vslideup.vi v8, v12, 12
 ; RV64-NEXT: vse16.v v8, (a1)
 ; RV64-NEXT: ret
 ;
@@ -296,17 +296,17 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) {
 ; ZVE64F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVE64F-NEXT: vle16.v v8, (a0)
 ; ZVE64F-NEXT: add a0, a0, a2
-; ZVE64F-NEXT: vle16.v v10, (a0)
+; ZVE64F-NEXT: vle16.v v9, (a0)
 ; ZVE64F-NEXT: add a0, a0, a3
-; ZVE64F-NEXT: vle16.v v12, (a0)
+; ZVE64F-NEXT: vle16.v v10, (a0)
 ; ZVE64F-NEXT: add a0, a0, a2
-; ZVE64F-NEXT: vle16.v v14, (a0)
-; ZVE64F-NEXT: vsetivli zero, 8, e16, m2, tu, ma
-; ZVE64F-NEXT: vslideup.vi v8, v10, 4
+; ZVE64F-NEXT: vle16.v v12, (a0)
+; ZVE64F-NEXT: vsetivli zero, 8, e16, m1, tu, ma
+; ZVE64F-NEXT: vslideup.vi v8, v9, 4
 ; ZVE64F-NEXT: vsetivli zero, 12, e16, m2, tu, ma
-; ZVE64F-NEXT: vslideup.vi v8, v12, 8
+; ZVE64F-NEXT: vslideup.vi v8, v10, 8
 ; ZVE64F-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; ZVE64F-NEXT: vslideup.vi v8, v14, 12
+; ZVE64F-NEXT: vslideup.vi v8, v12, 12
 ; ZVE64F-NEXT: vse16.v v8, (a1)
 ; ZVE64F-NEXT: ret
   %a = load <4 x i16>, ptr %x
