
Commit f0a9aac

[RISCV] Use vmv.s.x for a constant build_vector when the entire size is at most 32 bits
We have a variant of this for splats already, but hadn't handled the case where a single copy of the wider element can be inserted, producing the entire required bit pattern. This shows up mostly in very small vector shuffle tests.

Differential Revision: https://reviews.llvm.org/D157299
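To make the packing concrete, here is a minimal standalone sketch (an editor's illustration, not code from this patch; the helper name packConstantBuildVector and the std::optional encoding of undef lanes are invented for the example) of how the constant elements are masked and shifted into one wider scalar that a single vmv.s.x can then insert:

#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

// Pack a sequence of constant elements (EltBitSize bits each, element 0 in
// the low bits) into one wider scalar; lanes without a value stand in for
// undef and contribute nothing, mirroring the amalgamation loop in the patch.
static uint64_t packConstantBuildVector(
    const std::vector<std::optional<uint64_t>> &Elts, unsigned EltBitSize) {
  uint64_t EltMask = EltBitSize >= 64 ? ~0ull : ((1ull << EltBitSize) - 1);
  uint64_t SplatValue = 0;
  for (size_t I = 0; I < Elts.size(); ++I)
    if (Elts[I])
      SplatValue |= (*Elts[I] & EltMask) << (I * EltBitSize);
  return SplatValue;
}

int main() {
  // <4 x i8> <i8 0, i8 0, i8 2, i8 3> packs to 0x03020000.
  uint64_t Packed = packConstantBuildVector({0, 0, 2, 3}, 8);
  std::printf("0x%08llx\n", (unsigned long long)Packed); // prints 0x03020000
  return 0;
}

For that <4 x i8> <i8 0, i8 0, i8 2, i8 3> case from the updated tests, the packed value 0x03020000 has all-zero low 12 bits, so a single lui a0, 12320 (12320 == 0x03020000 >> 12) materializes it and the constant-pool load plus vle8.v disappears, exactly as the new buildvec_not_vid_v4i8_1 check lines below show.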
1 parent 2116921 commit f0a9aac

4 files changed (+170, -64 lines)

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

(+42, -1)
@@ -3275,6 +3275,48 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
     }
   }
 
+  // For very small build_vectors, use a single scalar insert of a constant.
+  // TODO: Base this on constant rematerialization cost, not size.
+  const unsigned EltBitSize = VT.getScalarSizeInBits();
+  if (VT.getSizeInBits() <= 32 &&
+      ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+    MVT ViaIntVT = MVT::getIntegerVT(VT.getSizeInBits());
+    assert((ViaIntVT == MVT::i16 || ViaIntVT == MVT::i32) &&
+           "Unexpected sequence type");
+    // If we can use the original VL with the modified element type, this
+    // means we only have a VTYPE toggle, not a VL toggle. TODO: Should this
+    // be moved into InsertVSETVLI?
+    unsigned ViaVecLen =
+        (Subtarget.getRealMinVLen() >= VT.getSizeInBits() * NumElts) ? NumElts : 1;
+    MVT ViaVecVT = MVT::getVectorVT(ViaIntVT, ViaVecLen);
+
+    uint64_t EltMask = maskTrailingOnes<uint64_t>(EltBitSize);
+    uint64_t SplatValue = 0;
+    // Construct the amalgamated value at this larger vector type.
+    for (const auto &OpIdx : enumerate(Op->op_values())) {
+      const auto &SeqV = OpIdx.value();
+      if (!SeqV.isUndef())
+        SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask)
+                       << (OpIdx.index() * EltBitSize));
+    }
+
+    // On RV64, sign-extend from 32 to 64 bits where possible in order to
+    // achieve better constant materializion.
+    if (Subtarget.is64Bit() && ViaIntVT == MVT::i32)
+      SplatValue = SignExtend64<32>(SplatValue);
+
+    SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT,
+                              DAG.getUNDEF(ViaVecVT),
+                              DAG.getConstant(SplatValue, DL, XLenVT),
+                              DAG.getConstant(0, DL, XLenVT));
+    if (ViaVecLen != 1)
+      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
+                        MVT::getVectorVT(ViaIntVT, 1), Vec,
+                        DAG.getConstant(0, DL, XLenVT));
+    return DAG.getBitcast(VT, Vec);
+  }
+
   // Attempt to detect "hidden" splats, which only reveal themselves as splats
   // when re-interpreted as a vector with a larger element type. For example,
   // v4i16 = build_vector i16 0, i16 1, i16 0, i16 1
@@ -3283,7 +3325,6 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,
   // TODO: This optimization could also work on non-constant splats, but it
   // would require bit-manipulation instructions to construct the splat value.
   SmallVector<SDValue> Sequence;
-  unsigned EltBitSize = VT.getScalarSizeInBits();
   const auto *BV = cast<BuildVectorSDNode>(Op);
   if (VT.isInteger() && EltBitSize < 64 &&
       ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll

(+61, -32)
@@ -233,27 +233,49 @@ define <4 x i64> @buildvec_vid_step2_add0_v4i64() {
 %x6v4i8 = type {<4 x i8>, <4 x i8>, <4 x i8>, <4 x i8>, <4 x i8>, <4 x i8>}
 
 define %x6v4i8 @buildvec_no_vid_v4i8() {
-; CHECK-LABEL: buildvec_no_vid_v4i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI14_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI14_1)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI14_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    li a0, 2047
-; CHECK-NEXT:    vmv.v.x v11, a0
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    lui a0, %hi(.LCPI14_2)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI14_2)
-; CHECK-NEXT:    vle8.v v13, (a0)
-; CHECK-NEXT:    vmv.v.i v12, -2
-; CHECK-NEXT:    ret
+; RV32-LABEL: buildvec_no_vid_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, 28768
+; RV32-NEXT:    addi a0, a0, 769
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v8, a0
+; RV32-NEXT:    lui a0, 28752
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vmv.s.x v9, a0
+; RV32-NEXT:    lui a0, 32768
+; RV32-NEXT:    vmv.s.x v10, a0
+; RV32-NEXT:    lui a0, 28672
+; RV32-NEXT:    addi a0, a0, 255
+; RV32-NEXT:    vmv.s.x v11, a0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv.v.i v12, -2
+; RV32-NEXT:    lui a0, 1032144
+; RV32-NEXT:    addi a0, a0, -257
+; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v13, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: buildvec_no_vid_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a0, 28768
+; RV64-NEXT:    addiw a0, a0, 769
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    lui a0, 28752
+; RV64-NEXT:    addiw a0, a0, 512
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    lui a0, 32768
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    lui a0, 28672
+; RV64-NEXT:    addiw a0, a0, 255
+; RV64-NEXT:    vmv.s.x v11, a0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv.v.i v12, -2
+; RV64-NEXT:    lui a0, 1032144
+; RV64-NEXT:    addiw a0, a0, -257
+; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v13, a0
+; RV64-NEXT:    ret
   %1 = insertvalue %x6v4i8 poison, <4 x i8> <i8 1, i8 3, i8 6, i8 7>, 0
   %2 = insertvalue %x6v4i8 %1, <4 x i8> <i8 undef, i8 2, i8 5, i8 7>, 1
   %3 = insertvalue %x6v4i8 %2, <4 x i8> <i8 0, i8 undef, i8 undef, i8 8>, 2
@@ -662,22 +684,29 @@ define <8 x i16> @splat_idx_v8i16(<8 x i16> %v, i64 %idx) {
 define <4 x i8> @buildvec_not_vid_v4i8_1() {
 ; CHECK-LABEL: buildvec_not_vid_v4i8_1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI37_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI37_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    lui a0, 12320
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
   ret <4 x i8> <i8 0, i8 0, i8 2, i8 3>
 }
 
 define <4 x i8> @buildvec_not_vid_v4i8_2() {
-; CHECK-LABEL: buildvec_not_vid_v4i8_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI38_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI38_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    ret
+; RV32-LABEL: buildvec_not_vid_v4i8_2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, 16
+; RV32-NEXT:    addi a0, a0, 771
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v8, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: buildvec_not_vid_v4i8_2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a0, 16
+; RV64-NEXT:    addiw a0, a0, 771
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    ret
   ret <4 x i8> <i8 3, i8 3, i8 1, i8 0>
 }
 

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll

(+42, -18)
@@ -551,25 +551,49 @@ define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) {
 
 ; This shouldn't be interleaved
 define <4 x i8> @unary_interleave_v4i8_invalid(<4 x i8> %x) {
-; V128-LABEL: unary_interleave_v4i8_invalid:
-; V128:       # %bb.0:
-; V128-NEXT:    lui a0, %hi(.LCPI19_0)
-; V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
-; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; V128-NEXT:    vle8.v v10, (a0)
-; V128-NEXT:    vrgather.vv v9, v8, v10
-; V128-NEXT:    vmv1r.v v8, v9
-; V128-NEXT:    ret
+; RV32-V128-LABEL: unary_interleave_v4i8_invalid:
+; RV32-V128:       # %bb.0:
+; RV32-V128-NEXT:    lui a0, 16
+; RV32-V128-NEXT:    addi a0, a0, 768
+; RV32-V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-V128-NEXT:    vmv.s.x v10, a0
+; RV32-V128-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-V128-NEXT:    vrgather.vv v9, v8, v10
+; RV32-V128-NEXT:    vmv1r.v v8, v9
+; RV32-V128-NEXT:    ret
 ;
-; V512-LABEL: unary_interleave_v4i8_invalid:
-; V512:       # %bb.0:
-; V512-NEXT:    lui a0, %hi(.LCPI19_0)
-; V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
-; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
-; V512-NEXT:    vle8.v v10, (a0)
-; V512-NEXT:    vrgather.vv v9, v8, v10
-; V512-NEXT:    vmv1r.v v8, v9
-; V512-NEXT:    ret
+; RV64-V128-LABEL: unary_interleave_v4i8_invalid:
+; RV64-V128:       # %bb.0:
+; RV64-V128-NEXT:    lui a0, 16
+; RV64-V128-NEXT:    addiw a0, a0, 768
+; RV64-V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-V128-NEXT:    vmv.s.x v10, a0
+; RV64-V128-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-V128-NEXT:    vrgather.vv v9, v8, v10
+; RV64-V128-NEXT:    vmv1r.v v8, v9
+; RV64-V128-NEXT:    ret
+;
+; RV32-V512-LABEL: unary_interleave_v4i8_invalid:
+; RV32-V512:       # %bb.0:
+; RV32-V512-NEXT:    lui a0, 16
+; RV32-V512-NEXT:    addi a0, a0, 768
+; RV32-V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; RV32-V512-NEXT:    vmv.s.x v10, a0
+; RV32-V512-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32-V512-NEXT:    vrgather.vv v9, v8, v10
+; RV32-V512-NEXT:    vmv1r.v v8, v9
+; RV32-V512-NEXT:    ret
+;
+; RV64-V512-LABEL: unary_interleave_v4i8_invalid:
+; RV64-V512:       # %bb.0:
+; RV64-V512-NEXT:    lui a0, 16
+; RV64-V512-NEXT:    addiw a0, a0, 768
+; RV64-V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; RV64-V512-NEXT:    vmv.s.x v10, a0
+; RV64-V512-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
+; RV64-V512-NEXT:    vmv1r.v v8, v9
+; RV64-V512-NEXT:    ret
   %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 4>
   ret <4 x i8> %a
 }

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll

(+25, -13)
@@ -373,10 +373,10 @@ define <4 x i8> @vslide1up_4xi8_neg_undef_insert(<4 x i8> %v, i8 %b) {
 define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert(<4 x i8> %v, i8 %b) {
 ; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI23_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    lui a0, 8208
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v10, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
@@ -397,15 +397,27 @@ define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert2(<4 x i8> %v, i8 %b) {
 }
 
 define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert3(<4 x i8> %v, i8 %b) {
-; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI25_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI25_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
+; RV32-LABEL: vslide1up_4xi8_neg_incorrect_insert3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, 8208
+; RV32-NEXT:    addi a0, a0, 1
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v10, a0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vrgather.vv v9, v8, v10
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vslide1up_4xi8_neg_incorrect_insert3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a0, 8208
+; RV64-NEXT:    addiw a0, a0, 1
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vrgather.vv v9, v8, v10
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
   %v2 = shufflevector <4 x i8> poison, <4 x i8> %v, <4 x i32> <i32 5, i32 4, i32 5, i32 6>
   ret <4 x i8> %v2
 }
