diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 98c25bc93a8a2..2738505e9e192 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4563,6 +4563,50 @@ static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, int &EvenSrc, return ((EvenSrc % HalfNumElts) == 0) && ((OddSrc % HalfNumElts) == 0); } +/// Is this mask representing a masked combination of two slides? +static bool isMaskedSlidePair(ArrayRef<int> Mask, + std::pair<int, int> SrcInfo[2]) { + int NumElts = Mask.size(); + int SignalValue = NumElts * 2; + SrcInfo[0] = {-1, SignalValue}; + SrcInfo[1] = {-1, SignalValue}; + for (unsigned i = 0; i != Mask.size(); ++i) { + int M = Mask[i]; + if (M < 0) + continue; + int Src = M >= (int)NumElts; + int Diff = (int)i - (M % NumElts); + bool Match = false; + for (int j = 0; j < 2; j++) { + if (SrcInfo[j].first == -1) { + assert(SrcInfo[j].second == SignalValue); + SrcInfo[j].first = Src; + SrcInfo[j].second = Diff; + } + if (SrcInfo[j].first == Src && SrcInfo[j].second == Diff) { + Match = true; + break; + } + } + if (!Match) + return false; + } + + // Avoid matching unconditional slides for now. This is reasonably + // covered by existing matchers. + if (SrcInfo[0].first == -1 || SrcInfo[1].first == -1) + return false; + // Avoid matching vselect idioms + if (SrcInfo[0].second == 0 && SrcInfo[1].second == 0) + return false; + // Prefer vslideup as the second instruction, and identity + // only as the initial instruction. + if ((SrcInfo[0].second > 0 && SrcInfo[1].second < 0) || + SrcInfo[1].second == 0) + std::swap(SrcInfo[0], SrcInfo[1]); + return true; +} + /// Match shuffles that concatenate two vectors, rotate the concatenation, /// and then extract the original number of elements from the rotated result. /// This is equivalent to vector.splice or X86's PALIGNR instruction. The @@ -5651,6 +5695,75 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget); } + // Recognize a pattern which can be handled via a pair of vslideup/vslidedown + // instructions (in any combination) with masking on the second instruction. + // Avoid matching bit rotates as slide pairs. This is a performance + // heuristic, not a functional check. + // TODO: Generalize this slightly to allow single instruction cases, and + // prune the logic above which is mostly covered by this already. + std::pair<int, int> SrcInfo[2]; + unsigned RotateAmt; + MVT RotateVT; + if (isMaskedSlidePair(Mask, SrcInfo) && + !isLegalBitRotate(Mask, VT, Subtarget, RotateVT, RotateAmt)) { + SDValue Sources[2]; + auto GetSourceFor = [&](const std::pair<int, int> &Info) { + int SrcIdx = Info.first; + assert(SrcIdx == 0 || SrcIdx == 1); + SDValue &Src = Sources[SrcIdx]; + if (!Src) { + SDValue SrcV = SrcIdx == 0 ?
V1 : V2; + Src = convertToScalableVector(ContainerVT, SrcV, DAG, Subtarget); + } + return Src; + }; + auto GetSlide = [&](const std::pair<int, int> &Src, SDValue Mask, + SDValue Passthru) { + SDValue SrcV = GetSourceFor(Src); + int SlideAmt = Src.second; + if (SlideAmt == 0) { + // Should never be second operation + assert(Mask == TrueMask); + return SrcV; + } + if (SlideAmt < 0) + return getVSlidedown(DAG, Subtarget, DL, ContainerVT, Passthru, SrcV, + DAG.getConstant(-SlideAmt, DL, XLenVT), Mask, VL, + RISCVVType::TAIL_AGNOSTIC); + return getVSlideup(DAG, Subtarget, DL, ContainerVT, Passthru, SrcV, + DAG.getConstant(SlideAmt, DL, XLenVT), Mask, VL, + RISCVVType::TAIL_AGNOSTIC); + }; + + // Build the mask. Note that vslideup unconditionally preserves elements + // below the slide amount in the destination, and thus those elements are + // undefined in the mask. If the mask ends up all true (or undef), it + // will be folded away by general logic. + SmallVector<SDValue> MaskVals; + for (unsigned i = 0; i != Mask.size(); ++i) { + int M = Mask[i]; + if (M < 0 || (SrcInfo[1].second > 0 && i < (unsigned)SrcInfo[1].second)) { + MaskVals.push_back(DAG.getUNDEF(XLenVT)); + continue; + } + int Src = M >= (int)NumElts; + int Diff = (int)i - (M % NumElts); + bool C = Src == SrcInfo[1].first && Diff == SrcInfo[1].second; + assert(C ^ (Src == SrcInfo[0].first && Diff == SrcInfo[0].second) && + "Must match exactly one of the two slides"); + MaskVals.push_back(DAG.getConstant(C, DL, XLenVT)); + } + assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle"); + MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + SDValue SelectMask = convertToScalableVector( + ContainerVT.changeVectorElementType(MVT::i1), + DAG.getBuildVector(MaskVT, DL, MaskVals), DAG, Subtarget); + + SDValue Res = DAG.getUNDEF(ContainerVT); + Res = GetSlide(SrcInfo[0], TrueMask, Res); + Res = GetSlide(SrcInfo[1], SelectMask, Res); + return convertFromScalableVector(VT, Res, DAG, Subtarget); + } // Handle any remaining single source shuffles assert(!V1.isUndef() && "Unexpected shuffle canonicalization"); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index e82891f90d85e..9bd1da2e53dce 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -39,18 +39,18 @@ define void @buildvec_no_vid_v4f32(ptr %x) { define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize { ; CHECK-LABEL: hang_when_merging_stores_after_legalization: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmv.v.i v12, -14 -; CHECK-NEXT: vid.v v14 -; CHECK-NEXT: li a0, 7 -; CHECK-NEXT: vmadd.vx v14, a0, v12 -; CHECK-NEXT: li a0, 129 -; CHECK-NEXT: vmv.s.x v15, a0 -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vcompress.vm v12, v8, v15 -; CHECK-NEXT: vrgatherei16.vv v12, v10, v14, v0.t -; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vslidedown.vi v12, v10, 4 +; CHECK-NEXT: vslideup.vi v12, v10, 2, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vmv.v.i v10, 12 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v8, 6, v0.t +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 ;
CHECK-NEXT: ret %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> ret <4 x float> %z diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index ac78a252cf9cd..7817f010c4deb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -38,44 +38,27 @@ define <4 x float> @interleave_v2f32(<2 x float> %x, <2 x float> %y) { define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) { ; V128-LABEL: interleave_v2f64: ; V128: # %bb.0: -; V128-NEXT: csrr a0, vlenb -; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; V128-NEXT: vid.v v10 +; V128-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; V128-NEXT: vmv1r.v v10, v9 ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: srli a0, a0, 3 -; V128-NEXT: vsrl.vi v10, v10, 1 -; V128-NEXT: vslidedown.vx v11, v10, a0 -; V128-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; V128-NEXT: vrgatherei16.vv v13, v9, v11 -; V128-NEXT: vrgatherei16.vv v12, v9, v10 -; V128-NEXT: vrgatherei16.vv v15, v8, v11 -; V128-NEXT: vrgatherei16.vv v14, v8, v10 ; V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; V128-NEXT: vmerge.vvm v8, v14, v12, v0 +; V128-NEXT: vslideup.vi v12, v10, 1 +; V128-NEXT: vslideup.vi v12, v10, 2 +; V128-NEXT: vmv2r.v v10, v8 +; V128-NEXT: vslideup.vi v10, v8, 1 +; V128-NEXT: vmerge.vvm v8, v10, v12, v0 ; V128-NEXT: ret ; -; RV32-V512-LABEL: interleave_v2f64: -; RV32-V512: # %bb.0: -; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; RV32-V512-NEXT: vid.v v10 -; RV32-V512-NEXT: vsrl.vi v11, v10, 1 -; RV32-V512-NEXT: vmv.v.i v0, 10 -; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 -; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t -; RV32-V512-NEXT: vmv.v.v v8, v10 -; RV32-V512-NEXT: ret -; -; RV64-V512-LABEL: interleave_v2f64: -; RV64-V512: # %bb.0: -; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu -; RV64-V512-NEXT: vid.v v10 -; RV64-V512-NEXT: vsrl.vi v11, v10, 1 -; RV64-V512-NEXT: vmv.v.i v0, 10 -; RV64-V512-NEXT: vrgather.vv v10, v8, v11 -; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t -; RV64-V512-NEXT: vmv.v.v v8, v10 -; RV64-V512-NEXT: ret +; V512-LABEL: interleave_v2f64: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; V512-NEXT: vslideup.vi v10, v9, 1 +; V512-NEXT: vmv1r.v v11, v8 +; V512-NEXT: vslideup.vi v10, v9, 2 +; V512-NEXT: vmv.v.i v0, 10 +; V512-NEXT: vslideup.vi v11, v8, 1 +; V512-NEXT: vmerge.vvm v8, v11, v10, v0 +; V512-NEXT: ret %a = shufflevector <2 x double> %x, <2 x double> %y, <4 x i32> ret <4 x double> %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll index 41d8abb9b73eb..a749736097331 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -69,14 +69,9 @@ define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) { define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_permute_shuffle_vu_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 4096 -; CHECK-NEXT: addi a0, a0, 513 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; 
CHECK-NEXT: vslidedown.vi v10, v8, 1 +; CHECK-NEXT: vslideup.vi v10, v8, 2 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> @@ -86,14 +81,9 @@ define <4 x double> @vrgather_permute_shuffle_vu_v4f64(<4 x double> %x) { define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_permute_shuffle_uv_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 4096 -; CHECK-NEXT: addi a0, a0, 513 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 1 +; CHECK-NEXT: vslideup.vi v10, v8, 2 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %s = shufflevector <4 x double> poison, <4 x double> %x, <4 x i32> @@ -103,13 +93,12 @@ define <4 x double> @vrgather_permute_shuffle_uv_v4f64(<4 x double> %x) { define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) { ; CHECK-LABEL: vrgather_shuffle_vv_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI7_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v14, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 1 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 8 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v8, v14 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vslideup.vi v12, v8, 2 ; CHECK-NEXT: vrgather.vi v12, v10, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret @@ -120,16 +109,18 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_shuffle_xv_v4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 8 ; CHECK-NEXT: lui a0, %hi(.LCPI8_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI8_0)(a0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrsub.vi v12, v10, 4 +; CHECK-NEXT: vmv2r.v v10, v8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vslideup.vi v10, v8, 2, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfmv.v.f v10, fa5 -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v8, fa5 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x double> , <4 x double> %x, <4 x i32> ret <4 x double> %s @@ -138,17 +129,17 @@ define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) { define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) { ; CHECK-LABEL: vrgather_shuffle_vx_v4f64: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v10, 9 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vcompress.vm v12, v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vslidedown.vi v8, v8, 2, v0.t ; 
CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 3 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmv.v.f v8, fa5 -; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: vfmv.v.f v10, fa5 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 x double> %s @@ -311,13 +302,9 @@ define <8 x double> @splice_binary2(<8 x double> %x, <8 x double> %y) { define <4 x bfloat> @vrgather_permute_shuffle_vu_v4bf16(<4 x bfloat> %x) { ; CHECK-LABEL: vrgather_permute_shuffle_vu_v4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 4096 -; CHECK-NEXT: addi a0, a0, 513 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v10, v9 -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 2 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %s = shufflevector <4 x bfloat> %x, <4 x bfloat> poison, <4 x i32> @@ -327,12 +314,10 @@ define <4 x bfloat> @vrgather_permute_shuffle_vu_v4bf16(<4 x bfloat> %x) { define <4 x bfloat> @vrgather_shuffle_vv_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; CHECK-LABEL: vrgather_shuffle_vv_v4bf16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI25_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI25_0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v11, (a0) +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 8 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vslideup.vi v10, v8, 2 ; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -355,13 +340,9 @@ define <4 x bfloat> @vrgather_shuffle_vx_v4bf16_load(ptr %p) { define <4 x half> @vrgather_permute_shuffle_vu_v4f16(<4 x half> %x) { ; CHECK-LABEL: vrgather_permute_shuffle_vu_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 4096 -; CHECK-NEXT: addi a0, a0, 513 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v10, v9 -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 2 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %s = shufflevector <4 x half> %x, <4 x half> poison, <4 x i32> @@ -371,12 +352,10 @@ define <4 x half> @vrgather_permute_shuffle_vu_v4f16(<4 x half> %x) { define <4 x half> @vrgather_shuffle_vv_v4f16(<4 x half> %x, <4 x half> %y) { ; CHECK-LABEL: vrgather_shuffle_vv_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI28_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI28_0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v11, (a0) +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 8 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vslideup.vi v10, v8, 2 ; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index beaf75d5b0cfa..4911c340c9154 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -51,44 +51,27 @@ define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) { define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> 
%y) { ; V128-LABEL: interleave_v2i64: ; V128: # %bb.0: -; V128-NEXT: csrr a0, vlenb -; V128-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; V128-NEXT: vid.v v10 +; V128-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; V128-NEXT: vmv1r.v v10, v9 ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: srli a0, a0, 3 -; V128-NEXT: vsrl.vi v10, v10, 1 -; V128-NEXT: vslidedown.vx v11, v10, a0 -; V128-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; V128-NEXT: vrgatherei16.vv v13, v9, v11 -; V128-NEXT: vrgatherei16.vv v12, v9, v10 -; V128-NEXT: vrgatherei16.vv v15, v8, v11 -; V128-NEXT: vrgatherei16.vv v14, v8, v10 ; V128-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; V128-NEXT: vmerge.vvm v8, v14, v12, v0 +; V128-NEXT: vslideup.vi v12, v10, 1 +; V128-NEXT: vslideup.vi v12, v10, 2 +; V128-NEXT: vmv2r.v v10, v8 +; V128-NEXT: vslideup.vi v10, v8, 1 +; V128-NEXT: vmerge.vvm v8, v10, v12, v0 ; V128-NEXT: ret ; -; RV32-V512-LABEL: interleave_v2i64: -; RV32-V512: # %bb.0: -; RV32-V512-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; RV32-V512-NEXT: vid.v v10 -; RV32-V512-NEXT: vsrl.vi v11, v10, 1 -; RV32-V512-NEXT: vmv.v.i v0, 10 -; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11 -; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t -; RV32-V512-NEXT: vmv.v.v v8, v10 -; RV32-V512-NEXT: ret -; -; RV64-V512-LABEL: interleave_v2i64: -; RV64-V512: # %bb.0: -; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu -; RV64-V512-NEXT: vid.v v10 -; RV64-V512-NEXT: vsrl.vi v11, v10, 1 -; RV64-V512-NEXT: vmv.v.i v0, 10 -; RV64-V512-NEXT: vrgather.vv v10, v8, v11 -; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t -; RV64-V512-NEXT: vmv.v.v v8, v10 -; RV64-V512-NEXT: ret +; V512-LABEL: interleave_v2i64: +; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; V512-NEXT: vslideup.vi v10, v9, 1 +; V512-NEXT: vmv1r.v v11, v8 +; V512-NEXT: vslideup.vi v10, v9, 2 +; V512-NEXT: vmv.v.i v0, 10 +; V512-NEXT: vslideup.vi v11, v8, 1 +; V512-NEXT: vmerge.vvm v8, v11, v10, v0 +; V512-NEXT: ret %a = shufflevector <2 x i64> %x, <2 x i64> %y, <4 x i32> ret <4 x i64> %a } @@ -191,30 +174,28 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) { ; V128-LABEL: interleave_v4i32_offset_1: ; V128: # %bb.0: -; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; V128-NEXT: vid.v v10 +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; V128-NEXT: vmv.v.i v0, 8 +; V128-NEXT: vmv1r.v v10, v9 +; V128-NEXT: vslideup.vi v10, v9, 1, v0.t ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vsrl.vi v10, v10, 1 -; V128-NEXT: vadd.vi v11, v10, 1 ; V128-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; V128-NEXT: vzext.vf2 v10, v8 -; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; V128-NEXT: vrgather.vv v10, v9, v11, v0.t -; V128-NEXT: vmv.v.v v8, v10 +; V128-NEXT: vzext.vf2 v9, v8 +; V128-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; V128-NEXT: vmerge.vvm v8, v9, v10, v0 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i32_offset_1: ; V512: # %bb.0: -; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; V512-NEXT: vid.v v10 +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu +; V512-NEXT: vmv.v.i v0, 8 +; V512-NEXT: vmv1r.v v10, v9 +; V512-NEXT: vslideup.vi v10, v9, 1, v0.t ; V512-NEXT: vmv.v.i v0, 10 -; V512-NEXT: vsrl.vi v10, v10, 1 -; V512-NEXT: vadd.vi v11, v10, 1 ; V512-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; V512-NEXT: vzext.vf2 v10, v8 -; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu -; V512-NEXT: vrgather.vv v10, v9, v11, v0.t -; V512-NEXT: vmv1r.v v8, v10 
+; V512-NEXT: vzext.vf2 v9, v8 +; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma +; V512-NEXT: vmerge.vvm v8, v9, v10, v0 ; V512-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> ret <4 x i32> %a diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index 3e31c9de61657..f307ebb422c6c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -50,13 +50,9 @@ define <4 x i16> @shuffle_vx_v4i16(<4 x i16> %x) { define <4 x i16> @vrgather_permute_shuffle_vu_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_permute_shuffle_vu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 4096 -; CHECK-NEXT: addi a0, a0, 513 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v10, v9 -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 2 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> @@ -66,13 +62,9 @@ define <4 x i16> @vrgather_permute_shuffle_vu_v4i16(<4 x i16> %x) { define <4 x i16> @vrgather_permute_shuffle_uv_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_permute_shuffle_uv_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 4096 -; CHECK-NEXT: addi a0, a0, 513 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v10, v9 -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 2 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> poison, <4 x i16> %x, <4 x i32> @@ -82,12 +74,10 @@ define <4 x i16> @vrgather_permute_shuffle_uv_v4i16(<4 x i16> %x) { define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) { ; CHECK-LABEL: vrgather_shuffle_vv_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI6_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vle16.v v11, (a0) +; CHECK-NEXT: vslidedown.vi v10, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 8 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vslideup.vi v10, v8, 2 ; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret @@ -99,12 +89,12 @@ define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_shuffle_xv_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vmv.v.i v0, 8 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vslideup.vi v9, v8, 2, v0.t ; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vrsub.vi v10, v9, 4 -; CHECK-NEXT: vmv.v.i v9, 5 -; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vmv.v.i v8, 5 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> , <4 x i16> %x, <4 x i32> ret <4 x i16> %s @@ -113,12 +103,13 @@ define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) { define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) { ; CHECK-LABEL: vrgather_shuffle_vx_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v9, 9 -; CHECK-NEXT: vmv.v.i v0, 3 -; CHECK-NEXT: vcompress.vm v10, v8, v9 -; 
CHECK-NEXT: vmv.v.i v8, 5 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vmv.v.i v9, 3 +; CHECK-NEXT: vslidedown.vi v8, v8, 2, v0.t +; CHECK-NEXT: vmv.v.i v10, 5 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> ret <4 x i16> %s @@ -603,8 +594,8 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -754,11 +745,10 @@ define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) { define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_compress_singlesrc_e32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v12, 13 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vcompress.vm v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: li a0, 28 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vslidedown.vi v8, v8, 2, v0.t ; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out @@ -856,16 +846,10 @@ define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) { define <8 x i32> @shuffle_spread4_singlesrc_e32(<8 x i32> %v) { ; CHECK-LABEL: shuffle_spread4_singlesrc_e32: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vid.v v9 -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: vsrl.vi v9, v9, 2 -; CHECK-NEXT: vslidedown.vx v10, v9, a0 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v11, v8, v10 -; CHECK-NEXT: vrgatherei16.vv v10, v8, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv2r.v v10, v8 +; CHECK-NEXT: vslideup.vi v10, v8, 3 +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %out @@ -959,11 +943,9 @@ define <8 x i32> @shuffle_decompress_singlesrc_e32(<8 x i32> %v) { define <8 x i8> @shuffle_decompress_singlesrc_e8(<8 x i8> %v) { ; CHECK-LABEL: shuffle_decompress_singlesrc_e8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI66_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI66_0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vslideup.vi v9, v8, 3 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %out = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -1367,13 +1349,12 @@ define void @shuffle_i256_splat(ptr %p) nounwind { define <16 x i32> @shuffle_m1_prefix(<16 x i32> %a) { ; CHECK-LABEL: shuffle_m1_prefix: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8208 -; CHECK-NEXT: addi a0, a0, 770 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsext.vf4 v10, v9 -; CHECK-NEXT: vrgather.vv v12, v8, v10 -; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 16, e32, 
m4, ta, mu +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vslideup.vi v12, v8, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %out = shufflevector <16 x i32> %a, <16 x i32> poison, <16 x i32> ret <16 x i32> %out diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 1516c67bf7ecc..4200837227899 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -187,175 +187,220 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 ; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb -; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: addi a4, a1, 128 +; RV32-NEXT: addi a5, a1, 256 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: lui a5, 12291 -; RV32-NEXT: lui a6, %hi(.LCPI8_0) -; RV32-NEXT: addi a6, a6, %lo(.LCPI8_0) -; RV32-NEXT: li a7, 768 -; RV32-NEXT: lui t0, 49164 +; RV32-NEXT: lui a3, 12 +; RV32-NEXT: lui a6, 12291 +; RV32-NEXT: lui a7, %hi(.LCPI8_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI8_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v16, (a5) +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: li t0, 48 +; RV32-NEXT: mul a5, a5, t0 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v3, a3 +; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li t1, 72 -; RV32-NEXT: mul a1, a1, t1 +; RV32-NEXT: li a5, 72 +; RV32-NEXT: mul a1, a1, a5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a4) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a4, 80 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a5, a5, 3 +; RV32-NEXT: addi a6, a6, 3 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v6, (a6) -; RV32-NEXT: vmv.s.x v0, a5 -; RV32-NEXT: lui a1, %hi(.LCPI8_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v8, v16, v0 -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v16, v6 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 48 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a3) -; RV32-NEXT: addi t0, t0, 12 -; RV32-NEXT: vmv.s.x v0, a7 -; RV32-NEXT: vmv.s.x v7, t0 -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v4, (a1) +; RV32-NEXT: vle16.v v6, (a7) +; RV32-NEXT: vmv.s.x v2, a6 +; RV32-NEXT: vslideup.vi v8, v16, 4 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v8, v24, 10, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: 
vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a4, 56 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v20, v24, v16, v0 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a4, 72 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a4, 80 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v8, v16, v0 +; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v24, v4 +; RV32-NEXT: vrgatherei16.vv v8, v16, v6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a4, 40 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: li a1, 3 -; RV32-NEXT: lui a3, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI8_2) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_2) -; RV32-NEXT: slli a1, a1, 10 -; RV32-NEXT: addi a3, a3, 48 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 48 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v8, v16, 2 +; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vslideup.vi v8, v24, 8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 60 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, 49164 +; RV32-NEXT: lui a4, %hi(.LCPI8_1) +; RV32-NEXT: addi a4, a4, %lo(.LCPI8_1) +; RV32-NEXT: lui a5, 196656 +; RV32-NEXT: lui a6, %hi(.LCPI8_2) +; RV32-NEXT: addi a6, a6, %lo(.LCPI8_2) +; RV32-NEXT: addi a1, a1, 12 +; RV32-NEXT: addi a5, a5, 48 ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vle16.v v14, (a4) -; RV32-NEXT: vmv.s.x v12, a3 +; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v8, v24 +; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv.s.x v8, a5 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a4, 12 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a6) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: 
li a3, 24 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: li a4, 20 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a4, 80 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 72 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v24, v16, v0 +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v24, v14 +; RV32-NEXT: vrgatherei16.vv v0, v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 28 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 12 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 80 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 20 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vrgatherei16.vv v16, v8, v24 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, 3 -; RV32-NEXT: lui a3, 786624 -; RV32-NEXT: lui a4, 12 +; RV32-NEXT: lui a4, 786624 ; RV32-NEXT: lui a5, 768 ; RV32-NEXT: li a6, 48 ; RV32-NEXT: lui a7, 3073 -; RV32-NEXT: li t0, 192 ; RV32-NEXT: addi a1, a1, 3 -; RV32-NEXT: addi a3, a3, 192 -; RV32-NEXT: addi a4, a4, 12 +; RV32-NEXT: addi a4, a4, 192 +; RV32-NEXT: addi a3, a3, 12 ; RV32-NEXT: addi a5, a5, 768 ; RV32-NEXT: addi a7, a7, -1024 -; RV32-NEXT: vmv.s.x v13, a6 -; RV32-NEXT: vmv.s.x v2, t0 +; RV32-NEXT: vmv.s.x v8, a6 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li t0, 20 +; RV32-NEXT: mul a6, a6, t0 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 16 +; RV32-NEXT: vs1r.v v8, (a6) # Unknown-size Folded Spill ; RV32-NEXT: vmv.s.x v0, a1 -; RV32-NEXT: vmv.s.x v12, a3 -; RV32-NEXT: vmv.s.x v3, a4 -; RV32-NEXT: vmv.s.x v14, a5 -; RV32-NEXT: vmv.s.x v1, a7 +; RV32-NEXT: vmv.s.x v16, a4 +; RV32-NEXT: vmv.s.x v3, a3 +; RV32-NEXT: vmv.s.x v1, a5 +; RV32-NEXT: vmv.s.x v2, a7 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, 
sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v8, v16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v20, v8, v16, v0 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vmerge.vvm v4, v8, v24, v0 +; RV32-NEXT: vmv1r.v v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 80 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload @@ -364,45 +409,56 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 +; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmerge.vvm v12, v8, v24, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 80 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v4, v8, v24, v0 -; RV32-NEXT: vmv1r.v v0, v14 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 72 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v24, v16, v24, v0 +; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a3, 20 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v13 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 80 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded 
Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmerge.vvm v12, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb @@ -411,102 +467,88 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 80 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 72 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 72 +; RV32-NEXT: li a2, 80 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: lui a1, %hi(.LCPI8_3) +; RV32-NEXT: addi a1, a1, %lo(.LCPI8_3) +; RV32-NEXT: li a2, 192 +; RV32-NEXT: vmv.s.x v0, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vle16.v v3, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a2, 72 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, 32 -; RV32-NEXT: addi a1, a1, 4 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 36 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v12 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 +; RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, 48 -; RV32-NEXT: lui a2, %hi(.LCPI8_3) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_3) -; RV32-NEXT: addi a1, a1, 5 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v24, (a2) -; RV32-NEXT: vsetivli zero, 4, e32, 
m1, ta, ma -; RV32-NEXT: vmv.v.x v25, a1 +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 24 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v25 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v8, v24 +; RV32-NEXT: vrgatherei16.vv v16, v4, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 28 +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -514,8 +556,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 6 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill @@ -524,118 +565,96 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a2, %hi(.LCPI8_5) ; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v26, (a1) +; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v24, (a2) +; RV32-NEXT: vle16.v v28, (a2) ; RV32-NEXT: lui a1, %hi(.LCPI8_6) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle16.v v2, (a1) +; RV32-NEXT: vle16.v v30, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 12 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v26 -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v20, v4, v24 -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vrgatherei16.vv v24, v12, v28 +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: csrr a1, vlenb -; 
RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v0, v8, v30 ; RV32-NEXT: lui a1, %hi(.LCPI8_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) ; RV32-NEXT: lui a2, %hi(.LCPI8_8) ; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI8_9) ; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v16, (a2) +; RV32-NEXT: vle16.v v10, (a2) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vle16.v v18, (a1) +; RV32-NEXT: vle16.v v9, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 20 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v0, v12 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v12, v28, v8 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v0 +; RV32-NEXT: vmv.v.v v12, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 72 +; RV32-NEXT: li a2, 80 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: vrgatherei16.vv v16, v0, v10 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 72 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v8, v4, v18 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v28, v4, v9 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v0 +; RV32-NEXT: vmv.v.v v28, v16 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v24, (a1) +; RV32-NEXT: vse32.v v12, (a1) ; RV32-NEXT: addi a1, a0, 192 -; RV32-NEXT: vse32.v v20, (a1) +; RV32-NEXT: vse32.v v24, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 40 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a2, a2, 6 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 80 +; 
RV32-NEXT: li a2, 56 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -655,366 +674,351 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 88 +; RV64-NEXT: li a3, 93 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 88 * vlenb -; RV64-NEXT: addi a3, a1, 128 -; RV64-NEXT: addi a6, a1, 256 -; RV64-NEXT: li a4, 128 -; RV64-NEXT: lui a2, 1 -; RV64-NEXT: lui a5, %hi(.LCPI8_0) -; RV64-NEXT: addi a5, a5, %lo(.LCPI8_0) -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.i v16, 6 +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xdd, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 93 * vlenb ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v8, (a6) -; RV64-NEXT: lui a6, 16 -; RV64-NEXT: addi a6, a6, 7 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v17, a6 -; RV64-NEXT: addi a6, a2, 65 +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 85 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: addi a2, a1, 128 +; RV64-NEXT: addi a3, a1, 256 +; RV64-NEXT: li a4, 128 +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: vle64.v v8, (a2) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a5, 77 +; RV64-NEXT: mul a2, a2, a5 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: lui a2, %hi(.LCPI8_0) +; RV64-NEXT: addi a2, a2, %lo(.LCPI8_0) +; RV64-NEXT: vle64.v v8, (a3) +; RV64-NEXT: vmv.s.x v0, a4 +; RV64-NEXT: addi a3, a1, 65 +; RV64-NEXT: vle16.v v16, (a2) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a4, 53 +; RV64-NEXT: mul a2, a2, a4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs2r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v6, a3 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vrgather.vi v4, v8, 4 -; RV64-NEXT: vrgather.vi v20, v8, 5 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 84 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vrgatherei16.vv v20, v8, v16 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: slli a7, a7, 6 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vrgatherei16.vv v20, v8, v17 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 56 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v20, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vrgather.vi v16, v8, 2 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 72 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vrgather.vi v16, v8, 3 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 68 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs4r.v v16, (a7) # Unknown-size Folded Spill +; RV64-NEXT: vslideup.vi v20, v8, 2 ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; 
RV64-NEXT: vslidedown.vi v8, v8, 8 -; RV64-NEXT: csrr a7, vlenb -; RV64-NEXT: li t0, 40 -; RV64-NEXT: mul a7, a7, t0 -; RV64-NEXT: add a7, sp, a7 -; RV64-NEXT: addi a7, a7, 16 -; RV64-NEXT: vs8r.v v8, (a7) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v0, a4 -; RV64-NEXT: csrr a4, vlenb -; RV64-NEXT: slli a4, a4, 5 -; RV64-NEXT: add a4, sp, a4 -; RV64-NEXT: addi a4, a4, 16 -; RV64-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle64.v v24, (a1) -; RV64-NEXT: vle64.v v16, (a3) -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vle16.v v12, (a5) -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v12, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v2, a6 +; RV64-NEXT: vslidedown.vi v24, v8, 8 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 69 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v12, v0 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v4, v8, 2, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 60 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v2 -; RV64-NEXT: vmv8r.v v8, v24 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 48 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vslideup.vi v20, v24, 5, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 6 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 85 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 77 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vmerge.vvm v24, v16, v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v0, v24, v16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 24 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: lui a1, 2 -; RV64-NEXT: lui a3, %hi(.LCPI8_1) -; RV64-NEXT: addi a3, a3, %lo(.LCPI8_1) -; RV64-NEXT: addi a1, a1, 130 -; RV64-NEXT: vle16.v v16, (a3) -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 -; RV64-NEXT: add a3, sp, a3 -; RV64-NEXT: addi a3, a3, 16 -; RV64-NEXT: vs2r.v v16, (a3) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v2, a1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: 
csrr a1, vlenb -; RV64-NEXT: li a3, 84 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 40 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 53 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl2r.v v14, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v0, v24, v14 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 41 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv4r.v v24, v8 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 57 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v24, v16, 3, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 84 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v2 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vslideup.vi v8, v24, 1 +; RV64-NEXT: vmv1r.v v1, v12 +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 69 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vslideup.vi v8, v16, 4, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 49 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: lui a2, 2 +; RV64-NEXT: lui a3, 4 +; RV64-NEXT: li a4, 32 +; RV64-NEXT: addi a2, a2, 130 +; RV64-NEXT: addi a3, a3, 260 +; RV64-NEXT: vmv.s.x v2, a4 +; RV64-NEXT: vmv.s.x v0, a2 +; RV64-NEXT: vmv.s.x v3, a3 +; RV64-NEXT: vmv4r.v v4, v24 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 85 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 77 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vmerge.vvm v24, v16, v8, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v0, v24, v8 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 4 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: lui a1, 4 -; RV64-NEXT: lui a3, 8 -; RV64-NEXT: addi a1, a1, 260 -; RV64-NEXT: addi a3, a3, 520 -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: vmv.s.x v2, a3 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 48 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi 
a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl1r.v v7, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 40 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv4r.v v8, v16 +; RV64-NEXT: vmerge.vvm v16, v24, v8, v0 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v3 +; RV64-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v2 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 57 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v16, 4, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vslideup.vi v4, v8, 5, v0.t +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 69 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v4, v24, 4, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 37 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v4, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vslidedown.vi v20, v8, 1 ; RV64-NEXT: vmv1r.v v0, v2 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vslideup.vi v20, v8, 4, v0.t +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vrgather.vi v20, v24, 5, v0.t +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 53 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v20, (a2) # Unknown-size Folded Spill +; RV64-NEXT: lui a2, 8 +; RV64-NEXT: addi a2, a2, 520 +; RV64-NEXT: vmv.s.x v0, a2 +; RV64-NEXT: vslideup.vi v8, v24, 6 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 77 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 85 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; 
RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v7 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 56 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v24, v8, 5, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 56 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: lui a1, 96 -; RV64-NEXT: li a3, 192 -; RV64-NEXT: vmv.s.x v3, a3 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v24, a1 -; RV64-NEXT: vmv1r.v v0, v3 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 72 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 29 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 69 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v28, v8, v24, v0.t -; RV64-NEXT: vmv4r.v v16, v8 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 72 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI8_2) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_2) -; RV64-NEXT: li a3, 1040 -; RV64-NEXT: lui a4, 112 -; RV64-NEXT: addi a4, a4, 1 -; RV64-NEXT: vmv.s.x v0, a3 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v5, a4 +; RV64-NEXT: vslideup.vi v8, v16, 1, v0.t +; RV64-NEXT: lui a2, %hi(.LCPI8_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI8_1) +; RV64-NEXT: lui a3, %hi(.LCPI8_2) +; RV64-NEXT: addi a3, a3, %lo(.LCPI8_2) +; RV64-NEXT: li a4, 192 +; RV64-NEXT: vmv.s.x v0, a4 +; RV64-NEXT: csrr a4, vlenb +; RV64-NEXT: li a5, 28 +; RV64-NEXT: mul a4, a4, a5 +; RV64-NEXT: add a4, sp, a4 +; RV64-NEXT: addi a4, a4, 16 +; RV64-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vle16.v v28, (a2) +; RV64-NEXT: vle16.v v30, (a3) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 57 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vrgather.vi v24, v16, 2 +; RV64-NEXT: vmerge.vvm v8, v24, v8, v0 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 24 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle16.v v6, (a1) +; RV64-NEXT: vrgatherei16.vv v8, v16, v28 +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: csrr a2, vlenb +; 
RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v16, v8, v30 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 4 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: li a2, 1040 +; RV64-NEXT: li a3, 64 +; RV64-NEXT: addi a1, a1, -2016 +; RV64-NEXT: vmv.s.x v0, a2 +; RV64-NEXT: vmv.s.x v2, a3 +; RV64-NEXT: vmv.s.x v1, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 76 -; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: li a2, 77 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 48 -; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: li a2, 85 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmerge.vvm v24, v8, v24, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v3 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 68 -; RV64-NEXT: mul a1, a1, a3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v28, v16, v5, v0.t +; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 68 -; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill -; RV64-NEXT: addi a1, a2, -2016 -; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v16, v24, v6 +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 40 +; RV64-NEXT: li a2, 85 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 48 +; RV64-NEXT: li a2, 69 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv4r.v v20, v8 +; RV64-NEXT: vmv1r.v v0, v2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vslideup.vi v20, v8, 5, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 76 +; RV64-NEXT: li a2, 41 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmerge.vvm v8, v16, v8, v0 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 76 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI8_3) -; RV64-NEXT: 
addi a1, a1, %lo(.LCPI8_3) -; RV64-NEXT: vle16.v v8, (a1) +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v8, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 60 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 24 +; RV64-NEXT: li a2, 49 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v0 +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v12, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 60 +; RV64-NEXT: li a2, 49 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: lui a1, %hi(.LCPI8_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3) +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vle16.v v18, (a1) +; RV64-NEXT: lui a1, %hi(.LCPI8_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) +; RV64-NEXT: vle16.v v16, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 84 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -1024,114 +1028,117 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v12, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 84 +; RV64-NEXT: li a2, 29 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vrgatherei16.vv v24, v0, v18 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v28, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 40 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v16, v24 +; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: slli a1, a1, 3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v0, v24, v8 -; RV64-NEXT: lui a1, %hi(.LCPI8_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV64-NEXT: vle16.v v8, (a1) +; RV64-NEXT: vrgatherei16.vv v24, v0, v16 ; RV64-NEXT: lui a1, %hi(.LCPI8_5) ; RV64-NEXT: 
addi a1, a1, %lo(.LCPI8_5) -; RV64-NEXT: vle16.v v10, (a1) +; RV64-NEXT: vle16.v v16, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 +; RV64-NEXT: li a2, 77 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v10, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs2r.v v16, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 56 +; RV64-NEXT: li a2, 24 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v12, v0 +; RV64-NEXT: vmv.v.v v16, v24 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v0, v8 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vrgather.vi v24, v0, 3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 72 +; RV64-NEXT: li a2, 28 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v8, v24 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v20, v24, v20, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 76 +; RV64-NEXT: li a2, 85 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 6 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v20, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v0, v20 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 68 +; RV64-NEXT: li a2, 77 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vrgatherei16.vv v24, v0, v8 ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v28, v24 +; RV64-NEXT: vmv.v.v v20, v24 ; RV64-NEXT: addi a1, a0, 256 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: vse64.v v16, (a1) ; RV64-NEXT: addi a1, a0, 320 -; RV64-NEXT: vse64.v v28, (a1) +; RV64-NEXT: vse64.v v20, (a1) ; RV64-NEXT: addi a1, a0, 192 -; RV64-NEXT: vse64.v v12, (a1) -; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 53 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v16, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: addi a1, a0, 64 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 84 +; RV64-NEXT: li a3, 49 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload -; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: vl4r.v v12, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: csrr a1, 
vlenb -; RV64-NEXT: li a2, 60 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 88 +; RV64-NEXT: li a1, 93 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll index e4b6e5c47fd98..acb1802181540 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll @@ -78,12 +78,11 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) { define <4 x i32> @v4i32_v8i32(<8 x i32>) { ; CHECK-LABEL: v4i32_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 8 +; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vslideup.vi v10, v8, 1, v0.t ; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vsrl.vi v10, v10, 1 -; CHECK-NEXT: vrsub.vi v11, v10, 3 -; CHECK-NEXT: vrgather.vv v10, v8, v11 ; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 4 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu @@ -95,43 +94,21 @@ define <4 x i32> @v4i32_v8i32(<8 x i32>) { } define <4 x i32> @v4i32_v16i32(<16 x i32>) { -; RV32-LABEL: v4i32_v16i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 1 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vmv.v.i v14, 6 -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vmv.v.i v0, 10 -; RV32-NEXT: vsetivli zero, 2, e16, m1, tu, ma -; RV32-NEXT: vslideup.vi v14, v12, 1 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v12, v8, a0 -; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 8 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v8, v14, v0.t -; RV32-NEXT: vmv1r.v v8, v12 -; RV32-NEXT: ret -; -; RV64-LABEL: v4i32_v16i32: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmv.v.i v0, 10 -; RV64-NEXT: vnsrl.wx v12, v8, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 8 -; RV64-NEXT: li a0, 3 -; RV64-NEXT: slli a0, a0, 33 -; RV64-NEXT: addi a0, a0, 1 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vrgatherei16.vv v12, v8, v10, v0.t -; RV64-NEXT: vmv1r.v v8, v12 -; RV64-NEXT: ret +; CHECK-LABEL: v4i32_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 8 +; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 8 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vslidedown.vi v12, v12, 3, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 +; CHECK-NEXT: ret %2 = shufflevector <16 x i32> %0, <16 x i32> poison, <4 x i32> ret <4 x i32> %2 } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll index ad18c801069f4..59ddc021f4999 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll @@ -98,20 +98,17 @@ define void @deinterleave5_0_i8(ptr %in, ptr %out) { ; CHECK-LABEL: deinterleave5_0_i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: li a0, 33 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: lui a0, 28704 -; CHECK-NEXT: addi a0, a0, 1280 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 8 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vrgather.vv v10, v8, v9 -; CHECK-NEXT: vse8.v v10, (a1) +; CHECK-NEXT: vslidedown.vi v10, v9, 8 +; CHECK-NEXT: vmv.v.i v8, 10 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vmerge.vvm v9, v10, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vslidedown.vi v9, v9, 4, v0.t +; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -125,16 +122,16 @@ define void @deinterleave5_8_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: li a0, 66 +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslidedown.vi v9, v8, 5, v0.t ; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vcompress.vm v10, v8, v9 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v10, v8, 3, v0.t -; CHECK-NEXT: vse8.v v10, (a1) +; CHECK-NEXT: vrgather.vi v9, v8, 3, v0.t +; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -148,15 +145,15 @@ define void @deinterleave6_0_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: li a0, 65 -; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vcompress.vm v10, v8, v9 +; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: vslidedown.vi v9, v8, 8 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v10, v8, 4, v0.t +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vrgather.vi v10, v9, 4, v0.t ; CHECK-NEXT: vse8.v v10, (a1) ; CHECK-NEXT: ret entry: @@ -171,16 +168,16 @@ define void @deinterleave6_8_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: li a0, 130 +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslidedown.vi v9, v8, 6, v0.t ; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vcompress.vm v10, v8, v9 ; CHECK-NEXT: vsetivli 
zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v10, v8, 5, v0.t -; CHECK-NEXT: vse8.v v10, (a1) +; CHECK-NEXT: vrgather.vi v9, v8, 5, v0.t +; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -194,15 +191,15 @@ define void @deinterleave7_0_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: li a0, 129 -; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vmv.s.x v9, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vcompress.vm v10, v8, v9 +; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: vslidedown.vi v9, v8, 8 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v10, v8, 6, v0.t +; CHECK-NEXT: vslidedown.vi v10, v10, 6, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vrgather.vi v10, v9, 6, v0.t ; CHECK-NEXT: vse8.v v10, (a1) ; CHECK-NEXT: ret entry: @@ -217,18 +214,16 @@ define void @deinterleave7_8_i8(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v9, -6 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: li a0, 6 -; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vmadd.vx v10, a0, v9 +; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v9, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrgather.vi v11, v8, 1 -; CHECK-NEXT: vrgather.vv v11, v9, v10, v0.t -; CHECK-NEXT: vse8.v v11, (a1) +; CHECK-NEXT: vslidedown.vi v10, v9, 4 +; CHECK-NEXT: vslideup.vi v10, v9, 1, v0.t +; CHECK-NEXT: vmv.v.i v0, 6 +; CHECK-NEXT: vrgather.vi v9, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -285,17 +280,17 @@ define void @deinterleave7_0_i64(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: li a0, 129 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vmv.s.x v16, a0 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vcompress.vm v20, v8, v16 ; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: vslidedown.vi v16, v8, 8 +; CHECK-NEXT: vmv4r.v v12, v8 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vrgather.vi v20, v8, 6, v0.t -; CHECK-NEXT: vse64.v v20, (a1) +; CHECK-NEXT: vslidedown.vi v12, v12, 6, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vrgather.vi v12, v16, 6, v0.t +; CHECK-NEXT: vse64.v v12, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i64>, ptr %in @@ -329,18 +324,18 @@ define void @deinterleave7_0_i32_subvec(ptr %in, ptr %out) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: li a0, 129 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vmv.s.x v12, a0 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vcompress.vm v14, v8, v12 ; CHECK-NEXT: vsetivli zero, 8, 
e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 8 +; CHECK-NEXT: vslidedown.vi v12, v8, 8 +; CHECK-NEXT: vmv2r.v v10, v8 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 2 +; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vrgather.vi v14, v8, 6, v0.t +; CHECK-NEXT: vslidedown.vi v10, v10, 6, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vrgather.vi v10, v12, 6, v0.t ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v14, (a1) +; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i32>, ptr %in @@ -444,8 +439,8 @@ define void @deinterleave8_8_i8_two_source(ptr %in0, ptr %in1, ptr %out) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vmv.v.i v0, -3 -; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t +; CHECK-NEXT: vmv.v.i v0, 1 +; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t ; CHECK-NEXT: vse8.v v9, (a2) ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index c0c17d4e0623e..2da18fbb8e41c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -182,19 +182,19 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) { ; CHECK-LABEL: shuffle1: ; CHECK: # %bb.0: ; CHECK-NEXT: addi a0, a0, 252 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v11, (a0) -; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmv.v.i v0, 1 +; CHECK-NEXT: vmv.v.i v8, 5 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vsrl.vi v10, v10, 1 -; CHECK-NEXT: vadd.vi v10, v10, 1 -; CHECK-NEXT: vrgather.vv v9, v11, v10, v0.t +; CHECK-NEXT: vslidedown.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmerge.vvm v11, v11, v9, v0 ; CHECK-NEXT: addi a0, a1, 672 -; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vs2r.v v10, (a0) ; CHECK-NEXT: ret %1 = getelementptr i32, ptr %explicit_0, i64 63 %2 = load <3 x i32>, ptr %1, align 1 @@ -209,15 +209,14 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) { define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) { ; CHECK-LABEL: shuffle2: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 1 +; CHECK-NEXT: vmv.v.i v0, 6 +; CHECK-NEXT: vslideup.vi v12, v8, 2 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v8 ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v13 -; CHECK-NEXT: vadd.vv v13, v13, v13 -; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vrsub.vi v13, v13, 4 -; CHECK-NEXT: vrgather.vv v9, v12, v13, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v12, v0 ; CHECK-NEXT: ret %b = extractelement <4 x float> %a, i32 2 %c = insertelement <16 x float> , float %b, i32 5 @@ -234,7 +233,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vmv.v.i v0, 1 ; 
RV32-NEXT: li a1, 32 -; RV32-NEXT: vrgather.vi v18, v15, 1, v0.t +; RV32-NEXT: vslidedown.vi v18, v15, 1, v0.t ; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; RV32-NEXT: vslidedown.vx v8, v16, a0 ; RV32-NEXT: vmv.x.s a0, v8 @@ -260,7 +259,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vsca ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vmv.v.i v16, 0 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV64-NEXT: vrgather.vi v18, v15, 1, v0.t +; RV64-NEXT: vslidedown.vi v18, v15, 1, v0.t ; RV64-NEXT: mv s2, sp ; RV64-NEXT: vs8r.v v16, (s2) ; RV64-NEXT: andi a0, a0, 15 @@ -290,14 +289,13 @@ define <4 x double> @shuffles_add(<4 x double> %0, <4 x double> %1) vscale_range ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmv1r.v v13, v10 -; CHECK-NEXT: vslideup.vi v13, v11, 1 -; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: vmv.v.i v0, 1 +; CHECK-NEXT: vslideup.vi v13, v11, 1 +; CHECK-NEXT: vslidedown.vi v11, v10, 1, v0.t +; CHECK-NEXT: vmv1r.v v10, v9 ; CHECK-NEXT: vrgather.vi v12, v9, 0 -; CHECK-NEXT: vmv1r.v v9, v11 -; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfadd.vv v8, v12, v8 +; CHECK-NEXT: vfadd.vv v8, v12, v10 ; CHECK-NEXT: ret %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> @@ -325,10 +323,9 @@ entry: define <16 x i32> @m4_linear_num_of_shuffles_in_chunks(<16 x i32> %0) vscale_range(2,2) { ; CHECK-LABEL: m4_linear_num_of_shuffles_in_chunks: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmv.v.i v0, 8 -; CHECK-NEXT: vrgather.vi v12, v10, 0 -; CHECK-NEXT: vrgather.vi v12, v11, 0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v12, v10, 2 +; CHECK-NEXT: vslideup.vi v12, v11, 3 ; CHECK-NEXT: vrgather.vi v14, v8, 2 ; CHECK-NEXT: vrgather.vi v15, v10, 3 ; CHECK-NEXT: vmv4r.v v8, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index c9fe39685fbc6..5b8e312a06ad4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -510,15 +510,12 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i64_16: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI19_0) -; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI19_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 -; ZVKB-ZVE32X-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vrgather.vv v11, v9, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 -; ZVKB-ZVE32X-NEXT: vmv2r.v v8, v10 +; ZVKB-ZVE32X-NEXT: li a0, 136 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.s.x v0, a0 +; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 3, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -558,16 +555,12 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i64_32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, 8240 -; ZVKB-ZVE32X-NEXT: addi a0, a0, 1 -; ZVKB-ZVE32X-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vmv.s.x v10, 
a0 -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 -; ZVKB-ZVE32X-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v11, v9, v12 -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v10, v8, v12 -; ZVKB-ZVE32X-NEXT: vmv2r.v v8, v10 +; ZVKB-ZVE32X-NEXT: li a0, 204 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.s.x v0, a0 +; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 2 +; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 2, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -607,15 +600,12 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i64_48: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI21_0) -; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI21_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 -; ZVKB-ZVE32X-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vrgather.vv v11, v9, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 -; ZVKB-ZVE32X-NEXT: vmv2r.v v8, v10 +; ZVKB-ZVE32X-NEXT: li a0, -18 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.s.x v0, a0 +; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 3 +; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 1, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -655,17 +645,12 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i32_as_i64: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI22_0) -; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI22_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v13, v9, v16 -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v14, v10, v16 -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v15, v11, v16 -; ZVKB-ZVE32X-NEXT: vmv4r.v v8, v12 +; ZVKB-ZVE32X-NEXT: li a0, 170 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e32, m4, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.s.x v0, a0 +; ZVKB-ZVE32X-NEXT: vslidedown.vi v12, v8, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v12, v8, 1, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %shuffle @@ -729,15 +714,12 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i64_16: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI24_0) -; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI24_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 -; ZVKB-ZVE32X-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vrgather.vv v11, v9, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 -; ZVKB-ZVE32X-NEXT: vmv2r.v v8, v10 +; ZVKB-ZVE32X-NEXT: li a0, 136 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.s.x v0, a0 +; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 3, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x 
half> %shuffle @@ -777,16 +759,12 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i64_32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, 8240 -; ZVKB-ZVE32X-NEXT: addi a0, a0, 1 -; ZVKB-ZVE32X-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vmv.s.x v10, a0 -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 -; ZVKB-ZVE32X-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v11, v9, v12 -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v10, v8, v12 -; ZVKB-ZVE32X-NEXT: vmv2r.v v8, v10 +; ZVKB-ZVE32X-NEXT: li a0, 204 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.s.x v0, a0 +; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 2 +; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 2, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x half> %shuffle @@ -826,15 +804,12 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i64_48: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI26_0) -; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI26_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 -; ZVKB-ZVE32X-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vrgather.vv v11, v9, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 -; ZVKB-ZVE32X-NEXT: vmv2r.v v8, v10 +; ZVKB-ZVE32X-NEXT: li a0, -18 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.s.x v0, a0 +; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 3 +; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 1, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x half> %shuffle @@ -874,17 +849,12 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8f32_as_i64: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI27_0) -; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI27_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v13, v9, v16 -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v14, v10, v16 -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v15, v11, v16 -; ZVKB-ZVE32X-NEXT: vmv4r.v v8, v12 +; ZVKB-ZVE32X-NEXT: li a0, 170 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e32, m4, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.s.x v0, a0 +; ZVKB-ZVE32X-NEXT: vslidedown.vi v12, v8, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v12, v8, 1, v0.t +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x float> %v, <8 x float> poison, <8 x i32> ret <8 x float> %shuffle @@ -924,13 +894,12 @@ define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2) ; ; ZVKB-ZVE32X-LABEL: shuffle_v8f32_as_i64_exact: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: lui a0, 8240 -; ZVKB-ZVE32X-NEXT: addi a0, a0, 1 -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVKB-ZVE32X-NEXT: vmv.s.x v10, a0 -; ZVKB-ZVE32X-NEXT: vsext.vf4 v12, v10 -; ZVKB-ZVE32X-NEXT: vrgather.vv v11, v9, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; ZVKB-ZVE32X-NEXT: vmv.v.i v0, 10 +; 
ZVKB-ZVE32X-NEXT: vslidedown.vi v11, v9, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v11, v9, 1, v0.t +; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 1, v0.t ; ZVKB-ZVE32X-NEXT: vmv2r.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x float> %v, <8 x float> poison, <8 x i32> @@ -940,30 +909,22 @@ define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2) define <8 x i64> @shuffle_v8i64_as_i128(<8 x i64> %v) { ; CHECK-LABEL: shuffle_v8i64_as_i128: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI29_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI29_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v13, v9, v16 -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vrgatherei16.vv v14, v10, v16 -; CHECK-NEXT: vrgatherei16.vv v15, v11, v16 -; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: li a0, 170 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vslidedown.vi v12, v8, 1 +; CHECK-NEXT: vslideup.vi v12, v8, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i64_as_i128: ; ZVKB-V: # %bb.0: -; ZVKB-V-NEXT: lui a0, %hi(.LCPI29_0) -; ZVKB-V-NEXT: addi a0, a0, %lo(.LCPI29_0) -; ZVKB-V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVKB-V-NEXT: vle16.v v16, (a0) -; ZVKB-V-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; ZVKB-V-NEXT: vrgatherei16.vv v13, v9, v16 -; ZVKB-V-NEXT: vrgatherei16.vv v12, v8, v16 -; ZVKB-V-NEXT: vrgatherei16.vv v14, v10, v16 -; ZVKB-V-NEXT: vrgatherei16.vv v15, v11, v16 -; ZVKB-V-NEXT: vmv4r.v v8, v12 +; ZVKB-V-NEXT: li a0, 170 +; ZVKB-V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; ZVKB-V-NEXT: vmv.s.x v0, a0 +; ZVKB-V-NEXT: vslidedown.vi v12, v8, 1 +; ZVKB-V-NEXT: vslideup.vi v12, v8, 1, v0.t +; ZVKB-V-NEXT: vmv.v.v v8, v12 ; ZVKB-V-NEXT: ret %shuffle = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> ret <8 x i64> %shuffle @@ -973,30 +934,22 @@ define <8 x i64> @shuffle_v8i64_as_i128(<8 x i64> %v) { define <8 x i64> @shuffle_v8i64_as_i128_2(<8 x i64> %v) { ; CHECK-LABEL: shuffle_v8i64_as_i128_2: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI30_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vrgatherei16.vv v13, v9, v16 -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vrgatherei16.vv v14, v10, v16 -; CHECK-NEXT: vrgatherei16.vv v15, v11, v16 -; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: li a0, 168 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vslidedown.vi v12, v8, 1 +; CHECK-NEXT: vslideup.vi v12, v8, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i64_as_i128_2: ; ZVKB-V: # %bb.0: -; ZVKB-V-NEXT: lui a0, %hi(.LCPI30_0) -; ZVKB-V-NEXT: addi a0, a0, %lo(.LCPI30_0) -; ZVKB-V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVKB-V-NEXT: vle16.v v16, (a0) -; ZVKB-V-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; ZVKB-V-NEXT: vrgatherei16.vv v13, v9, v16 -; ZVKB-V-NEXT: vrgatherei16.vv v12, v8, v16 -; ZVKB-V-NEXT: vrgatherei16.vv v14, v10, v16 -; ZVKB-V-NEXT: vrgatherei16.vv v15, v11, v16 -; ZVKB-V-NEXT: vmv4r.v v8, v12 +; ZVKB-V-NEXT: li a0, 168 +; ZVKB-V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; ZVKB-V-NEXT: vmv.s.x v0, a0 +; ZVKB-V-NEXT: vslidedown.vi v12, v8, 1 +; ZVKB-V-NEXT: vslideup.vi v12, v8, 1, 
v0.t +; ZVKB-V-NEXT: vmv.v.v v8, v12 ; ZVKB-V-NEXT: ret %shuffle = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> ret <8 x i64> %shuffle diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll index 814e35f201dca..ad7cf7eee5023 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll @@ -121,7 +121,7 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v0, 1 -; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t +; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> @@ -166,7 +166,7 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v0, 1 -; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t +; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> @@ -188,7 +188,7 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v0, 1 -; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t +; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> @@ -233,7 +233,7 @@ define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vmv.v.i v0, 1 -; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t +; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll index 49f6acf9ba8c9..a171a7f8ac5f1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll @@ -374,11 +374,9 @@ define <4 x i8> @vslide1up_4xi8_neg_undef_insert(<4 x i8> %v, i8 %b) { define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert(<4 x i8> %v, i8 %b) { ; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8208 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vslideup.vi v9, v8, 1 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %v2 = shufflevector <4 x i8> poison, <4 x i8> %v, <4 x i32> @@ -400,12 +398,9 @@ define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert2(<4 x i8> %v, i8 %b) { define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert3(<4 x i8> %v, i8 %b) { ; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert3: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8208 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, 
v8, 1
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
%v2 = shufflevector <4 x i8> poison, <4 x i8> %v, <4 x i32>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 180579e47d075..0fbb139d5f461 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -182,7 +182,7 @@ define void @vnsrl_32_i32(ptr %in, ptr %out) {
; ZVE32F-NEXT: vmv.v.i v0, 1
; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; ZVE32F-NEXT: vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT: vslidedown.vi v9, v8, 1, v0.t
; ZVE32F-NEXT: vse32.v v9, (a1)
; ZVE32F-NEXT: ret
entry:
@@ -236,7 +236,7 @@ define void @vnsrl_32_float(ptr %in, ptr %out) {
; ZVE32F-NEXT: vmv.v.i v0, 1
; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu
; ZVE32F-NEXT: vslidedown.vi v9, v8, 2
-; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT: vslidedown.vi v9, v8, 1, v0.t
; ZVE32F-NEXT: vse32.v v9, (a1)
; ZVE32F-NEXT: ret
entry:
@@ -279,7 +279,7 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) {
; V-NEXT: vmv.v.i v0, 1
; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; V-NEXT: vslidedown.vi v9, v8, 2
-; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vslidedown.vi v9, v8, 1, v0.t
; V-NEXT: vse64.v v9, (a1)
; V-NEXT: ret
;
@@ -330,7 +330,7 @@ define void @vnsrl_64_double(ptr %in, ptr %out) {
; V-NEXT: vmv.v.i v0, 1
; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; V-NEXT: vslidedown.vi v9, v8, 2
-; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vslidedown.vi v9, v8, 1, v0.t
; V-NEXT: vse64.v v9, (a1)
; V-NEXT: ret
;
@@ -386,23 +386,20 @@ define void @vnsrl_0_i8_undef3(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: li a0, -32
-; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: lui a0, 24640
-; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 8
+; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, mu
; CHECK-NEXT: vid.v v9
-; CHECK-NEXT: addi a0, a0, 6
+; CHECK-NEXT: li a0, -32
; CHECK-NEXT: vadd.vv v9, v9, v9
-; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v10, a0
-; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 2
+; CHECK-NEXT: vslidedown.vi v10, v8, 3, v0.t
+; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vadd.vi v9, v9, -8
-; CHECK-NEXT: vrgather.vv v11, v8, v10
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vi v8, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, mu
-; CHECK-NEXT: vrgather.vv v11, v8, v9, v0.t
-; CHECK-NEXT: vse8.v v11, (a1)
+; CHECK-NEXT: vrgather.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vse8.v v10, (a1)
; CHECK-NEXT: ret
entry:
%0 = load <16 x i8>, ptr %in, align 1
@@ -417,21 +414,22 @@ define void @vnsrl_0_i8_undef_negative(ptr %in, ptr %out) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: lui a0, %hi(.LCPI17_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI17_0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, ma
-; CHECK-NEXT: vid.v v9
-; CHECK-NEXT: vle8.v v10, (a0)
-; CHECK-NEXT: li a0, 48
-; CHECK-NEXT: vadd.vv v9, v9, v9
-; CHECK-NEXT: vmv.s.x v0, a0
-; CHECK-NEXT: vadd.vi v9, v9, -8
-; CHECK-NEXT: vrgather.vv v11, v8, v10
+; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vslidedown.vi v8, v8, 8
+; CHECK-NEXT: vslidedown.vi v10, v8, 8
; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, mu
-; CHECK-NEXT: vrgather.vv v11, v8, v9, v0.t
-; CHECK-NEXT: vse8.v v11, (a1)
+; CHECK-NEXT: vslideup.vi v11, v10, 4
+; CHECK-NEXT: vslideup.vi v11, v10, 3, v0.t
+; CHECK-NEXT: li a0, 48
+; CHECK-NEXT: vmv.s.x v0, a0
+; CHECK-NEXT: vrgather.vv v10, v8, v9
+; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0
+; CHECK-NEXT: vse8.v v8, (a1)
; CHECK-NEXT: ret
entry:
%0 = load <16 x i8>, ptr %in, align 1
@@ -822,7 +820,7 @@ define void @vnsrl_32_i32_two_source(ptr %in0, ptr %in1, ptr %out) {
; V-NEXT: vle32.v v8, (a0)
; V-NEXT: vle32.v v9, (a1)
; V-NEXT: vmv.v.i v0, 1
-; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vslidedown.vi v9, v8, 1, v0.t
; V-NEXT: vse32.v v9, (a2)
; V-NEXT: ret
;
@@ -832,7 +830,7 @@ define void @vnsrl_32_i32_two_source(ptr %in0, ptr %in1, ptr %out) {
; ZVE32F-NEXT: vle32.v v8, (a0)
; ZVE32F-NEXT: vle32.v v9, (a1)
; ZVE32F-NEXT: vmv.v.i v0, 1
-; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT: vslidedown.vi v9, v8, 1, v0.t
; ZVE32F-NEXT: vse32.v v9, (a2)
; ZVE32F-NEXT: ret
entry:
@@ -876,7 +874,7 @@ define void @vnsrl_32_float_two_source(ptr %in0, ptr %in1, ptr %out) {
; V-NEXT: vle32.v v8, (a0)
; V-NEXT: vle32.v v9, (a1)
; V-NEXT: vmv.v.i v0, 1
-; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vslidedown.vi v9, v8, 1, v0.t
; V-NEXT: vse32.v v9, (a2)
; V-NEXT: ret
;
@@ -886,7 +884,7 @@ define void @vnsrl_32_float_two_source(ptr %in0, ptr %in1, ptr %out) {
; ZVE32F-NEXT: vle32.v v8, (a0)
; ZVE32F-NEXT: vle32.v v9, (a1)
; ZVE32F-NEXT: vmv.v.i v0, 1
-; ZVE32F-NEXT: vrgather.vi v9, v8, 1, v0.t
+; ZVE32F-NEXT: vslidedown.vi v9, v8, 1, v0.t
; ZVE32F-NEXT: vse32.v v9, (a2)
; ZVE32F-NEXT: ret
entry:
@@ -930,7 +928,7 @@ define void @vnsrl_64_i64_two_source(ptr %in0, ptr %in1, ptr %out) {
; V-NEXT: vle64.v v8, (a0)
; V-NEXT: vle64.v v9, (a1)
; V-NEXT: vmv.v.i v0, 1
-; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vslidedown.vi v9, v8, 1, v0.t
; V-NEXT: vse64.v v9, (a2)
; V-NEXT: ret
;
@@ -983,7 +981,7 @@ define void @vnsrl_64_double_two_source(ptr %in0, ptr %in1, ptr %out) {
; V-NEXT: vle64.v v8, (a0)
; V-NEXT: vle64.v v9, (a1)
; V-NEXT: vmv.v.i v0, 1
-; V-NEXT: vrgather.vi v9, v8, 1, v0.t
+; V-NEXT: vslidedown.vi v9, v8, 1, v0.t
; V-NEXT: vse64.v v9, (a2)
; V-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 8b41febced065..4337bedde8674 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -78,7 +78,7 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v0, 1
; CHECK-NEXT: vmv1r.v v9, v10
-; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t
+; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
%retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
@@ -88,27 +88,33 @@ ret {<2 x i64>, <2 x i64>} %retval
define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) {
; CHECK-LABEL: vector_deinterleave_v4i64_v8i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v14, 5
-; CHECK-NEXT: vid.v v15
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 8
; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vmv.v.i v18, 10
-; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT: vcompress.vm v12, v8, v14
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vv v14, v15, v15
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT: vcompress.vm v10, v8, v18
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vi v8, v14, -4
-; CHECK-NEXT: vadd.vi v9, v14, -3
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
-; CHECK-NEXT: vrgatherei16.vv v10, v16, v9, v0.t
-; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: vmv.v.i v10, 2
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: vmv.v.i v11, 12
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; CHECK-NEXT: vslideup.vi v14, v16, 2
+; CHECK-NEXT: vslideup.vi v14, v16, 1, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vslidedown.vi v12, v8, 1, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vvm v12, v12, v14, v0
+; CHECK-NEXT: vslidedown.vi v18, v8, 1
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v14, 4
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; CHECK-NEXT: vslidedown.vi v18, v8, 2, v0.t
+; CHECK-NEXT: vmv2r.v v8, v16
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: vslideup.vi v8, v16, 1, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vvm v10, v18, v8, v0
+; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
%retval = call {<4 x i64>, <4 x i64>} @llvm.vector.deinterleave2.v8i64(<8 x i64> %vec)
ret {<4 x i64>, <4 x i64>} %retval
@@ -481,7 +487,7 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; CHECK-NEXT: vmv.v.i v0, 1
; CHECK-NEXT: vmv1r.v v9, v10
-; CHECK-NEXT: vrgather.vi v9, v8, 1, v0.t
+; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: ret
%retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
@@ -491,27 +497,33 @@ ret {<2 x double>, <2 x double>} %retval
define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double> %vec) {
; CHECK-LABEL: vector_deinterleave_v4f64_v8f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v14, 5
-; CHECK-NEXT: vid.v v15
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v0, 8
; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma
; CHECK-NEXT: vslidedown.vi v16, v8, 4
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT: vmv.v.i v0, 12
-; CHECK-NEXT: vmv.v.i v18, 10
-; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT: vcompress.vm v12, v8, v14
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vv v14, v15, v15
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT: vcompress.vm v10, v8, v18
-; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vadd.vi v8, v14, -4
-; CHECK-NEXT: vadd.vi v9, v14, -3
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
-; CHECK-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
-; CHECK-NEXT: vrgatherei16.vv v10, v16, v9, v0.t
-; CHECK-NEXT: vmv.v.v v8, v12
+; CHECK-NEXT: vmv.v.i v10, 2
+; CHECK-NEXT: vmv2r.v v12, v8
+; CHECK-NEXT: vmv.v.i v11, 12
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; CHECK-NEXT: vslideup.vi v14, v16, 2
+; CHECK-NEXT: vslideup.vi v14, v16, 1, v0.t
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vslidedown.vi v12, v8, 1, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vvm v12, v12, v14, v0
+; CHECK-NEXT: vslidedown.vi v18, v8, 1
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.i v14, 4
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
+; CHECK-NEXT: vslidedown.vi v18, v8, 2, v0.t
+; CHECK-NEXT: vmv2r.v v8, v16
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: vslideup.vi v8, v16, 1, v0.t
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vvm v10, v18, v8, v0
+; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: ret
%retval = call {<4 x double>, <4 x double>} @llvm.vector.deinterleave2.v8f64(<8 x double> %vec)
ret {<4 x double>, <4 x double>} %retval