diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 7fbac14112af3..8c0a046d3a7e9 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20209,7 +20209,7 @@ Arguments: The argument to this intrinsic must be a vector. -'``llvm.vector.deinterleave2/3/5/7``' Intrinsic +'``llvm.vector.deinterleave2/3/4/5/6/7/8``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -20227,8 +20227,8 @@ This is an overloaded intrinsic. Overview: """"""""" -The '``llvm.vector.deinterleave2/3/5/7``' intrinsics deinterleave adjacent lanes -into 2, 3, 5, and 7 separate vectors, respectively, and return them as the +The '``llvm.vector.deinterleave2/3/4/5/6/7/8``' intrinsics deinterleave adjacent lanes +into 2, 3, 4, 5, 6, 7, and 8 separate vectors, respectively, and return them as the result. This intrinsic works for both fixed and scalable vectors. While this intrinsic @@ -20250,7 +20250,7 @@ Arguments: The argument is a vector whose type corresponds to the logical concatenation of the aggregated result types. -'``llvm.vector.interleave2/3/5/7``' Intrinsic +'``llvm.vector.interleave2/3/4/5/6/7/8``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: @@ -20268,7 +20268,7 @@ This is an overloaded intrinsic. Overview: """"""""" -The '``llvm.vector.interleave2/3/5/7``' intrinsic constructs a vector +The '``llvm.vector.interleave2/3/4/5/6/7/8``' intrinsics construct a vector by interleaving all the input vectors. This intrinsic works for both fixed and scalable vectors. While this intrinsic
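For illustration only (not part of the patch; the value names and the <2 x i32>/<8 x i32> types are chosen for exposition), the new factor-4 forms follow the same lane ordering as the existing 2/3/5/7 variants: interleave4 takes one lane from each of its four operands in round-robin order, and deinterleave4 inverts that:

  %v = call <8 x i32> @llvm.vector.interleave4.v8i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d)
  ; with %a = <a0, a1>, %b = <b0, b1>, etc., %v is <a0, b0, c0, d0, a1, b1, c1, d1>
  %res = call {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave4.v8i32(<8 x i32> %v)
  ; %res recovers { %a, %b, %c, %d }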
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 6fb1bf9359b9a..5a810784a54bf 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -153,8 +153,11 @@ namespace Intrinsic { TruncArgument, HalfVecArgument, OneThirdVecArgument, + OneFourthVecArgument, OneFifthVecArgument, + OneSixthVecArgument, OneSeventhVecArgument, + OneEighthVecArgument, SameVecWidthArgument, VecOfAnyPtrsToElt, VecElementArgument, @@ -166,9 +169,12 @@ namespace Intrinsic { AArch64Svcount, } Kind; - // These three have to be contiguous. - static_assert(OneFifthVecArgument == OneThirdVecArgument + 1 && - OneSeventhVecArgument == OneFifthVecArgument + 1); + // These six have to be contiguous. + static_assert(OneFourthVecArgument == OneThirdVecArgument + 1 && + OneFifthVecArgument == OneFourthVecArgument + 1 && + OneSixthVecArgument == OneFifthVecArgument + 1 && + OneSeventhVecArgument == OneSixthVecArgument + 1 && + OneEighthVecArgument == OneSeventhVecArgument + 1); union { unsigned Integer_Width; unsigned Float_Width; @@ -188,19 +194,19 @@ namespace Intrinsic { unsigned getArgumentNumber() const { assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || - Kind == OneThirdVecArgument || Kind == OneFifthVecArgument || - Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument || - Kind == VecElementArgument || Kind == Subdivide2Argument || - Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt); + (Kind >= OneThirdVecArgument && Kind <= OneEighthVecArgument) || + Kind == SameVecWidthArgument || Kind == VecElementArgument || + Kind == Subdivide2Argument || Kind == Subdivide4Argument || + Kind == VecOfBitcastsToInt); return Argument_Info >> 3; } ArgKind getArgumentKind() const { assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || - Kind == OneThirdVecArgument || Kind == OneFifthVecArgument || - Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument || - Kind == VecElementArgument || Kind == Subdivide2Argument || - Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt); + (Kind >= OneThirdVecArgument && Kind <= OneEighthVecArgument) || + Kind == SameVecWidthArgument || Kind == VecElementArgument || + Kind == Subdivide2Argument || Kind == Subdivide4Argument || + Kind == VecOfBitcastsToInt); return (ArgKind)(Argument_Info & 7); } diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index d5481c6b81f9f..900c3469e2bb8 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -340,6 +340,9 @@ def IIT_ONE_FIFTH_VEC_ARG : IIT_Base<63>; def IIT_ONE_SEVENTH_VEC_ARG : IIT_Base<64>; def IIT_V2048: IIT_Vec<2048, 65>; def IIT_V4096: IIT_Vec<4096, 66>; +def IIT_ONE_FOURTH_VEC_ARG : IIT_Base<67>; +def IIT_ONE_SIXTH_VEC_ARG : IIT_Base<68>; +def IIT_ONE_EIGHTH_VEC_ARG : IIT_Base<69>; } defvar IIT_all_FixedTypes = !filter(iit, IIT_all, @@ -483,12 +486,21 @@ class LLVMHalfElementsVectorType<int num> class LLVMOneThirdElementsVectorType<int num> : LLVMMatchType<num, IIT_ONE_THIRD_VEC_ARG>; +class LLVMOneFourthElementsVectorType<int num> + : LLVMMatchType<num, IIT_ONE_FOURTH_VEC_ARG>; + class LLVMOneFifthElementsVectorType<int num> : LLVMMatchType<num, IIT_ONE_FIFTH_VEC_ARG>; +class LLVMOneSixthElementsVectorType<int num> + : LLVMMatchType<num, IIT_ONE_SIXTH_VEC_ARG>; + class LLVMOneSeventhElementsVectorType<int num> : LLVMMatchType<num, IIT_ONE_SEVENTH_VEC_ARG>; +class LLVMOneEighthElementsVectorType<int num> + : LLVMMatchType<num, IIT_ONE_EIGHTH_VEC_ARG>; + // Match the type of another intrinsic parameter that is expected to be a // vector type (i.e. <N x iM>) but with each element subdivided to // form a vector with more elements that are smaller than the original.
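To make the type constraint these new classes encode concrete (the specific scalable types below are illustrative, not taken from the patch): LLVMOneEighthElementsVectorType<0> requires each matching value to have one eighth of the element count of overloaded operand 0, so a deinterleave8 of <vscale x 16 x i8> must return eight <vscale x 2 x i8> parts:

  %parts = call {<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i8>} @llvm.vector.deinterleave8.nxv16i8(<vscale x 16 x i8> %vec)
  ; conversely, @llvm.vector.interleave8.nxv16i8 takes eight <vscale x 2 x i8> operands and returns one <vscale x 16 x i8>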
@@ -2781,6 +2793,20 @@ def int_vector_deinterleave3 : DefaultAttrsIntrinsic<[LLVMOneThirdElementsVector [llvm_anyvector_ty], [IntrNoMem]>; +def int_vector_interleave4 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMOneFourthElementsVectorType<0>, + LLVMOneFourthElementsVectorType<0>, + LLVMOneFourthElementsVectorType<0>, + LLVMOneFourthElementsVectorType<0>], + [IntrNoMem]>; + +def int_vector_deinterleave4 : DefaultAttrsIntrinsic<[LLVMOneFourthElementsVectorType<0>, + LLVMOneFourthElementsVectorType<0>, + LLVMOneFourthElementsVectorType<0>, + LLVMOneFourthElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + def int_vector_interleave5 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMOneFifthElementsVectorType<0>, LLVMOneFifthElementsVectorType<0>, @@ -2797,6 +2823,24 @@ def int_vector_deinterleave5 : DefaultAttrsIntrinsic<[LLVMOneFifthElementsVector [llvm_anyvector_ty], [IntrNoMem]>; +def int_vector_interleave6 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>], + [IntrNoMem]>; + +def int_vector_deinterleave6 : DefaultAttrsIntrinsic<[LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>, + LLVMOneSixthElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + def int_vector_interleave7 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMOneSeventhElementsVectorType<0>, LLVMOneSeventhElementsVectorType<0>, @@ -2817,6 +2861,28 @@ def int_vector_deinterleave7 : DefaultAttrsIntrinsic<[LLVMOneSeventhElementsVect [llvm_anyvector_ty], [IntrNoMem]>; +def int_vector_interleave8 : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>], + [IntrNoMem]>; + +def int_vector_deinterleave8 : DefaultAttrsIntrinsic<[LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>, + LLVMOneEighthElementsVectorType<0>], + [llvm_anyvector_ty], + [IntrNoMem]>; + //===-------------- Intrinsics to perform partial reduction ---------------===// def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 434484b671bf2..ca195cb37de8a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8198,24 +8198,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vector_interleave3: visitVectorInterleave(I, 3); return; + case Intrinsic::vector_interleave4: + visitVectorInterleave(I, 4); + return; case Intrinsic::vector_interleave5: visitVectorInterleave(I, 5); return; + case Intrinsic::vector_interleave6: + visitVectorInterleave(I, 6); + return; case Intrinsic::vector_interleave7: visitVectorInterleave(I, 7); return; + 
case Intrinsic::vector_interleave8: + visitVectorInterleave(I, 8); + return; case Intrinsic::vector_deinterleave2: visitVectorDeinterleave(I, 2); return; case Intrinsic::vector_deinterleave3: visitVectorDeinterleave(I, 3); return; + case Intrinsic::vector_deinterleave4: + visitVectorDeinterleave(I, 4); + return; case Intrinsic::vector_deinterleave5: visitVectorDeinterleave(I, 5); return; + case Intrinsic::vector_deinterleave6: + visitVectorDeinterleave(I, 6); + return; case Intrinsic::vector_deinterleave7: visitVectorDeinterleave(I, 7); return; + case Intrinsic::vector_deinterleave8: + visitVectorDeinterleave(I, 8); + return; case Intrinsic::experimental_vector_compress: setValue(&I, DAG.getNode(ISD::VECTOR_COMPRESS, sdl, getValue(I.getArgOperand(0)).getValueType(), diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index dabb5fe006b3c..28f7523476774 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -378,18 +378,36 @@ DecodeIITType(unsigned &NextElt, ArrayRef Infos, IITDescriptor::get(IITDescriptor::OneThirdVecArgument, ArgInfo)); return; } + case IIT_ONE_FOURTH_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::OneFourthVecArgument, ArgInfo)); + return; + } case IIT_ONE_FIFTH_VEC_ARG: { unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); OutputTable.push_back( IITDescriptor::get(IITDescriptor::OneFifthVecArgument, ArgInfo)); return; } + case IIT_ONE_SIXTH_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::OneSixthVecArgument, ArgInfo)); + return; + } case IIT_ONE_SEVENTH_VEC_ARG: { unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); OutputTable.push_back( IITDescriptor::get(IITDescriptor::OneSeventhVecArgument, ArgInfo)); return; } + case IIT_ONE_EIGHTH_VEC_ARG: { + unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::OneEighthVecArgument, ArgInfo)); + return; + } case IIT_SAME_VEC_WIDTH_ARG: { unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]); OutputTable.push_back( @@ -584,11 +602,14 @@ static Type *DecodeFixedType(ArrayRef &Infos, return VectorType::getHalfElementsVectorType( cast(Tys[D.getArgumentNumber()])); case IITDescriptor::OneThirdVecArgument: + case IITDescriptor::OneFourthVecArgument: case IITDescriptor::OneFifthVecArgument: + case IITDescriptor::OneSixthVecArgument: case IITDescriptor::OneSeventhVecArgument: + case IITDescriptor::OneEighthVecArgument: return VectorType::getOneNthElementsVectorType( cast(Tys[D.getArgumentNumber()]), - 3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2); + 3 + (D.Kind - IITDescriptor::OneThirdVecArgument)); case IITDescriptor::SameVecWidthArgument: { Type *EltTy = DecodeFixedType(Infos, Tys, Context); Type *Ty = Tys[D.getArgumentNumber()]; @@ -974,15 +995,18 @@ matchIntrinsicType(Type *Ty, ArrayRef &Infos, VectorType::getHalfElementsVectorType( cast(ArgTys[D.getArgumentNumber()])) != Ty; case IITDescriptor::OneThirdVecArgument: + case IITDescriptor::OneFourthVecArgument: case IITDescriptor::OneFifthVecArgument: + case IITDescriptor::OneSixthVecArgument: case IITDescriptor::OneSeventhVecArgument: + case IITDescriptor::OneEighthVecArgument: // If this is a forward reference, defer the check for later. 
if (D.getArgumentNumber() >= ArgTys.size()) return IsDeferredCheck || DeferCheck(Ty); return !isa(ArgTys[D.getArgumentNumber()]) || VectorType::getOneNthElementsVectorType( cast(ArgTys[D.getArgumentNumber()]), - 3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2) != Ty; + 3 + (D.Kind - IITDescriptor::OneThirdVecArgument)) != Ty; case IITDescriptor::SameVecWidthArgument: { if (D.getArgumentNumber() >= ArgTys.size()) { // Defer check and subsequent check for the vector element type. diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index f6b5a35aa06d6..aab2f08277831 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV32 -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV64 -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-xrivosvizip | FileCheck %s --check-prefixes=CHECK,ZIP +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+m,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+m,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV64 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+m,+zvfh,+experimental-xrivosvizip | FileCheck %s --check-prefixes=CHECK,ZIP ; Integers @@ -189,15 +189,13 @@ define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) ret {<8 x i64>, <8 x i64>} %retval } -define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x i32> %v) { +define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x i32> %v) nounwind { ; CHECK-LABEL: vector_deinterleave3_v2i32_v6i32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v12, v8, 2 @@ -215,24 +213,51 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %res = call {<2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave3.v6i32(<6 x i32> %v) ret {<2 x i32>, <2 x i32>, <2 x i32>} %res } +define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave4_v2i32_v8i32(<8 x i32> %v) nounwind { +; CHECK-LABEL: vector_deinterleave4_v2i32_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 6 +; CHECK-NEXT: vslidedown.vi v12, v8, 4 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: addi a0, sp, 16 +; 
CHECK-NEXT: vmv.v.v v9, v12 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave4.v8i32(<8 x i32> %v) + ret {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %res +} -define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) { +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) nounwind { ; CHECK-LABEL: vector_deinterleave5_v2i16_v10i16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v12, v8, 6 @@ -257,291 +282,142 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave5.v10i16(<10 x i16> %v) ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res } -define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) { -; RV32-LABEL: vector_deinterleave7_v14i8_v2i8: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: .cfi_offset s1, -12 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 2 -; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb -; RV32-NEXT: addi a0, sp, 32 -; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; RV32-NEXT: csrr s1, vlenb -; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v11, v8, 10 -; RV32-NEXT: vslidedown.vi v10, v8, 8 -; RV32-NEXT: vslidedown.vi v9, v8, 2 -; RV32-NEXT: srli s0, s1, 3 -; RV32-NEXT: add a0, s0, s0 -; RV32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma -; RV32-NEXT: vslideup.vx v10, v11, s0 -; RV32-NEXT: vmv1r.v v11, v8 -; RV32-NEXT: vslideup.vx v11, v9, s0 -; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 12 -; RV32-NEXT: srli a0, s1, 2 -; RV32-NEXT: add a1, a0, s0 -; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; RV32-NEXT: vslideup.vx v10, v9, a0 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 1 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 32 -; RV32-NEXT: vs1r.v v10, (a2) # vscale x 8-byte Folded Spill -; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 4 -; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; RV32-NEXT: vslideup.vx v11, v9, a0 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi 
a0, a0, 32 -; RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; RV32-NEXT: li a1, 3 -; RV32-NEXT: mv a0, s0 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: add s0, a0, s0 -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload -; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 6 -; RV32-NEXT: srli s1, s1, 1 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 32 -; RV32-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload -; RV32-NEXT: vsetvli zero, s0, e8, mf2, ta, ma -; RV32-NEXT: vslideup.vx v9, v8, a0 -; RV32-NEXT: add a0, s1, s1 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 32 -; RV32-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload -; RV32-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; RV32-NEXT: vslideup.vx v9, v8, s1 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a1, a0, 1 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 32 -; RV32-NEXT: vs1r.v v9, (a0) -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vlseg7e8.v v8, (a0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 2 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: .cfi_restore s1 -; RV32-NEXT: addi sp, sp, 48 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: vector_deinterleave7_v14i8_v2i8: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -64 -; RV64-NEXT: .cfi_def_cfa_offset 64 -; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: .cfi_offset s1, -24 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 2 -; RV64-NEXT: sub sp, sp, a0 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 4 * vlenb -; RV64-NEXT: addi a0, sp, 32 -; RV64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; RV64-NEXT: csrr s1, vlenb -; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; RV64-NEXT: vslidedown.vi v11, v8, 10 -; RV64-NEXT: vslidedown.vi v10, v8, 8 -; RV64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-NEXT: srli s0, s1, 3 -; RV64-NEXT: add a0, s0, s0 -; RV64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma -; RV64-NEXT: vslideup.vx v10, v11, s0 -; RV64-NEXT: vmv1r.v v11, v8 -; RV64-NEXT: vslideup.vx v11, v9, s0 -; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 12 -; RV64-NEXT: srli a0, s1, 2 -; RV64-NEXT: add a1, a0, s0 -; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; RV64-NEXT: vslideup.vx v10, v9, a0 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 1 -; RV64-NEXT: add a2, sp, a2 -; RV64-NEXT: addi a2, a2, 32 -; RV64-NEXT: vs1r.v v10, (a2) # vscale x 8-byte Folded Spill -; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 4 -; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; RV64-NEXT: vslideup.vx v11, v9, a0 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 32 -; RV64-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; RV64-NEXT: li a1, 3 -; RV64-NEXT: mv a0, s0 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: add 
s0, a0, s0 -; RV64-NEXT: addi a1, sp, 32 -; RV64-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload -; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; RV64-NEXT: vslidedown.vi v8, v8, 6 -; RV64-NEXT: srli s1, s1, 1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 32 -; RV64-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload -; RV64-NEXT: vsetvli zero, s0, e8, mf2, ta, ma -; RV64-NEXT: vslideup.vx v9, v8, a0 -; RV64-NEXT: add a0, s1, s1 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 32 -; RV64-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload -; RV64-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; RV64-NEXT: vslideup.vx v9, v8, s1 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 1 -; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 32 -; RV64-NEXT: vs1r.v v9, (a0) -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vlseg7e8.v v8, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 2 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 64 -; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: .cfi_restore s1 -; RV64-NEXT: addi sp, sp, 64 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret -; -; ZIP-LABEL: vector_deinterleave7_v14i8_v2i8: -; ZIP: # %bb.0: -; ZIP-NEXT: addi sp, sp, -64 -; ZIP-NEXT: .cfi_def_cfa_offset 64 -; ZIP-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; ZIP-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; ZIP-NEXT: sd s1, 40(sp) # 8-byte Folded Spill -; ZIP-NEXT: .cfi_offset ra, -8 -; ZIP-NEXT: .cfi_offset s0, -16 -; ZIP-NEXT: .cfi_offset s1, -24 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a0, a0, 2 -; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 4 * vlenb -; ZIP-NEXT: addi a0, sp, 32 -; ZIP-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill -; ZIP-NEXT: csrr s1, vlenb -; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; ZIP-NEXT: vslidedown.vi v11, v8, 10 -; ZIP-NEXT: vslidedown.vi v10, v8, 8 -; ZIP-NEXT: vslidedown.vi v9, v8, 2 -; ZIP-NEXT: srli s0, s1, 3 -; ZIP-NEXT: add a0, s0, s0 -; ZIP-NEXT: vsetvli zero, a0, e8, mf2, tu, ma -; ZIP-NEXT: vslideup.vx v10, v11, s0 -; ZIP-NEXT: vmv1r.v v11, v8 -; ZIP-NEXT: vslideup.vx v11, v9, s0 -; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; ZIP-NEXT: vslidedown.vi v9, v8, 12 -; ZIP-NEXT: srli a0, s1, 2 -; ZIP-NEXT: add a1, a0, s0 -; ZIP-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; ZIP-NEXT: vslideup.vx v10, v9, a0 -; ZIP-NEXT: csrr a2, vlenb -; ZIP-NEXT: slli a2, a2, 1 -; ZIP-NEXT: add a2, sp, a2 -; ZIP-NEXT: addi a2, a2, 32 -; ZIP-NEXT: vs1r.v v10, (a2) # vscale x 8-byte Folded Spill -; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; ZIP-NEXT: vslidedown.vi v9, v8, 4 -; ZIP-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; ZIP-NEXT: vslideup.vx v11, v9, a0 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: add a0, sp, a0 -; ZIP-NEXT: addi a0, a0, 32 -; ZIP-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill -; ZIP-NEXT: li a1, 3 -; ZIP-NEXT: mv a0, s0 -; ZIP-NEXT: call __muldi3 -; ZIP-NEXT: add s0, a0, s0 -; ZIP-NEXT: addi a1, sp, 32 -; ZIP-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload -; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; ZIP-NEXT: vslidedown.vi v8, v8, 6 -; ZIP-NEXT: srli s1, s1, 1 -; 
ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: add a1, sp, a1 -; ZIP-NEXT: addi a1, a1, 32 -; ZIP-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload -; ZIP-NEXT: vsetvli zero, s0, e8, mf2, ta, ma -; ZIP-NEXT: vslideup.vx v9, v8, a0 -; ZIP-NEXT: add a0, s1, s1 -; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a1, a1, 1 -; ZIP-NEXT: add a1, sp, a1 -; ZIP-NEXT: addi a1, a1, 32 -; ZIP-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload -; ZIP-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; ZIP-NEXT: vslideup.vx v9, v8, s1 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a1, a0, 1 -; ZIP-NEXT: add a0, a1, a0 -; ZIP-NEXT: add a0, sp, a0 -; ZIP-NEXT: addi a0, a0, 32 -; ZIP-NEXT: vs1r.v v9, (a0) -; ZIP-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; ZIP-NEXT: vlseg7e8.v v8, (a0) -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a0, a0, 2 -; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 64 -; ZIP-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; ZIP-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; ZIP-NEXT: ld s1, 40(sp) # 8-byte Folded Reload -; ZIP-NEXT: .cfi_restore ra -; ZIP-NEXT: .cfi_restore s0 -; ZIP-NEXT: .cfi_restore s1 -; ZIP-NEXT: addi sp, sp, 64 -; ZIP-NEXT: .cfi_def_cfa_offset 0 -; ZIP-NEXT: ret +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave6_v2i16_v12i16(<12 x i16> %v) nounwind { +; CHECK-LABEL: vector_deinterleave6_v2i16_v12i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v14, v8, 6 +; CHECK-NEXT: vslidedown.vi v15, v8, 4 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 10 +; CHECK-NEXT: vslidedown.vi v12, v8, 8 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: add a3, a0, a0 +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v15, v14, a1 +; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslideup.vx v12, v10, a1 +; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v15, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v9, v12 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave6.v12i16(<12 x i16> %v) + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res +} + +define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) nounwind { +; CHECK-LABEL: vector_deinterleave7_v14i8_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 10 +; CHECK-NEXT: vslidedown.vi v10, v8, 8 +; CHECK-NEXT: vslidedown.vi v11, v8, 12 +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: vslidedown.vi v13, v8, 4 +; CHECK-NEXT: vslidedown.vi v14, v8, 6 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a2, a0, 2 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: add a3, a1, a1 +; CHECK-NEXT: add a4, a2, a1 +; CHECK-NEXT: vsetvli 
zero, a3, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v10, v9, a1 +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: slli a3, a1, 1 +; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v10, v11, a2 +; CHECK-NEXT: vslideup.vx v8, v13, a2 +; CHECK-NEXT: add a2, a0, a0 +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v14, a3 +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vlseg7e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave7.v14i8(<14 x i8> %v) ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res } +define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) nounwind { +; CHECK-LABEL: vector_deinterleave8_v16i8_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 10 +; CHECK-NEXT: vslidedown.vi v10, v8, 8 +; CHECK-NEXT: vslidedown.vi v11, v8, 12 +; CHECK-NEXT: vslidedown.vi v12, v8, 14 +; CHECK-NEXT: vslidedown.vi v13, v8, 2 +; CHECK-NEXT: vslidedown.vi v14, v8, 4 +; CHECK-NEXT: vslidedown.vi v15, v8, 6 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a2, a0, 2 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: add a3, a1, a1 +; CHECK-NEXT: add a4, a2, a1 +; CHECK-NEXT: slli a5, a1, 1 +; CHECK-NEXT: add a6, a0, a0 +; CHECK-NEXT: vsetvli zero, a3, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v10, v9, a1 +; CHECK-NEXT: add a5, a5, a1 +; CHECK-NEXT: vslideup.vx v8, v13, a1 +; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v10, v11, a2 +; CHECK-NEXT: add a1, a5, a1 +; CHECK-NEXT: vslideup.vx v8, v14, a2 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v10, v12, a5 +; CHECK-NEXT: vslideup.vx v8, v15, a5 +; CHECK-NEXT: vsetvli zero, a6, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma +; CHECK-NEXT: vlseg8e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave8.v16i8(<16 x i8> %v) + ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res +} ; Floats @@ -695,8 +571,8 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double ret {<4 x double>, <4 x double>} %retval } -define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32(<6 x float> %v) { -; CHECK-LABEL: vector_deinterleave3_v632_v2f32: +define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v6f32_v2f32(<6 x float> %v) { +; CHECK-LABEL: vector_deinterleave3_v6f32_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 @@ -729,6 +605,41 @@ define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32( ret {<2 x float>, <2 
x float>, <2 x float>} %res } +define {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @vector_deinterleave4_v8f32_v2f32(<8 x float> %v) { +; CHECK-LABEL: vector_deinterleave4_v8f32_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 6 +; CHECK-NEXT: vslidedown.vi v12, v8, 4 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv.v.v v9, v12 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %res = call {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @llvm.vector.deinterleave4.v8f32(<8 x float> %v) + ret {<2 x float>, <2 x float>, <2 x float>, <2 x float>} %res +} define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_deinterleave5_v10f16_v2f16(<10 x half> %v) { ; CHECK-LABEL: vector_deinterleave5_v10f16_v2f16: @@ -771,6 +682,49 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} %res } +define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_deinterleave6_v12f16_v2f16(<12 x half> %v) { +; CHECK-LABEL: vector_deinterleave6_v12f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v14, v8, 6 +; CHECK-NEXT: vslidedown.vi v15, v8, 4 +; CHECK-NEXT: vslidedown.vi v16, v8, 2 +; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 10 +; CHECK-NEXT: vslidedown.vi v12, v8, 8 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: add a3, a0, a0 +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v15, v14, a1 +; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: vslideup.vx v12, v10, a1 +; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v15, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v9, v12 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %res = call {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} 
@llvm.vector.deinterleave6.v12f16(<12 x half> %v) + ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} %res +} + define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @vector_deinterleave7_v7f16_v1f16(<7 x half> %v) { ; CHECK-LABEL: vector_deinterleave7_v7f16_v1f16: ; CHECK: # %bb.0: @@ -817,3 +771,54 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, %res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave7.v7f16(<7 x half> %v) ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res } + +define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @vector_deinterleave8_v8f16_v1f16(<8 x half> %v) { +; CHECK-LABEL: vector_deinterleave8_v8f16_v1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 7 +; CHECK-NEXT: vslidedown.vi v11, v8, 6 +; CHECK-NEXT: vslidedown.vi v12, v8, 5 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: add a3, a0, a0 +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v11, v10, a1 +; CHECK-NEXT: vslideup.vx v9, v12, a1 +; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v11, a0 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 3 +; CHECK-NEXT: vslidedown.vi v11, v8, 2 +; CHECK-NEXT: vslidedown.vi v12, v8, 1 +; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v11, v10, a1 +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vlseg8e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave8.v8f16(<8 x half> %v) + ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index e316c022727ab..6a08f5a28a295 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -468,6 +468,131 @@ define {, , } @vector_dein ret {, , } %retval } +define {, , , } @vector_deinterleave_nxv16i1_nxv64i1( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv64i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v16, v10, 1, v0 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: srli a1, a0, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v18, v10, 1, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a1 +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v20, v10, 1, v0 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v22, v10, 1, v0 +; CHECK-NEXT: vs8r.v v16, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmsne.vi v9, v12, 0 +; CHECK-NEXT: vmsne.vi v10, v14, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , } @llvm.vector.deinterleave4.nxv64i1( %vec) + ret {, , , } %retval +} + +define {, , , } @vector_deinterleave_nxv16i8_nxv64i8( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , } @llvm.vector.deinterleave4.nxv48i8( %vec) + ret {, , , } %retval +} + +define {, , , } @vector_deinterleave_nxv8i16_nxv32i16( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vlseg4e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , } @llvm.vector.deinterleave4.nxv32i16( %vec) + ret {, , , } %retval +} + +define {, , , } @vector_deinterleave_nxv4i32_nxv16i32( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; 
CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , } @llvm.vector.deinterleave4.nxv16i32( %vec) + ret {, , , } %retval +} + +define {, , , } @vector_deinterleave_nxv2i64_nxv8i64( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-NEXT: vlseg4e64.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , } @llvm.vector.deinterleave4.nxv8i64( %vec) + ret {, , , } %retval +} + define {, , , , } @vector_deinterleave_nxv16i1_nxv80i1( %vec) nounwind { ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv80i1: ; CHECK: # %bb.0: @@ -700,6 +825,240 @@ define {, , , , , , , } %retval } +define {, , , , , } @vector_deinterleave_nxv16i1_nxv96i1( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv96i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vmerge.vim v16, v10, 1, v0 +; CHECK-NEXT: srli a2, a0, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: srli a3, a0, 1 +; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v18, v10, 1, v0 +; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v9, a3 +; CHECK-NEXT: srli a3, a0, 3 +; CHECK-NEXT: slli a3, a3, 1 +; CHECK-NEXT: sub a0, a0, a3 +; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v20, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v26, v10, 1, v0 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v9, a0 +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v24, v10, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v28, v10, 1, v0 +; CHECK-NEXT: vs8r.v v24, (a0) +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg6e8.v v16, (a1) +; CHECK-NEXT: vlseg6e8.v v10, (a0) +; CHECK-NEXT: vmv2r.v v8, v16 +; CHECK-NEXT: vmv2r.v v22, v18 +; CHECK-NEXT: vmv2r.v v24, v20 +; CHECK-NEXT: vmv1r.v v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmv1r.v v10, v17 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmv1r.v v23, v12 +; CHECK-NEXT: vmsne.vi v9, v22, 0 +; CHECK-NEXT: vmv1r.v v12, v19 +; CHECK-NEXT: vmsne.vi v10, v12, 0 +; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vmsne.vi v11, v24, 0 +; CHECK-NEXT: vmv1r.v v14, v21 +; CHECK-NEXT: vmsne.vi v12, v14, 0 +; CHECK-NEXT: csrr a0, vlenb 
+; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , } @llvm.vector.deinterleave6.nxv96i1( %vec) + ret {, , , , , } %retval +} + +define {, , , , , } @vector_deinterleave_nxv16i8_nxv96i8( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv96i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv2r.v v26, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg6e8.v v24, (a0) +; CHECK-NEXT: vlseg6e8.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v24 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmv2r.v v12, v26 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv2r.v v16, v28 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv1r.v v22, v29 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vmv2r.v v18, v22 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , } @llvm.vector.deinterleave6.nxv96i8( %vec) + ret {, , , , , } %retval +} + +define {, , , , , } @vector_deinterleave_nxv8i16_nxv48i16( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv48i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv2r.v v26, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg6e16.v v24, (a0) +; CHECK-NEXT: vlseg6e16.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v24 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmv2r.v v12, v26 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv2r.v v16, v28 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv1r.v v22, v29 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vmv2r.v v18, v22 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , } @llvm.vector.deinterleave6.nxv48i16( %vec) + ret {, , , , , } %retval +} + +define {, , , , , } @vector_deinterleave_nxv4i32_nxv24i32( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv24i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv2r.v v26, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg6e32.v v24, (a0) +; 
CHECK-NEXT: vlseg6e32.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v24 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmv2r.v v12, v26 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv2r.v v16, v28 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv1r.v v22, v29 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vmv2r.v v18, v22 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , } @llvm.vector.deinterleave6.nxv24i32( %vec) + ret {, , , , , } %retval +} + +define {, , , , , } @vector_deinterleave_nxv2i64_nxv12i64( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv12i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv2r.v v26, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg6e64.v v24, (a0) +; CHECK-NEXT: vlseg6e64.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v24 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmv2r.v v12, v26 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv2r.v v16, v28 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv1r.v v22, v29 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vmv2r.v v18, v22 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , } @llvm.vector.deinterleave6.nxv12i64( %vec) + ret {, , , , , } %retval +} + define {, , , , , , } @vector_deinterleave_nxv16i1_nxv112i1( %vec) nounwind { ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv112i1: ; CHECK: # %bb.0: @@ -971,26 +1330,277 @@ define {, , , , , , , , , } %retval } -; Floats - -define {, } @vector_deinterleave_nxv2bf16_nxv4bf16( %vec) { -; V-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; V-NEXT: vnsrl.wi v10, v8, 0 -; V-NEXT: vnsrl.wi v9, v8, 16 -; V-NEXT: vmv1r.v v8, v10 -; V-NEXT: ret -; -; ZIP-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9 -; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v11 -; ZIP-NEXT: vmv.v.v v8, v10 -; ZIP-NEXT: ret -%retval = call {, } @llvm.vector.deinterleave2.nxv4bf16( %vec) -ret {, } %retval +define {, , , , , , , } @vector_deinterleave_nxv16i1_nxv128i1( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv128i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vmerge.vim v16, v10, 1, v0 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: srli a2, a0, 1 +; CHECK-NEXT: srli a3, a0, 3 +; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: slli a3, a3, 1 +; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma +; CHECK-NEXT: 
vmerge.vim v18, v10, 1, v0 +; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v9, a2 +; CHECK-NEXT: sub a0, a0, a3 +; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v20, v10, 1, v0 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v9, a0 +; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v22, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v24, v10, 1, v0 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a1 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v26, v10, 1, v0 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a2 +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v28, v10, 1, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v8, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; CHECK-NEXT: vmerge.vim v30, v10, 1, v0 +; CHECK-NEXT: vs8r.v v24, (a0) +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg8e8.v v18, (a1) +; CHECK-NEXT: vlseg8e8.v v10, (a0) +; CHECK-NEXT: vmv2r.v v8, v18 +; CHECK-NEXT: vmv2r.v v26, v20 +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: vmv2r.v v30, v24 +; CHECK-NEXT: vmv1r.v v9, v10 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmv1r.v v10, v19 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmv1r.v v27, v12 +; CHECK-NEXT: vmsne.vi v9, v26, 0 +; CHECK-NEXT: vmv1r.v v12, v21 +; CHECK-NEXT: vmsne.vi v10, v12, 0 +; CHECK-NEXT: vmv1r.v v29, v14 +; CHECK-NEXT: vmsne.vi v11, v28, 0 +; CHECK-NEXT: vmv1r.v v14, v23 +; CHECK-NEXT: vmsne.vi v12, v14, 0 +; CHECK-NEXT: vmv1r.v v31, v16 +; CHECK-NEXT: vmsne.vi v13, v30, 0 +; CHECK-NEXT: vmv1r.v v16, v25 +; CHECK-NEXT: vmsne.vi v14, v16, 0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , , , } @llvm.vector.deinterleave8.nxv128i1( %vec) + ret {, , , , , , , } %retval +} + +define {, , , , , , , } @vector_deinterleave_nxv16i8_nxv128i8( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg8e8.v v0, (a0) +; CHECK-NEXT: vlseg8e8.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v0 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v1 +; CHECK-NEXT: vmv2r.v v12, v2 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v3 +; CHECK-NEXT: vmv2r.v v16, v4 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v5 +; CHECK-NEXT: vmv2r.v v20, v6 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv1r.v v28, v7 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: vmv2r.v v22, v28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 
+; CHECK-NEXT: ret + %retval = call {, , , , , , , } @llvm.vector.deinterleave8.nxv128i8( %vec) + ret {, , , , , , , } %retval +} + +define {, , , , , , , } @vector_deinterleave_nxv8i16_nxv64i16( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg8e16.v v0, (a0) +; CHECK-NEXT: vlseg8e16.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v0 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v1 +; CHECK-NEXT: vmv2r.v v12, v2 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v3 +; CHECK-NEXT: vmv2r.v v16, v4 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v5 +; CHECK-NEXT: vmv2r.v v20, v6 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv1r.v v28, v7 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: vmv2r.v v22, v28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , , , } @llvm.vector.deinterleave8.nxv64i16( %vec) + ret {, , , , , , , } %retval +} + +define {, , , , , , , } @vector_deinterleave_nxv4i32_nxv32i32( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vlseg8e32.v v0, (a0) +; CHECK-NEXT: vlseg8e32.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v0 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v1 +; CHECK-NEXT: vmv2r.v v12, v2 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v3 +; CHECK-NEXT: vmv2r.v v16, v4 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v5 +; CHECK-NEXT: vmv2r.v v20, v6 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv1r.v v28, v7 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: vmv2r.v v22, v28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , , , } @llvm.vector.deinterleave8.nxv32i32( %vec) + ret {, , , , , , , } %retval +} + +define {, , , , , , , } @vector_deinterleave_nxv2i64_nxv16i64( %vec) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; CHECK-NEXT: vlseg8e64.v v0, (a0) +; CHECK-NEXT: vlseg8e64.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v0 +; CHECK-NEXT: vmv1r.v v9, v22 +; 
CHECK-NEXT: vmv1r.v v22, v1 +; CHECK-NEXT: vmv2r.v v12, v2 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v3 +; CHECK-NEXT: vmv2r.v v16, v4 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v5 +; CHECK-NEXT: vmv2r.v v20, v6 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv1r.v v28, v7 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: vmv2r.v v22, v28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %retval = call {, , , , , , , } @llvm.vector.deinterleave8.nxv16i64( %vec) + ret {, , , , , , , } %retval +} + +; Floats + +define {, } @vector_deinterleave_nxv2bf16_nxv4bf16( %vec) { +; V-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16: +; V: # %bb.0: +; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; V-NEXT: vnsrl.wi v10, v8, 0 +; V-NEXT: vnsrl.wi v9, v8, 16 +; V-NEXT: vmv1r.v v8, v10 +; V-NEXT: ret +; +; ZIP-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9 +; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v11 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret +%retval = call {, } @llvm.vector.deinterleave2.nxv4bf16( %vec) +ret {, } %retval } define {, } @vector_deinterleave_nxv2f16_nxv4f16( %vec) { @@ -1550,35 +2160,48 @@ define {, , } @ve ret {, , } %res } -define {, , , , } @vector_deinterleave_nxv2f16_nxv10f16( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv10f16: +define {, , , } @vector_deinterleave_nxv2f16_nxv8f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v11, v9, a0 -; CHECK-NEXT: vslideup.vx v9, v11, a0 -; CHECK-NEXT: vslidedown.vx v11, v8, a0 -; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , } @llvm.vector.deinterleave4.nxv8f16( %arg) + ret {, , , } %res +} + +define {, , , } @vector_deinterleave_nxv4f16_nxv16f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs4r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret - %res = call {, , , , } @llvm.vector.deinterleave5.nxv10f16( %arg) - ret {, , , , } %res + %res = call {, , , } @llvm.vector.deinterleave4.nxv16f16( %arg) + ret {, , , } %res } -define {, , , , } @vector_deinterleave_nxv4f16_nxv20f16( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv20f16: +define {, , , } @vector_deinterleave_nxv8f16_nxv32f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv32f16: ; CHECK: # %bb.0: ; 
CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -1586,86 +2209,59 @@ define {, , , , , , , } @llvm.vector.deinterleave5.nxv20f16( %arg) - ret {, , , , } %res + %res = call {, , , } @llvm.vector.deinterleave4.nxv32f16( %arg) + ret {, , , } %res } -define {, , , , } @vector_deinterleave_nxv8f16_nxv40f16( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv40f16: +define {, , , } @vector_deinterleave_nxv2bf16_nxv8bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv8bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vmv1r.v v26, v15 -; CHECK-NEXT: vmv1r.v v27, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v24, v13 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vmv1r.v v25, v14 -; CHECK-NEXT: vs8r.v v8, (a0) -; CHECK-NEXT: vmv1r.v v28, v17 -; CHECK-NEXT: vs8r.v v24, (a1) -; CHECK-NEXT: vlseg5e16.v v12, (a0) -; CHECK-NEXT: vlseg5e16.v v18, (a1) -; CHECK-NEXT: vmv2r.v v8, v12 -; CHECK-NEXT: vmv1r.v v9, v18 -; CHECK-NEXT: vmv1r.v v18, v13 -; CHECK-NEXT: vmv2r.v v12, v14 -; CHECK-NEXT: vmv1r.v v13, v20 -; CHECK-NEXT: vmv1r.v v20, v15 -; CHECK-NEXT: vmv1r.v v17, v22 -; CHECK-NEXT: vmv2r.v v10, v18 -; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret - %res = call {, , , , } @llvm.vector.deinterleave5.nxv40f16( %arg) - ret {, , , , } %res + %res = call {, , , } @llvm.vector.deinterleave4.nxv8bf16( %arg) + ret {, , , } %res } -define {, , , , } @vector_deinterleave_nxv2bf16_nxv10bf16( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv10bf16: +define {, , , } @vector_deinterleave_nxv4bf16_nxv16bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv16bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 2 -; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v11, v9, a0 -; CHECK-NEXT: vslideup.vx v9, v11, a0 -; CHECK-NEXT: vslidedown.vx v11, v8, a0 -; CHECK-NEXT: vslideup.vx v8, v11, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs4r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 2 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret - %res = call {, , , , } @llvm.vector.deinterleave5.nxv10bf16( %arg) - ret {, , , , } %res + %res = call {, , , } @llvm.vector.deinterleave4.nxv16bf16( %arg) + ret {, , , } %res } -define {, , , , } @vector_deinterleave_nxv4bf16_nxv20bf16( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv20bf16: +define {, , , } @vector_deinterleave_nxv8bf16_nxv32bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv32bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -1673,7 +2269,241 @@ define {, , , , , , } 
@llvm.vector.deinterleave4.nxv32bf16( %arg) + ret {, , , } %res +} + +define {, , , } @vector_deinterleave_nxv1f32_nxv4f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , } @llvm.vector.deinterleave4.nxv4f32( %arg) + ret {, , , } %res +} + +define {, , , } @vector_deinterleave_nxv2f32_nxv8f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , } @llvm.vector.deinterleave4.nxv6f32( %arg) + ret {, , , } %res +} + +define {, , , } @vector_deinterleave_nxv4f32_nxv16f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , } @llvm.vector.deinterleave4.nxv16f32( %arg) + ret {, , , } %res +} + +define {, , , } @vector_deinterleave_nxv1f64_nxv4f64( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vlseg4e64.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , } @llvm.vector.deinterleave4.nxv4f64( %arg) + ret {, , , } %res +} + +define {, , , } @vector_deinterleave_nxv2f64_nxv8f64( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-NEXT: vlseg4e64.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , } @llvm.vector.deinterleave4.nxv8f64( %arg) + ret {, , , } %res +} + +define {, , , , } @vector_deinterleave_nxv2f16_nxv10f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv10f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, 
a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v9, a0 +; CHECK-NEXT: vslideup.vx v9, v11, a0 +; CHECK-NEXT: vslidedown.vx v11, v8, a0 +; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv10f16( %arg) + ret {, , , , } %res +} + +define {, , , , } @vector_deinterleave_nxv4f16_nxv20f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv20f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv20f16( %arg) + ret {, , , , } %res +} + +define {, , , , } @vector_deinterleave_nxv8f16_nxv40f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv40f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v26, v15 +; CHECK-NEXT: vmv1r.v v27, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg5e16.v v12, (a0) +; CHECK-NEXT: vlseg5e16.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v13 +; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv40f16( %arg) + ret {, , , , } %res +} + +define {, , , , } @vector_deinterleave_nxv2bf16_nxv10bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv10bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v9, a0 +; CHECK-NEXT: vslideup.vx v9, v11, a0 +; CHECK-NEXT: vslidedown.vx v11, v8, a0 +; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vlseg5e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv10bf16( %arg) + ret {, , , , } %res +} + +define {, , , , } 
@vector_deinterleave_nxv4bf16_nxv20bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv20bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -1720,12 +2550,351 @@ define {, , , , , , , } @llvm.vector.deinterleave5.nxv40bf16( %arg) - ret {, , , , } %res + %res = call {, , , , } @llvm.vector.deinterleave5.nxv40bf16( %arg) + ret {, , , , } %res +} + +define {, , , , } @vector_deinterleave_nxv1f32_nxv5f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv5f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v9, a0 +; CHECK-NEXT: vslideup.vx v9, v11, a0 +; CHECK-NEXT: vslidedown.vx v11, v8, a0 +; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv5f32( %arg) + ret {, , , , } %res +} + +define {, , , , } @vector_deinterleave_nxv2f32_nxv10f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv10f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv10f32( %arg) + ret {, , , , } %res +} + +define {, , , , } @vector_deinterleave_nxv4f32_nxv20f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv20f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v26, v15 +; CHECK-NEXT: vmv1r.v v27, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg5e32.v v12, (a0) +; CHECK-NEXT: vlseg5e32.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v13 +; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv20f32( %arg) + ret {, , , , } %res +} + +define {, , , , } 
@vector_deinterleave_nxv1f64_nxv5f64( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv5f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vlseg5e64.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv5f64( %arg) + ret {, , , , } %res +} + +define {, , , , } @vector_deinterleave_nxv2f64_nxv10f64( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv10f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v26, v15 +; CHECK-NEXT: vmv1r.v v27, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg5e64.v v12, (a0) +; CHECK-NEXT: vlseg5e64.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v13 +; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , } @llvm.vector.deinterleave5.nxv10f64( %arg) + ret {, , , , } %res +} + +define {, , , , , } @vector_deinterleave_nxv2f16_nxv12f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv12f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v9, a0 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v11, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v10, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v10, v11, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv12f16( %arg) + ret {, , , , , } %res +} + +define {, , , , , } @vector_deinterleave_nxv4f16_nxv24f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv24f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: 
vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv24f16( %arg) + ret {, , , , , } %res +} + +define {, , , , , } @vector_deinterleave_nxv8f16_nxv48f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv48f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv2r.v v26, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg6e16.v v24, (a0) +; CHECK-NEXT: vlseg6e16.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v24 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmv2r.v v12, v26 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv2r.v v16, v28 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv1r.v v22, v29 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vmv2r.v v18, v22 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv48f16( %arg) + ret {, , , , , } %res +} + +define {, , , , , } @vector_deinterleave_nxv2bf16_nxv12bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv12bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v9, a0 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v11, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v11, a0 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v10, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v10, v11, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv12bf16( %arg) + ret {, , , , , } %res +} + +define {, , , , , } @vector_deinterleave_nxv4bf16_nxv24bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv24bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv24bf16( 
%arg) + ret {, , , , , } %res +} + +define {, , , , , } @vector_deinterleave_nxv8bf16_nxv48bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv48bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vmv2r.v v26, v16 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: vlseg6e16.v v24, (a0) +; CHECK-NEXT: vlseg6e16.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v24 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmv2r.v v12, v26 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv2r.v v16, v28 +; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv1r.v v22, v29 +; CHECK-NEXT: vmv2r.v v10, v18 +; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vmv2r.v v18, v22 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv48bf16( %arg) + ret {, , , , , } %res } -define {, , , , } @vector_deinterleave_nxv1f32_nxv5f32( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv5f32: +define {, , , , , } @vector_deinterleave_nxv1f32_nxv6f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv6f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -1735,24 +2904,32 @@ define {, , , , , , , } @llvm.vector.deinterleave5.nxv5f32( %arg) - ret {, , , , } %res + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv6f32( %arg) + ret {, , , , , } %res } -define {, , , , } @vector_deinterleave_nxv2f32_nxv10f32( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv10f32: +define {, , , , , } @vector_deinterleave_nxv2f32_nxv12f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv12f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -1761,58 +2938,59 @@ define {, , , , , , , } @llvm.vector.deinterleave5.nxv10f32( %arg) - ret {, , , , } %res + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv12f32( %arg) + ret {, , , , , } %res } -define {, , , , } @vector_deinterleave_nxv4f32_nxv20f32( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv20f32: +define {, , , , , } @vector_deinterleave_nxv4f32_nxv24f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv24f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; CHECK-NEXT: vmv1r.v v26, v15 -; CHECK-NEXT: vmv1r.v v27, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v18 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vmv2r.v v24, v14 ; CHECK-NEXT: vs8r.v v8, (a0) -; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vmv2r.v v26, v16 ; CHECK-NEXT: vs8r.v v24, (a1) -; CHECK-NEXT: vlseg5e32.v v12, (a0) -; CHECK-NEXT: vlseg5e32.v v18, (a1) -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vlseg6e32.v v24, (a0) +; 
CHECK-NEXT: vlseg6e32.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v24 ; CHECK-NEXT: vmv1r.v v9, v18 -; CHECK-NEXT: vmv1r.v v18, v13 -; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmv2r.v v12, v26 ; CHECK-NEXT: vmv1r.v v13, v20 -; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv2r.v v16, v28 ; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv1r.v v22, v29 ; CHECK-NEXT: vmv2r.v v10, v18 ; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vmv2r.v v18, v22 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret - %res = call {, , , , } @llvm.vector.deinterleave5.nxv20f32( %arg) - ret {, , , , } %res + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv24f32( %arg) + ret {, , , , , } %res } -define {, , , , } @vector_deinterleave_nxv1f64_nxv5f64( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv5f64: +define {, , , , , } @vector_deinterleave_nxv1f64_nxv6f64( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv6f64: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -1821,54 +2999,55 @@ define {, , , , , , , } @llvm.vector.deinterleave5.nxv5f64( %arg) - ret {, , , , } %res + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv6f64( %arg) + ret {, , , , , } %res } -define {, , , , } @vector_deinterleave_nxv2f64_nxv10f64( %arg) nounwind { -; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv10f64: +define {, , , , , } @vector_deinterleave_nxv2f64_nxv12f64( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv12f64: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vmv1r.v v26, v15 -; CHECK-NEXT: vmv1r.v v27, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v18 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vmv1r.v v25, v14 +; CHECK-NEXT: vmv2r.v v24, v14 ; CHECK-NEXT: vs8r.v v8, (a0) -; CHECK-NEXT: vmv1r.v v28, v17 +; CHECK-NEXT: vmv2r.v v26, v16 ; CHECK-NEXT: vs8r.v v24, (a1) -; CHECK-NEXT: vlseg5e64.v v12, (a0) -; CHECK-NEXT: vlseg5e64.v v18, (a1) -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vlseg6e64.v v24, (a0) +; CHECK-NEXT: vlseg6e64.v v18, (a1) +; CHECK-NEXT: vmv2r.v v8, v24 ; CHECK-NEXT: vmv1r.v v9, v18 -; CHECK-NEXT: vmv1r.v v18, v13 -; CHECK-NEXT: vmv2r.v v12, v14 +; CHECK-NEXT: vmv1r.v v18, v25 +; CHECK-NEXT: vmv2r.v v12, v26 ; CHECK-NEXT: vmv1r.v v13, v20 -; CHECK-NEXT: vmv1r.v v20, v15 +; CHECK-NEXT: vmv1r.v v20, v27 +; CHECK-NEXT: vmv2r.v v16, v28 ; CHECK-NEXT: vmv1r.v v17, v22 +; CHECK-NEXT: vmv1r.v v22, v29 ; CHECK-NEXT: vmv2r.v v10, v18 ; CHECK-NEXT: vmv2r.v v14, v20 +; CHECK-NEXT: vmv2r.v v18, v22 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret - %res = call {, , , , } @llvm.vector.deinterleave5.nxv10f64( %arg) - ret {, , , , } %res + %res = call {, , , , , } @llvm.vector.deinterleave6.nxv12f64( %arg) + ret {, , , , , } %res } define {, , , , , , } @vector_deinterleave_nxv2f16_nxv14f16( %arg) nounwind { @@ -2221,3 +3400,311 @@ define {, , , , , , , , , } @llvm.vector.deinterleave7.nxv14f64( %arg) ret {, , , , , , } %res } + +define {, , , , , , , } 
@vector_deinterleave_nxv2f16_nxv16f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vlseg8e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv16f16( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv4f16_nxv32f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg8e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv32f16( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv8f16_nxv64f16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv64f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg8e16.v v0, (a0) +; CHECK-NEXT: vlseg8e16.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v0 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v1 +; CHECK-NEXT: vmv2r.v v12, v2 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v3 +; CHECK-NEXT: vmv2r.v v16, v4 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v5 +; CHECK-NEXT: vmv2r.v v20, v6 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv1r.v v28, v7 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: vmv2r.v v22, v28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv64f16( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv2bf16_nxv16bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vlseg8e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv16bf16( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv4bf16_nxv32bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; 
CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg8e16.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv32bf16( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv8bf16_nxv64bf16( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv64bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vlseg8e16.v v0, (a0) +; CHECK-NEXT: vlseg8e16.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v0 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v1 +; CHECK-NEXT: vmv2r.v v12, v2 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v3 +; CHECK-NEXT: vmv2r.v v16, v4 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v5 +; CHECK-NEXT: vmv2r.v v20, v6 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv1r.v v28, v7 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: vmv2r.v v22, v28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv64bf16( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv1f32_nxv8f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma +; CHECK-NEXT: vlseg8e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv8f32( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv2f32_nxv16f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vlseg8e32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv16f32( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv4f32_nxv32f32( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv32f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 
+; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vlseg8e32.v v0, (a0) +; CHECK-NEXT: vlseg8e32.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v0 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v1 +; CHECK-NEXT: vmv2r.v v12, v2 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v3 +; CHECK-NEXT: vmv2r.v v16, v4 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v5 +; CHECK-NEXT: vmv2r.v v20, v6 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv1r.v v28, v7 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: vmv2r.v v22, v28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv32f32( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv1f64_nxv8f64( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; CHECK-NEXT: vlseg8e64.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv8f64( %arg) + ret {, , , , , , , } %res +} + +define {, , , , , , , } @vector_deinterleave_nxv2f64_nxv16f64( %arg) nounwind { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; CHECK-NEXT: vlseg8e64.v v0, (a0) +; CHECK-NEXT: vlseg8e64.v v22, (a1) +; CHECK-NEXT: vmv2r.v v8, v0 +; CHECK-NEXT: vmv1r.v v9, v22 +; CHECK-NEXT: vmv1r.v v22, v1 +; CHECK-NEXT: vmv2r.v v12, v2 +; CHECK-NEXT: vmv1r.v v13, v24 +; CHECK-NEXT: vmv1r.v v24, v3 +; CHECK-NEXT: vmv2r.v v16, v4 +; CHECK-NEXT: vmv1r.v v17, v26 +; CHECK-NEXT: vmv1r.v v26, v5 +; CHECK-NEXT: vmv2r.v v20, v6 +; CHECK-NEXT: vmv1r.v v21, v28 +; CHECK-NEXT: vmv1r.v v28, v7 +; CHECK-NEXT: vmv2r.v v10, v22 +; CHECK-NEXT: vmv2r.v v14, v24 +; CHECK-NEXT: vmv2r.v v18, v26 +; CHECK-NEXT: vmv2r.v v22, v28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %res = call {, , , , , , , } @llvm.vector.deinterleave8.nxv16f64( %arg) + ret {, , , , , , , } %res +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll index 279779dc49667..3dc83d50ee3f3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -167,15 +167,13 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { ret <4 x i64> %res } -define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +define <6 x i32> 
@vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind { ; CHECK-LABEL: vector_interleave3_v6i32_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a1, a1, 1 @@ -193,19 +191,15 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave3_v6i32_v2i32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: srli a1, a1, 1 @@ -223,19 +217,15 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 ; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 ; ZVBB-NEXT: ret ; ; ZIP-LABEL: vector_interleave3_v6i32_v2i32: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -16 -; ZIP-NEXT: .cfi_def_cfa_offset 16 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb ; ZIP-NEXT: srli a1, a1, 1 @@ -253,24 +243,110 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 16 ; ZIP-NEXT: addi sp, sp, 16 -; ZIP-NEXT: .cfi_def_cfa_offset 0 ; ZIP-NEXT: ret %res = call <6 x i32> @llvm.vector.interleave3.v6i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) ret <6 x i32> %res } +define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) nounwind { +; CHECK-LABEL: vector_interleave4_v8i32_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vle32.v v10, (a3) +; CHECK-NEXT: vle32.v v9, (a2) +; CHECK-NEXT: vle32.v v11, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v10, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave4_v8i32_v2i32: +; ZVBB: # %bb.0: +; 
ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg4e32.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vle32.v v10, (a3) +; ZVBB-NEXT: vle32.v v9, (a2) +; ZVBB-NEXT: vle32.v v11, (a1) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v10, v11, 2 +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 4 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave4_v8i32_v2i32: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: srli a1, a1, 1 +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; ZIP-NEXT: vsseg4e32.v v8, (a0) +; ZIP-NEXT: add a3, a2, a1 +; ZIP-NEXT: add a1, a3, a1 +; ZIP-NEXT: vle32.v v10, (a3) +; ZIP-NEXT: vle32.v v9, (a2) +; ZIP-NEXT: vle32.v v11, (a1) +; ZIP-NEXT: vle32.v v8, (a0) +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: vslideup.vi v10, v11, 2 +; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZIP-NEXT: vslideup.vi v8, v10, 4 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: ret + %res = call <8 x i32> @llvm.vector.interleave4.v8i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) + ret <8 x i32> %res +} -define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) { +define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) nounwind { ; CHECK-LABEL: vector_interleave5_v10i16_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a1, a1, 2 @@ -295,19 +371,15 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave5_v10i16_v2i16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: srli a1, a1, 2 @@ -332,19 +404,15 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 ; ZVBB-NEXT: addi 
sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 ; ZVBB-NEXT: ret ; ; ZIP-LABEL: vector_interleave5_v10i16_v2i16: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -16 -; ZIP-NEXT: .cfi_def_cfa_offset 16 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb ; ZIP-NEXT: srli a1, a1, 2 @@ -369,22 +437,130 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, < ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 16 ; ZIP-NEXT: addi sp, sp, 16 -; ZIP-NEXT: .cfi_def_cfa_offset 0 ; ZIP-NEXT: ret %res = call <10 x i16> @llvm.vector.interleave5.v10i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) ret <10 x i16> %res } -define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) { +define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f) nounwind { +; CHECK-LABEL: vector_interleave6_v12i16_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg6e16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: add a2, a3, a1 +; CHECK-NEXT: vle16.v v11, (a2) +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: vle16.v v12, (a3) +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v13, (a1) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v13, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave6_v12i16_v2i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg6e16.v v8, (a0) +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: add a2, a3, a1 +; ZVBB-NEXT: vle16.v v11, (a2) +; ZVBB-NEXT: add a2, a2, a1 +; ZVBB-NEXT: vle16.v v12, (a3) +; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle16.v v10, (a2) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vle16.v v13, (a1) +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v12, v11, 2 +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vslideup.vi v10, v13, 2 +; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v12, 4 +; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 8 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, 
sp, 16 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave6_v12i16_v2i16: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: srli a1, a1, 2 +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: add a3, a2, a1 +; ZIP-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg6e16.v v8, (a0) +; ZIP-NEXT: vle16.v v9, (a2) +; ZIP-NEXT: add a2, a3, a1 +; ZIP-NEXT: vle16.v v11, (a2) +; ZIP-NEXT: add a2, a2, a1 +; ZIP-NEXT: vle16.v v12, (a3) +; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: vle16.v v10, (a2) +; ZIP-NEXT: vle16.v v8, (a0) +; ZIP-NEXT: vle16.v v13, (a1) +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: vslideup.vi v12, v11, 2 +; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vslideup.vi v10, v13, 2 +; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vi v8, v12, 4 +; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZIP-NEXT: vslideup.vi v8, v10, 8 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: ret + %res = call <12 x i16> @llvm.vector.interleave6.v12i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f) + ret <12 x i16> %res +} + +define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) nounwind { ; CHECK-LABEL: vector_interleave7_v14i8_v2i8: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a1, a1, 3 @@ -415,18 +591,14 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; CHECK-NEXT: vslideup.vi v8, v12, 8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave7_v14i8_v2i8: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: srli a1, a1, 3 @@ -457,18 +629,14 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; ZVBB-NEXT: vslideup.vi v8, v12, 8 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 ; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 ; ZVBB-NEXT: ret ; ; ZIP-LABEL: vector_interleave7_v14i8_v2i8: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -16 -; ZIP-NEXT: .cfi_def_cfa_offset 16 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb ; ZIP-NEXT: srli a1, a1, 3 @@ -499,14 +667,138 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i ; ZIP-NEXT: vslideup.vi v8, v12, 8 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 16 ; 
ZIP-NEXT: addi sp, sp, 16 -; ZIP-NEXT: .cfi_def_cfa_offset 0 ; ZIP-NEXT: ret %res = call <14 x i8> @llvm.vector.interleave7.v14i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) ret <14 x i8> %res } +define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h) nounwind { +; CHECK-LABEL: vector_interleave8_v16i8_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 3 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: add a5, a4, a1 +; CHECK-NEXT: add a6, a5, a1 +; CHECK-NEXT: vsetvli a7, zero, e8, mf8, ta, ma +; CHECK-NEXT: vsseg8e8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a6) +; CHECK-NEXT: add a6, a6, a1 +; CHECK-NEXT: vle8.v v10, (a5) +; CHECK-NEXT: vle8.v v11, (a6) +; CHECK-NEXT: add a1, a6, a1 +; CHECK-NEXT: vle8.v v12, (a2) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v13, (a3) +; CHECK-NEXT: vle8.v v14, (a4) +; CHECK-NEXT: vle8.v v15, (a1) +; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: vslideup.vi v8, v12, 2 +; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v11, 4 +; CHECK-NEXT: vslideup.vi v8, v13, 4 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v15, 6 +; CHECK-NEXT: vslideup.vi v8, v14, 6 +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave8_v16i8_v2i8: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 3 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: add a5, a4, a1 +; ZVBB-NEXT: add a6, a5, a1 +; ZVBB-NEXT: vsetvli a7, zero, e8, mf8, ta, ma +; ZVBB-NEXT: vsseg8e8.v v8, (a0) +; ZVBB-NEXT: vle8.v v9, (a6) +; ZVBB-NEXT: add a6, a6, a1 +; ZVBB-NEXT: vle8.v v10, (a5) +; ZVBB-NEXT: vle8.v v11, (a6) +; ZVBB-NEXT: add a1, a6, a1 +; ZVBB-NEXT: vle8.v v12, (a2) +; ZVBB-NEXT: vle8.v v8, (a0) +; ZVBB-NEXT: vle8.v v13, (a3) +; ZVBB-NEXT: vle8.v v14, (a4) +; ZVBB-NEXT: vle8.v v15, (a1) +; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v10, v9, 2 +; ZVBB-NEXT: vslideup.vi v8, v12, 2 +; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v10, v11, 4 +; ZVBB-NEXT: vslideup.vi v8, v13, 4 +; ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v10, v15, 6 +; ZVBB-NEXT: vslideup.vi v8, v14, 6 +; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 8 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave8_v16i8_v2i8: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: srli a1, a1, 3 +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: add a3, a2, a1 +; ZIP-NEXT: add a4, a3, a1 +; ZIP-NEXT: add a5, a4, a1 +; ZIP-NEXT: add a6, a5, a1 +; ZIP-NEXT: vsetvli a7, zero, e8, mf8, ta, ma +; ZIP-NEXT: 
vsseg8e8.v v8, (a0) +; ZIP-NEXT: vle8.v v9, (a6) +; ZIP-NEXT: add a6, a6, a1 +; ZIP-NEXT: vle8.v v10, (a5) +; ZIP-NEXT: vle8.v v11, (a6) +; ZIP-NEXT: add a1, a6, a1 +; ZIP-NEXT: vle8.v v12, (a2) +; ZIP-NEXT: vle8.v v8, (a0) +; ZIP-NEXT: vle8.v v13, (a3) +; ZIP-NEXT: vle8.v v14, (a4) +; ZIP-NEXT: vle8.v v15, (a1) +; ZIP-NEXT: vsetivli zero, 4, e8, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v10, v9, 2 +; ZIP-NEXT: vslideup.vi v8, v12, 2 +; ZIP-NEXT: vsetivli zero, 6, e8, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v10, v11, 4 +; ZIP-NEXT: vslideup.vi v8, v13, 4 +; ZIP-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; ZIP-NEXT: vslideup.vi v10, v15, 6 +; ZIP-NEXT: vslideup.vi v8, v14, 6 +; ZIP-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; ZIP-NEXT: vslideup.vi v8, v10, 8 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: ret + %res = call <16 x i8> @llvm.vector.interleave8.v16i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h) + ret <16 x i8> %res +} ; Floats @@ -700,15 +992,13 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> ret <4 x double> %res } -define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; CHECK-LABEL: vector_interleave3_v632_v2f32: +define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind { +; CHECK-LABEL: vector_interleave3_v6f32_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a1, a1, 1 @@ -726,19 +1016,15 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave3_v632_v2f32: +; ZVBB-LABEL: vector_interleave3_v6f32_v2f32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: srli a1, a1, 1 @@ -756,19 +1042,15 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 ; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 ; ZVBB-NEXT: ret ; -; ZIP-LABEL: vector_interleave3_v632_v2f32: +; ZIP-LABEL: vector_interleave3_v6f32_v2f32: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -16 -; ZIP-NEXT: .cfi_def_cfa_offset 16 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb ; ZIP-NEXT: srli a1, a1, 1 @@ -786,24 +1068,110 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x 
float> %b ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 16 ; ZIP-NEXT: addi sp, sp, 16 -; ZIP-NEXT: .cfi_def_cfa_offset 0 ; ZIP-NEXT: ret %res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) ret <6 x float> %res } +define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) nounwind { +; CHECK-LABEL: vector_interleave4_v8f32_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vle32.v v10, (a3) +; CHECK-NEXT: vle32.v v9, (a2) +; CHECK-NEXT: vle32.v v11, (a1) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v10, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave4_v8f32_v2f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg4e32.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vle32.v v10, (a3) +; ZVBB-NEXT: vle32.v v9, (a2) +; ZVBB-NEXT: vle32.v v11, (a1) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v10, v11, 2 +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 4 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave4_v8f32_v2f32: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: srli a1, a1, 1 +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; ZIP-NEXT: vsseg4e32.v v8, (a0) +; ZIP-NEXT: add a3, a2, a1 +; ZIP-NEXT: add a1, a3, a1 +; ZIP-NEXT: vle32.v v10, (a3) +; ZIP-NEXT: vle32.v v9, (a2) +; ZIP-NEXT: vle32.v v11, (a1) +; ZIP-NEXT: vle32.v v8, (a0) +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: vslideup.vi v10, v11, 2 +; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZIP-NEXT: vslideup.vi v8, v10, 4 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: ret + %res = call <8 x float> @llvm.vector.interleave4.v8f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) + ret <8 x float> %res +} -define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) { +define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 
x half> %c, <2 x half> %d, <2 x half> %e) nounwind { ; CHECK-LABEL: vector_interleave5_v10f16_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a1, a1, 2 @@ -828,19 +1196,15 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave5_v10f16_v2f16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: srli a1, a1, 2 @@ -865,19 +1229,15 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 ; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 ; ZVBB-NEXT: ret ; ; ZIP-LABEL: vector_interleave5_v10f16_v2f16: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -16 -; ZIP-NEXT: .cfi_def_cfa_offset 16 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb ; ZIP-NEXT: srli a1, a1, 2 @@ -902,23 +1262,131 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 16 ; ZIP-NEXT: addi sp, sp, 16 -; ZIP-NEXT: .cfi_def_cfa_offset 0 ; ZIP-NEXT: ret %res = call <10 x half> @llvm.vector.interleave5.v10f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) ret <10 x half> %res } -define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) { +define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f) nounwind { +; CHECK-LABEL: vector_interleave6_v12f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg6e16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: add a2, a3, a1 +; CHECK-NEXT: vle16.v v11, (a2) +; CHECK-NEXT: add a2, a2, a1 +; CHECK-NEXT: vle16.v v12, (a3) +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v13, (a1) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v11, 2 +; 
CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: vslideup.vi v10, v13, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave6_v12f16_v2f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg6e16.v v8, (a0) +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: add a2, a3, a1 +; ZVBB-NEXT: vle16.v v11, (a2) +; ZVBB-NEXT: add a2, a2, a1 +; ZVBB-NEXT: vle16.v v12, (a3) +; ZVBB-NEXT: add a1, a2, a1 +; ZVBB-NEXT: vle16.v v10, (a2) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vle16.v v13, (a1) +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v12, v11, 2 +; ZVBB-NEXT: vslideup.vi v8, v9, 2 +; ZVBB-NEXT: vslideup.vi v10, v13, 2 +; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v12, 4 +; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 8 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave6_v12f16_v2f16: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: srli a1, a1, 2 +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: add a3, a2, a1 +; ZIP-NEXT: vsetvli a4, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg6e16.v v8, (a0) +; ZIP-NEXT: vle16.v v9, (a2) +; ZIP-NEXT: add a2, a3, a1 +; ZIP-NEXT: vle16.v v11, (a2) +; ZIP-NEXT: add a2, a2, a1 +; ZIP-NEXT: vle16.v v12, (a3) +; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: vle16.v v10, (a2) +; ZIP-NEXT: vle16.v v8, (a0) +; ZIP-NEXT: vle16.v v13, (a1) +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: vslideup.vi v12, v11, 2 +; ZIP-NEXT: vslideup.vi v8, v9, 2 +; ZIP-NEXT: vslideup.vi v10, v13, 2 +; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vi v8, v12, 4 +; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZIP-NEXT: vslideup.vi v8, v10, 8 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: ret + %res = call <12 x half> @llvm.vector.interleave6.v12f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f) + ret <12 x half> %res +} + +define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) nounwind { ; CHECK-LABEL: vector_interleave7_v7f16_v1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a1, a1, 2 @@ -950,19 +1418,15 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, 
<1 x half> %b, ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: vector_interleave7_v7f16_v1f16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: srli a1, a1, 2 @@ -994,19 +1458,15 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: slli a0, a0, 1 ; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 ; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 ; ZVBB-NEXT: ret ; ; ZIP-LABEL: vector_interleave7_v7f16_v1f16: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -16 -; ZIP-NEXT: .cfi_def_cfa_offset 16 ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; ZIP-NEXT: addi a0, sp, 16 ; ZIP-NEXT: csrr a1, vlenb ; ZIP-NEXT: srli a1, a1, 2 @@ -1038,10 +1498,141 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, ; ZIP-NEXT: csrr a0, vlenb ; ZIP-NEXT: slli a0, a0, 1 ; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 16 ; ZIP-NEXT: addi sp, sp, 16 -; ZIP-NEXT: .cfi_def_cfa_offset 0 ; ZIP-NEXT: ret %res = call <7 x half> @llvm.vector.interleave7.v7f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) ret <7 x half> %res } + +define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) nounwind { +; CHECK-LABEL: vector_interleave8_v8f16_v1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: add a5, a4, a1 +; CHECK-NEXT: add a6, a5, a1 +; CHECK-NEXT: vsetvli a7, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg8e16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a6) +; CHECK-NEXT: add a6, a6, a1 +; CHECK-NEXT: vle16.v v10, (a5) +; CHECK-NEXT: vle16.v v11, (a6) +; CHECK-NEXT: add a1, a6, a1 +; CHECK-NEXT: vle16.v v12, (a2) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v13, (a3) +; CHECK-NEXT: vle16.v v14, (a4) +; CHECK-NEXT: vle16.v v15, (a1) +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 1 +; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v13, 2 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v10, v15, 3 +; CHECK-NEXT: vslideup.vi v8, v14, 3 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave8_v8f16_v1f16: +; 
ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: add a5, a4, a1 +; ZVBB-NEXT: add a6, a5, a1 +; ZVBB-NEXT: vsetvli a7, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg8e16.v v8, (a0) +; ZVBB-NEXT: vle16.v v9, (a6) +; ZVBB-NEXT: add a6, a6, a1 +; ZVBB-NEXT: vle16.v v10, (a5) +; ZVBB-NEXT: vle16.v v11, (a6) +; ZVBB-NEXT: add a1, a6, a1 +; ZVBB-NEXT: vle16.v v12, (a2) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vle16.v v13, (a3) +; ZVBB-NEXT: vle16.v v14, (a4) +; ZVBB-NEXT: vle16.v v15, (a1) +; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v10, v9, 1 +; ZVBB-NEXT: vslideup.vi v8, v12, 1 +; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; ZVBB-NEXT: vslideup.vi v10, v11, 2 +; ZVBB-NEXT: vslideup.vi v8, v13, 2 +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v10, v15, 3 +; ZVBB-NEXT: vslideup.vi v8, v14, 3 +; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v10, 4 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave8_v8f16_v1f16: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: srli a1, a1, 2 +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: add a3, a2, a1 +; ZIP-NEXT: add a4, a3, a1 +; ZIP-NEXT: add a5, a4, a1 +; ZIP-NEXT: add a6, a5, a1 +; ZIP-NEXT: vsetvli a7, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg8e16.v v8, (a0) +; ZIP-NEXT: vle16.v v9, (a6) +; ZIP-NEXT: add a6, a6, a1 +; ZIP-NEXT: vle16.v v10, (a5) +; ZIP-NEXT: vle16.v v11, (a6) +; ZIP-NEXT: add a1, a6, a1 +; ZIP-NEXT: vle16.v v12, (a2) +; ZIP-NEXT: vle16.v v8, (a0) +; ZIP-NEXT: vle16.v v13, (a3) +; ZIP-NEXT: vle16.v v14, (a4) +; ZIP-NEXT: vle16.v v15, (a1) +; ZIP-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v10, v9, 1 +; ZIP-NEXT: vslideup.vi v8, v12, 1 +; ZIP-NEXT: vsetivli zero, 3, e16, mf2, tu, ma +; ZIP-NEXT: vslideup.vi v10, v11, 2 +; ZIP-NEXT: vslideup.vi v8, v13, 2 +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: vslideup.vi v10, v15, 3 +; ZIP-NEXT: vslideup.vi v8, v14, 3 +; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vi v8, v10, 4 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 1 +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: ret + %res = call <8 x half> @llvm.vector.interleave8.v8f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) + ret <8 x half> %res +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 7347000bf5c71..77723609a60c7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -786,6 +786,313 @@ define @vector_interleave_nxv6i64_nxv2i64( ret %res } +define @vector_interleave_nxv64i1_nxv16i1( %a, %b, %c, %d) nounwind { +; CHECK-LABEL: vector_interleave_nxv64i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, 
e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v11, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 +; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vmerge.vim v14, v12, 1, v0 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v18, v12, 1, v0 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v20, v12, 1, v0 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: vsseg4e8.v v14, (a0) +; CHECK-NEXT: vl2r.v v8, (a2) +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: vl2r.v v10, (a4) +; CHECK-NEXT: add a4, a2, a2 +; CHECK-NEXT: vl2r.v v12, (a3) +; CHECK-NEXT: vl2r.v v14, (a0) +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmsne.vi v9, v12, 0 +; CHECK-NEXT: vmsne.vi v0, v14, 0 +; CHECK-NEXT: vsetvli zero, a4, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a2 +; CHECK-NEXT: vslideup.vx v0, v9, a2 +; CHECK-NEXT: add a0, a1, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v8, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv64i1_nxv16i1: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmv1r.v v11, v0 +; ZVBB-NEXT: vmv1r.v v0, v8 +; ZVBB-NEXT: vmv.v.i v12, 0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0 +; ZVBB-NEXT: slli a2, a1, 1 +; ZVBB-NEXT: vmv1r.v v0, v11 +; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vmv1r.v v0, v9 +; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0 +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: vmv1r.v v0, v10 +; ZVBB-NEXT: vmerge.vim v20, v12, 1, v0 +; ZVBB-NEXT: add a2, a4, a2 +; ZVBB-NEXT: vsseg4e8.v v14, (a0) +; ZVBB-NEXT: vl2r.v v8, (a2) +; ZVBB-NEXT: srli a2, a1, 2 +; ZVBB-NEXT: srli a1, a1, 1 +; ZVBB-NEXT: vl2r.v v10, (a4) +; ZVBB-NEXT: add a4, a2, a2 +; ZVBB-NEXT: vl2r.v v12, (a3) +; ZVBB-NEXT: vl2r.v v14, (a0) +; ZVBB-NEXT: vmsne.vi v16, v8, 0 +; ZVBB-NEXT: vmsne.vi v8, v10, 0 +; ZVBB-NEXT: vmsne.vi v9, v12, 0 +; ZVBB-NEXT: vmsne.vi v0, v14, 0 +; ZVBB-NEXT: vsetvli zero, a4, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v16, a2 +; ZVBB-NEXT: vslideup.vx v0, v9, a2 +; ZVBB-NEXT: add a0, a1, a1 +; ZVBB-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v8, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv64i1( %a, %b, %c, %d) + ret %res +} + +define @vector_interleave_nxv64i8_nxv16i8( %a, %b, %c, %d) nounwind { +; +; CHECK-LABEL: vector_interleave_nxv64i8_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma +; CHECK-NEXT: vsseg4e8.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vl2r.v v12, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl2r.v v14, (a1) +; CHECK-NEXT: vl2r.v v8, (a0) +; CHECK-NEXT: vl2r.v 
v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv64i8_nxv16i8: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e8, m2, ta, ma +; ZVBB-NEXT: vsseg4e8.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vl2r.v v12, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl2r.v v14, (a1) +; ZVBB-NEXT: vl2r.v v8, (a0) +; ZVBB-NEXT: vl2r.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv64i8( %a, %b, %c, %d) + ret %res +} + +define @vector_interleave_nxv32i8_nxv8i8( %a, %b, %c, %d) nounwind { +; CHECK-LABEL: vector_interleave_nxv32i8_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg4e8.v v8, (a0) +; CHECK-NEXT: vl1r.v v10, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1r.v v11, (a1) +; CHECK-NEXT: vl1r.v v8, (a0) +; CHECK-NEXT: vl1r.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv32i8_nxv8i8: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg4e8.v v8, (a0) +; ZVBB-NEXT: vl1r.v v10, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1r.v v11, (a1) +; ZVBB-NEXT: vl1r.v v8, (a0) +; ZVBB-NEXT: vl1r.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv32i8( %a, %b, %c, %d) + ret %res +} + +define @vector_interleave_nxv16i32_nxv4i32( %a, %b, %c, %d) nounwind { +; +; CHECK-LABEL: vector_interleave_nxv16i32_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vl2re32.v v12, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl2re32.v v14, (a1) +; CHECK-NEXT: vl2re32.v v8, (a0) +; CHECK-NEXT: vl2re32.v v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16i32_nxv4i32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: add 
a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; ZVBB-NEXT: vsseg4e32.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vl2re32.v v12, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl2re32.v v14, (a1) +; ZVBB-NEXT: vl2re32.v v8, (a0) +; ZVBB-NEXT: vl2re32.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv4i32( %a, %b, %c, %d) + ret %res +} + +define @vector_interleave_nxv8i64_nxv2i64( %a, %b, %c, %d) nounwind { +; +; CHECK-LABEL: vector_interleave_nxv8i64_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vl2re64.v v12, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl2re64.v v14, (a1) +; CHECK-NEXT: vl2re64.v v8, (a0) +; CHECK-NEXT: vl2re64.v v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv8i64_nxv2i64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e64, m2, ta, ma +; ZVBB-NEXT: vsseg4e64.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vl2re64.v v12, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl2re64.v v14, (a1) +; ZVBB-NEXT: vl2re64.v v8, (a0) +; ZVBB-NEXT: vl2re64.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv8i64( %a, %b, %c, %d) + ret %res +} + define @vector_interleave_nxv80i1_nxv16i1( %a, %b, %c, %d, %e) nounwind { ; CHECK-LABEL: vector_interleave_nxv80i1_nxv16i1: ; CHECK: # %bb.0: @@ -2009,1449 +2316,1552 @@ define @vector_interleave_nxv10i64_nxv2i64( %res } -define @vector_interleave_nxv112i1_nxv16i1( %a, %b, %c, %d, %e, %f, %g) nounwind { -; CHECK-LABEL: vector_interleave_nxv112i1_nxv16i1: +define @vector_interleave_nxv96i1_nxv16i1( %a, %b, %c, %d, %e, %f) nounwind { +; CHECK-LABEL: vector_interleave_nxv96i1_nxv16i1: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 14 +; CHECK-NEXT: li a1, 12 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v14, 0 -; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vmv.v.i v20, 0 +; CHECK-NEXT: vmerge.vim v14, v20, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v22, v20, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmv1r.v v16, v23 +; CHECK-NEXT: vmerge.vim v8, v20, 1, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: vmerge.vim v16, v14, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v22, v14, 1, v0 -; CHECK-NEXT: add a3, a4, a2 -; CHECK-NEXT: srli a1, a2, 2 -; 
CHECK-NEXT: add a5, a0, a2 -; CHECK-NEXT: vmv4r.v v24, v16 -; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vmerge.vim v18, v14, 1, v0 -; CHECK-NEXT: add a6, a3, a2 -; CHECK-NEXT: vmv1r.v v25, v22 +; CHECK-NEXT: vmv1r.v v17, v9 ; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vmerge.vim v8, v14, 1, v0 -; CHECK-NEXT: vmv1r.v v26, v18 +; CHECK-NEXT: vmerge.vim v24, v20, 1, v0 +; CHECK-NEXT: addi a5, sp, 16 +; CHECK-NEXT: vmv1r.v v18, v25 ; CHECK-NEXT: vmv1r.v v0, v11 -; CHECK-NEXT: vmerge.vim v20, v14, 1, v0 -; CHECK-NEXT: vmv1r.v v27, v8 +; CHECK-NEXT: vmerge.vim v26, v20, 1, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vmv1r.v v19, v27 ; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vmerge.vim v10, v14, 1, v0 -; CHECK-NEXT: vmv1r.v v28, v20 -; CHECK-NEXT: vmv1r.v v18, v23 +; CHECK-NEXT: vmerge.vim v10, v20, 1, v0 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vmv1r.v v20, v11 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg6e8.v v15, (a0) +; CHECK-NEXT: vmv1r.v v15, v22 +; CHECK-NEXT: add a4, a5, a2 +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: srli a1, a2, 2 +; CHECK-NEXT: vmv1r.v v17, v24 +; CHECK-NEXT: add a6, a4, a2 +; CHECK-NEXT: vmv1r.v v18, v26 +; CHECK-NEXT: add a7, a3, a2 +; CHECK-NEXT: vmv1r.v v19, v10 +; CHECK-NEXT: vsseg6e8.v v14, (a5) +; CHECK-NEXT: vl1r.v v8, (a0) +; CHECK-NEXT: add a0, a6, a2 +; CHECK-NEXT: vl1r.v v10, (a6) +; CHECK-NEXT: add a6, a7, a2 +; CHECK-NEXT: vl1r.v v12, (a5) +; CHECK-NEXT: add a5, a0, a2 +; CHECK-NEXT: vl1r.v v14, (a7) ; CHECK-NEXT: add a7, a6, a2 -; CHECK-NEXT: vmv1r.v v29, v10 -; CHECK-NEXT: vmv1r.v v20, v9 -; CHECK-NEXT: vmv1r.v v0, v13 -; CHECK-NEXT: vmerge.vim v30, v14, 1, v0 -; CHECK-NEXT: vmv1r.v v22, v11 -; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma -; CHECK-NEXT: vsseg7e8.v v24, (a4) -; CHECK-NEXT: vmv1r.v v23, v31 -; CHECK-NEXT: vsseg7e8.v v17, (a0) -; CHECK-NEXT: vl1r.v v8, (a6) -; CHECK-NEXT: add a6, a7, a2 -; CHECK-NEXT: vl1r.v v10, (a4) -; CHECK-NEXT: add a4, a6, a2 -; CHECK-NEXT: vl1r.v v12, (a6) -; CHECK-NEXT: add a6, a4, a2 -; CHECK-NEXT: vl1r.v v14, (a6) -; CHECK-NEXT: add a6, a5, a2 ; CHECK-NEXT: vl1r.v v16, (a5) -; CHECK-NEXT: add a5, a6, a2 -; CHECK-NEXT: vl1r.v v18, (a5) ; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: vl1r.v v9, (a7) -; CHECK-NEXT: add a7, a5, a2 -; CHECK-NEXT: vl1r.v v20, (a7) +; CHECK-NEXT: vl1r.v v18, (a7) ; CHECK-NEXT: add a7, a7, a2 ; CHECK-NEXT: srli a2, a2, 1 -; CHECK-NEXT: vl1r.v v11, (a3) +; CHECK-NEXT: vl1r.v v9, (a3) ; CHECK-NEXT: add a3, a1, a1 +; CHECK-NEXT: vl1r.v v17, (a5) +; CHECK-NEXT: add a5, a2, a2 +; CHECK-NEXT: vl1r.v v11, (a0) ; CHECK-NEXT: vl1r.v v13, (a4) -; CHECK-NEXT: add a4, a2, a2 -; CHECK-NEXT: vl1r.v v15, (a0) -; CHECK-NEXT: vl1r.v v19, (a5) -; CHECK-NEXT: vl1r.v v17, (a6) -; CHECK-NEXT: vl1r.v v21, (a7) +; CHECK-NEXT: vl1r.v v19, (a7) +; CHECK-NEXT: vl1r.v v15, (a6) ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmsne.vi v22, v8, 0 -; CHECK-NEXT: vmsne.vi v0, v10, 0 -; CHECK-NEXT: vmsne.vi v9, v12, 0 -; CHECK-NEXT: vmsne.vi v10, v14, 0 -; CHECK-NEXT: vmsne.vi v11, v18, 0 -; CHECK-NEXT: vmsne.vi v8, v16, 0 -; CHECK-NEXT: vmsne.vi v12, v20, 0 +; CHECK-NEXT: vmsne.vi v20, v8, 0 +; CHECK-NEXT: vmsne.vi v9, v16, 0 +; CHECK-NEXT: vmsne.vi v16, v10, 0 +; CHECK-NEXT: vmsne.vi v0, v12, 0 +; CHECK-NEXT: vmsne.vi v10, v18, 0 +; CHECK-NEXT: vmsne.vi v8, v14, 0 ; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vx v0, v22, a1 -; CHECK-NEXT: vslideup.vx v9, v10, a1 -; CHECK-NEXT: vslideup.vx v8, v11, a1 -; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma 
+; CHECK-NEXT: vslideup.vx v9, v20, a1 +; CHECK-NEXT: vslideup.vx v0, v16, a1 +; CHECK-NEXT: vsetvli zero, a5, e8, m1, ta, ma ; CHECK-NEXT: vslideup.vx v0, v9, a2 -; CHECK-NEXT: vslideup.vx v8, v12, a2 +; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a1 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 14 +; CHECK-NEXT: li a1, 12 ; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv112i1_nxv16i1: +; ZVBB-LABEL: vector_interleave_nxv96i1_nxv16i1: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 14 +; ZVBB-NEXT: li a1, 12 ; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; ZVBB-NEXT: vmv.v.i v14, 0 -; ZVBB-NEXT: addi a4, sp, 16 +; ZVBB-NEXT: vmv.v.i v20, 0 +; ZVBB-NEXT: vmerge.vim v14, v20, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v8 +; ZVBB-NEXT: vmerge.vim v22, v20, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v9 +; ZVBB-NEXT: vmv1r.v v16, v23 +; ZVBB-NEXT: vmerge.vim v8, v20, 1, v0 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 3 -; ZVBB-NEXT: sub a0, a1, a0 +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: add a0, sp, a0 ; ZVBB-NEXT: addi a0, a0, 16 -; ZVBB-NEXT: csrr a2, vlenb -; ZVBB-NEXT: vmerge.vim v16, v14, 1, v0 -; ZVBB-NEXT: vmv1r.v v0, v8 -; ZVBB-NEXT: vmerge.vim v22, v14, 1, v0 -; ZVBB-NEXT: add a3, a4, a2 -; ZVBB-NEXT: srli a1, a2, 2 -; ZVBB-NEXT: add a5, a0, a2 -; ZVBB-NEXT: vmv4r.v v24, v16 -; ZVBB-NEXT: vmv1r.v v0, v9 -; ZVBB-NEXT: vmerge.vim v18, v14, 1, v0 -; ZVBB-NEXT: add a6, a3, a2 -; ZVBB-NEXT: vmv1r.v v25, v22 +; ZVBB-NEXT: vmv1r.v v17, v9 ; ZVBB-NEXT: vmv1r.v v0, v10 -; ZVBB-NEXT: vmerge.vim v8, v14, 1, v0 -; ZVBB-NEXT: vmv1r.v v26, v18 +; ZVBB-NEXT: vmerge.vim v24, v20, 1, v0 +; ZVBB-NEXT: addi a5, sp, 16 +; ZVBB-NEXT: vmv1r.v v18, v25 ; ZVBB-NEXT: vmv1r.v v0, v11 -; ZVBB-NEXT: vmerge.vim v20, v14, 1, v0 -; ZVBB-NEXT: vmv1r.v v27, v8 +; ZVBB-NEXT: vmerge.vim v26, v20, 1, v0 +; ZVBB-NEXT: csrr a2, vlenb +; ZVBB-NEXT: vmv1r.v v19, v27 ; ZVBB-NEXT: vmv1r.v v0, v12 -; ZVBB-NEXT: vmerge.vim v10, v14, 1, v0 -; ZVBB-NEXT: vmv1r.v v28, v20 -; ZVBB-NEXT: vmv1r.v v18, v23 -; ZVBB-NEXT: add a7, a6, a2 -; ZVBB-NEXT: vmv1r.v v29, v10 -; ZVBB-NEXT: vmv1r.v v20, v9 -; ZVBB-NEXT: vmv1r.v v0, v13 -; ZVBB-NEXT: vmerge.vim v30, v14, 1, v0 -; ZVBB-NEXT: vmv1r.v v22, v11 -; ZVBB-NEXT: vsetvli t0, zero, e8, m1, ta, ma -; ZVBB-NEXT: vsseg7e8.v v24, (a4) -; ZVBB-NEXT: vmv1r.v v23, v31 -; ZVBB-NEXT: vsseg7e8.v v17, (a0) -; ZVBB-NEXT: vl1r.v v8, (a6) -; ZVBB-NEXT: add a6, a7, a2 -; ZVBB-NEXT: vl1r.v v10, (a4) -; ZVBB-NEXT: add a4, a6, a2 -; ZVBB-NEXT: vl1r.v v12, (a6) +; ZVBB-NEXT: vmerge.vim v10, v20, 1, v0 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vmv1r.v v20, v11 +; ZVBB-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg6e8.v v15, (a0) +; ZVBB-NEXT: vmv1r.v v15, v22 +; ZVBB-NEXT: add a4, a5, a2 +; ZVBB-NEXT: vmv1r.v v16, v8 +; ZVBB-NEXT: srli a1, a2, 2 +; ZVBB-NEXT: vmv1r.v v17, v24 ; ZVBB-NEXT: add a6, a4, a2 -; ZVBB-NEXT: vl1r.v v14, (a6) -; ZVBB-NEXT: add a6, a5, a2 +; ZVBB-NEXT: vmv1r.v v18, v26 +; ZVBB-NEXT: add a7, a3, a2 +; ZVBB-NEXT: vmv1r.v v19, v10 +; ZVBB-NEXT: vsseg6e8.v v14, (a5) +; ZVBB-NEXT: vl1r.v v8, (a0) +; ZVBB-NEXT: add a0, a6, a2 +; ZVBB-NEXT: vl1r.v v10, (a6) +; ZVBB-NEXT: add a6, a7, a2 +; ZVBB-NEXT: vl1r.v v12, (a5) +; ZVBB-NEXT: add a5, a0, a2 +; ZVBB-NEXT: vl1r.v v14, (a7) +; ZVBB-NEXT: add a7, a6, a2 ; ZVBB-NEXT: vl1r.v v16, 
(a5) -; ZVBB-NEXT: add a5, a6, a2 -; ZVBB-NEXT: vl1r.v v18, (a5) ; ZVBB-NEXT: add a5, a5, a2 -; ZVBB-NEXT: vl1r.v v9, (a7) -; ZVBB-NEXT: add a7, a5, a2 -; ZVBB-NEXT: vl1r.v v20, (a7) +; ZVBB-NEXT: vl1r.v v18, (a7) ; ZVBB-NEXT: add a7, a7, a2 ; ZVBB-NEXT: srli a2, a2, 1 -; ZVBB-NEXT: vl1r.v v11, (a3) +; ZVBB-NEXT: vl1r.v v9, (a3) ; ZVBB-NEXT: add a3, a1, a1 +; ZVBB-NEXT: vl1r.v v17, (a5) +; ZVBB-NEXT: add a5, a2, a2 +; ZVBB-NEXT: vl1r.v v11, (a0) ; ZVBB-NEXT: vl1r.v v13, (a4) -; ZVBB-NEXT: add a4, a2, a2 -; ZVBB-NEXT: vl1r.v v15, (a0) -; ZVBB-NEXT: vl1r.v v19, (a5) -; ZVBB-NEXT: vl1r.v v17, (a6) -; ZVBB-NEXT: vl1r.v v21, (a7) +; ZVBB-NEXT: vl1r.v v19, (a7) +; ZVBB-NEXT: vl1r.v v15, (a6) ; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; ZVBB-NEXT: vmsne.vi v22, v8, 0 -; ZVBB-NEXT: vmsne.vi v0, v10, 0 -; ZVBB-NEXT: vmsne.vi v9, v12, 0 -; ZVBB-NEXT: vmsne.vi v10, v14, 0 -; ZVBB-NEXT: vmsne.vi v11, v18, 0 -; ZVBB-NEXT: vmsne.vi v8, v16, 0 -; ZVBB-NEXT: vmsne.vi v12, v20, 0 +; ZVBB-NEXT: vmsne.vi v20, v8, 0 +; ZVBB-NEXT: vmsne.vi v9, v16, 0 +; ZVBB-NEXT: vmsne.vi v16, v10, 0 +; ZVBB-NEXT: vmsne.vi v0, v12, 0 +; ZVBB-NEXT: vmsne.vi v10, v18, 0 +; ZVBB-NEXT: vmsne.vi v8, v14, 0 ; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma -; ZVBB-NEXT: vslideup.vx v0, v22, a1 -; ZVBB-NEXT: vslideup.vx v9, v10, a1 -; ZVBB-NEXT: vslideup.vx v8, v11, a1 -; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v20, a1 +; ZVBB-NEXT: vslideup.vx v0, v16, a1 +; ZVBB-NEXT: vsetvli zero, a5, e8, m1, ta, ma ; ZVBB-NEXT: vslideup.vx v0, v9, a2 -; ZVBB-NEXT: vslideup.vx v8, v12, a2 +; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v10, a1 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 14 +; ZVBB-NEXT: li a1, 12 ; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave7.nxv112i1( %a, %b, %c, %d, %e, %f, %g) - ret %res + %res = call @llvm.vector.interleave6.nxv96i1( %a, %b, %c, %d, %e, %f) + ret %res } - -define @vector_interleave_nxv112i8_nxv16i8( %a, %b, %c, %d, %e, %f, %g) nounwind { +define @vector_interleave_nxv96i8_nxv16i8( %a, %b, %c, %d, %e, %f) nounwind { ; -; RV32-LABEL: vector_interleave_nxv112i8_nxv16i8: +; RV32-LABEL: vector_interleave_nxv96i8_nxv16i8: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill ; RV32-NEXT: addi s0, sp, 80 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV32-NEXT: vmv2r.v v26, v20 -; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: vmv2r.v v24, v16 +; RV32-NEXT: vmv2r.v v20, v14 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: vmv2r.v v24, v10 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: li a0, 6 +; RV32-NEXT: mul a1, a1, a0 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 64 -; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: vmv1r.v v10, v25 +; RV32-NEXT: vmv1r.v v11, v23 +; RV32-NEXT: vmv1r.v v12, v21 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv1r.v v13, v17 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vmv2r.v v20, v8 -; RV32-NEXT: vmv1r.v v1, v20 -; RV32-NEXT: vmv1r.v v3, v22 -; RV32-NEXT: vmv1r.v v5, v24 -; RV32-NEXT: vmv1r.v v7, v26 +; RV32-NEXT: vmv1r.v v14, v19 +; RV32-NEXT: vsseg6e8.v v9, (a1) +; RV32-NEXT: vmv1r.v v9, v24 +; RV32-NEXT: add a5, a1, a2 
+; RV32-NEXT: vmv1r.v v10, v22 ; RV32-NEXT: add a3, a0, a2 -; RV32-NEXT: vmv1r.v v2, v10 -; RV32-NEXT: add a4, a1, a2 -; RV32-NEXT: slli a5, a2, 2 -; RV32-NEXT: vmv1r.v v4, v14 -; RV32-NEXT: slli a6, a2, 4 -; RV32-NEXT: add a7, a4, a2 -; RV32-NEXT: vmv1r.v v6, v18 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: vmv1r.v v22, v11 -; RV32-NEXT: add a6, a7, a2 -; RV32-NEXT: vmv1r.v v24, v15 -; RV32-NEXT: vsseg7e8.v v1, (a0) -; RV32-NEXT: vmv1r.v v26, v19 -; RV32-NEXT: vsseg7e8.v v21, (a1) -; RV32-NEXT: vl1r.v v18, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1r.v v19, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1r.v v20, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1r.v v21, (a6) -; RV32-NEXT: add a6, a3, a2 -; RV32-NEXT: vl1r.v v10, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1r.v v11, (a6) -; RV32-NEXT: vl1r.v v8, (a0) -; RV32-NEXT: vl1r.v v16, (a4) -; RV32-NEXT: vl1r.v v9, (a3) -; RV32-NEXT: vl1r.v v17, (a7) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 64 -; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vmv1r.v v11, v20 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vmv1r.v v12, v16 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v13, v18 +; RV32-NEXT: vsseg6e8.v v8, (a0) +; RV32-NEXT: vl1r.v v14, (a1) +; RV32-NEXT: add a1, a6, a2 +; RV32-NEXT: vl1r.v v15, (a5) +; RV32-NEXT: add a5, a1, a2 +; RV32-NEXT: vl1r.v v18, (a5) +; RV32-NEXT: add a5, a5, a2 +; RV32-NEXT: vl1r.v v19, (a5) +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vl1r.v v16, (a6) +; RV32-NEXT: add a6, a5, a2 ; RV32-NEXT: vl1r.v v12, (a6) ; RV32-NEXT: add a6, a6, a2 ; RV32-NEXT: vl1r.v v13, (a6) -; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 12 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 64 +; RV32-NEXT: vl1r.v v17, (a1) +; RV32-NEXT: vl1r.v v10, (a4) +; RV32-NEXT: vl1r.v v11, (a5) +; RV32-NEXT: vl1r.v v8, (a0) +; RV32-NEXT: vl1r.v v9, (a3) ; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, a0, a2 -; RV32-NEXT: vl1r.v v14, (a6) -; RV32-NEXT: vl1r.v v15, (a1) -; RV32-NEXT: add a5, a0, a5 -; RV32-NEXT: vs2r.v v20, (a5) +; RV32-NEXT: add a2, a6, a2 ; RV32-NEXT: vs4r.v v16, (a2) -; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vs8r.v v8, (a6) ; RV32-NEXT: vl8r.v v16, (a2) -; RV32-NEXT: vl8r.v v8, (a0) +; RV32-NEXT: vl8r.v v8, (a6) ; RV32-NEXT: addi sp, s0, -80 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 80 ; RV32-NEXT: ret ; -; RV64-LABEL: vector_interleave_nxv112i8_nxv16i8: +; RV64-LABEL: vector_interleave_nxv96i8_nxv16i8: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -80 ; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 80 ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: sub sp, sp, a0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV64-NEXT: vmv2r.v v26, v20 -; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: vmv2r.v v20, v14 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: vmv2r.v v24, v10 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: li a0, 6 +; RV64-NEXT: mul a1, a1, a0 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 64 -; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: vmv1r.v v10, v25 +; RV64-NEXT: vmv1r.v v11, v23 +; RV64-NEXT: vmv1r.v 
v12, v21 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv1r.v v13, v17 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: vmv2r.v v20, v8 -; RV64-NEXT: vmv1r.v v1, v20 -; RV64-NEXT: vmv1r.v v3, v22 -; RV64-NEXT: vmv1r.v v5, v24 -; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: vmv1r.v v14, v19 +; RV64-NEXT: vsseg6e8.v v9, (a1) +; RV64-NEXT: vmv1r.v v9, v24 +; RV64-NEXT: add a5, a1, a2 +; RV64-NEXT: vmv1r.v v10, v22 ; RV64-NEXT: add a3, a0, a2 -; RV64-NEXT: vmv1r.v v2, v10 -; RV64-NEXT: add a4, a1, a2 -; RV64-NEXT: slli a5, a2, 2 -; RV64-NEXT: vmv1r.v v4, v14 -; RV64-NEXT: slli a6, a2, 4 -; RV64-NEXT: add a7, a4, a2 -; RV64-NEXT: vmv1r.v v6, v18 -; RV64-NEXT: sub a5, a6, a5 -; RV64-NEXT: vmv1r.v v22, v11 -; RV64-NEXT: add a6, a7, a2 -; RV64-NEXT: vmv1r.v v24, v15 -; RV64-NEXT: vsseg7e8.v v1, (a0) -; RV64-NEXT: vmv1r.v v26, v19 -; RV64-NEXT: vsseg7e8.v v21, (a1) -; RV64-NEXT: vl1r.v v18, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1r.v v19, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1r.v v20, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1r.v v21, (a6) -; RV64-NEXT: add a6, a3, a2 -; RV64-NEXT: vl1r.v v10, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1r.v v11, (a6) -; RV64-NEXT: vl1r.v v8, (a0) -; RV64-NEXT: vl1r.v v16, (a4) -; RV64-NEXT: vl1r.v v9, (a3) -; RV64-NEXT: vl1r.v v17, (a7) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a3, 14 -; RV64-NEXT: mul a0, a0, a3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 64 -; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vmv1r.v v11, v20 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vmv1r.v v12, v16 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v13, v18 +; RV64-NEXT: vsseg6e8.v v8, (a0) +; RV64-NEXT: vl1r.v v14, (a1) +; RV64-NEXT: add a1, a6, a2 +; RV64-NEXT: vl1r.v v15, (a5) +; RV64-NEXT: add a5, a1, a2 +; RV64-NEXT: vl1r.v v18, (a5) +; RV64-NEXT: add a5, a5, a2 +; RV64-NEXT: vl1r.v v19, (a5) +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vl1r.v v16, (a6) +; RV64-NEXT: add a6, a5, a2 ; RV64-NEXT: vl1r.v v12, (a6) ; RV64-NEXT: add a6, a6, a2 ; RV64-NEXT: vl1r.v v13, (a6) -; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: csrr a6, vlenb +; RV64-NEXT: li a7, 12 +; RV64-NEXT: mul a6, a6, a7 +; RV64-NEXT: add a6, sp, a6 +; RV64-NEXT: addi a6, a6, 64 +; RV64-NEXT: vl1r.v v17, (a1) +; RV64-NEXT: vl1r.v v10, (a4) +; RV64-NEXT: vl1r.v v11, (a5) +; RV64-NEXT: vl1r.v v8, (a0) +; RV64-NEXT: vl1r.v v9, (a3) ; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, a0, a2 -; RV64-NEXT: vl1r.v v14, (a6) -; RV64-NEXT: vl1r.v v15, (a1) -; RV64-NEXT: add a5, a0, a5 -; RV64-NEXT: vs2r.v v20, (a5) +; RV64-NEXT: add a2, a6, a2 ; RV64-NEXT: vs4r.v v16, (a2) -; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vs8r.v v8, (a6) ; RV64-NEXT: vl8r.v v16, (a2) -; RV64-NEXT: vl8r.v v8, (a0) +; RV64-NEXT: vl8r.v v8, (a6) ; RV64-NEXT: addi sp, s0, -80 ; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret ; -; ZVBB-RV32-LABEL: vector_interleave_nxv112i8_nxv16i8: +; ZVBB-RV32-LABEL: vector_interleave_nxv96i8_nxv16i8: ; ZVBB-RV32: # %bb.0: ; ZVBB-RV32-NEXT: addi sp, sp, -80 ; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill ; ZVBB-RV32-NEXT: addi s0, sp, 80 ; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 ; ZVBB-RV32-NEXT: sub sp, sp, a0 ; ZVBB-RV32-NEXT: andi sp, sp, -64 ; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; ZVBB-RV32-NEXT: vmv2r.v v26, v20 -; 
ZVBB-RV32-NEXT: addi a0, sp, 64 -; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: vmv2r.v v20, v14 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: vmv2r.v v24, v10 ; ZVBB-RV32-NEXT: csrr a1, vlenb -; ZVBB-RV32-NEXT: slli a2, a1, 3 -; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: li a0, 6 +; ZVBB-RV32-NEXT: mul a1, a1, a0 ; ZVBB-RV32-NEXT: add a1, sp, a1 ; ZVBB-RV32-NEXT: addi a1, a1, 64 -; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: vmv1r.v v10, v25 +; ZVBB-RV32-NEXT: vmv1r.v v11, v23 +; ZVBB-RV32-NEXT: vmv1r.v v12, v21 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv1r.v v13, v17 ; ZVBB-RV32-NEXT: csrr a2, vlenb -; ZVBB-RV32-NEXT: vmv2r.v v20, v8 -; ZVBB-RV32-NEXT: vmv1r.v v1, v20 -; ZVBB-RV32-NEXT: vmv1r.v v3, v22 -; ZVBB-RV32-NEXT: vmv1r.v v5, v24 -; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: vmv1r.v v14, v19 +; ZVBB-RV32-NEXT: vsseg6e8.v v9, (a1) +; ZVBB-RV32-NEXT: vmv1r.v v9, v24 +; ZVBB-RV32-NEXT: add a5, a1, a2 +; ZVBB-RV32-NEXT: vmv1r.v v10, v22 ; ZVBB-RV32-NEXT: add a3, a0, a2 -; ZVBB-RV32-NEXT: vmv1r.v v2, v10 -; ZVBB-RV32-NEXT: add a4, a1, a2 -; ZVBB-RV32-NEXT: slli a5, a2, 2 -; ZVBB-RV32-NEXT: vmv1r.v v4, v14 -; ZVBB-RV32-NEXT: slli a6, a2, 4 -; ZVBB-RV32-NEXT: add a7, a4, a2 -; ZVBB-RV32-NEXT: vmv1r.v v6, v18 -; ZVBB-RV32-NEXT: sub a5, a6, a5 -; ZVBB-RV32-NEXT: vmv1r.v v22, v11 -; ZVBB-RV32-NEXT: add a6, a7, a2 -; ZVBB-RV32-NEXT: vmv1r.v v24, v15 -; ZVBB-RV32-NEXT: vsseg7e8.v v1, (a0) -; ZVBB-RV32-NEXT: vmv1r.v v26, v19 -; ZVBB-RV32-NEXT: vsseg7e8.v v21, (a1) -; ZVBB-RV32-NEXT: vl1r.v v18, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v19, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v20, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v21, (a6) -; ZVBB-RV32-NEXT: add a6, a3, a2 -; ZVBB-RV32-NEXT: vl1r.v v10, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1r.v v11, (a6) -; ZVBB-RV32-NEXT: vl1r.v v8, (a0) -; ZVBB-RV32-NEXT: vl1r.v v16, (a4) -; ZVBB-RV32-NEXT: vl1r.v v9, (a3) -; ZVBB-RV32-NEXT: vl1r.v v17, (a7) -; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: li a3, 14 -; ZVBB-RV32-NEXT: mul a0, a0, a3 -; ZVBB-RV32-NEXT: add a0, sp, a0 -; ZVBB-RV32-NEXT: addi a0, a0, 64 -; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vmv1r.v v11, v20 +; ZVBB-RV32-NEXT: add a4, a3, a2 +; ZVBB-RV32-NEXT: vmv1r.v v12, v16 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v13, v18 +; ZVBB-RV32-NEXT: vsseg6e8.v v8, (a0) +; ZVBB-RV32-NEXT: vl1r.v v14, (a1) +; ZVBB-RV32-NEXT: add a1, a6, a2 +; ZVBB-RV32-NEXT: vl1r.v v15, (a5) +; ZVBB-RV32-NEXT: add a5, a1, a2 +; ZVBB-RV32-NEXT: vl1r.v v18, (a5) +; ZVBB-RV32-NEXT: add a5, a5, a2 +; ZVBB-RV32-NEXT: vl1r.v v19, (a5) +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vl1r.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a5, a2 ; ZVBB-RV32-NEXT: vl1r.v v12, (a6) ; ZVBB-RV32-NEXT: add a6, a6, a2 ; ZVBB-RV32-NEXT: vl1r.v v13, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: csrr a6, vlenb +; ZVBB-RV32-NEXT: li a7, 12 +; ZVBB-RV32-NEXT: mul a6, a6, a7 +; ZVBB-RV32-NEXT: add a6, sp, a6 +; ZVBB-RV32-NEXT: addi a6, a6, 64 +; ZVBB-RV32-NEXT: vl1r.v v17, (a1) +; ZVBB-RV32-NEXT: vl1r.v v10, (a4) +; ZVBB-RV32-NEXT: vl1r.v v11, (a5) +; ZVBB-RV32-NEXT: vl1r.v v8, (a0) +; ZVBB-RV32-NEXT: vl1r.v v9, (a3) ; ZVBB-RV32-NEXT: slli a2, a2, 3 -; ZVBB-RV32-NEXT: add a2, a0, a2 -; ZVBB-RV32-NEXT: vl1r.v v14, (a6) -; ZVBB-RV32-NEXT: vl1r.v v15, (a1) -; ZVBB-RV32-NEXT: add a5, a0, a5 -; ZVBB-RV32-NEXT: vs2r.v v20, (a5) +; ZVBB-RV32-NEXT: add a2, a6, a2 
; ZVBB-RV32-NEXT: vs4r.v v16, (a2) -; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vs8r.v v8, (a6) ; ZVBB-RV32-NEXT: vl8r.v v16, (a2) -; ZVBB-RV32-NEXT: vl8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8r.v v8, (a6) ; ZVBB-RV32-NEXT: addi sp, s0, -80 ; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: addi sp, sp, 80 ; ZVBB-RV32-NEXT: ret ; -; ZVBB-RV64-LABEL: vector_interleave_nxv112i8_nxv16i8: +; ZVBB-RV64-LABEL: vector_interleave_nxv96i8_nxv16i8: ; ZVBB-RV64: # %bb.0: ; ZVBB-RV64-NEXT: addi sp, sp, -80 ; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; ZVBB-RV64-NEXT: addi s0, sp, 80 ; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 ; ZVBB-RV64-NEXT: sub sp, sp, a0 ; ZVBB-RV64-NEXT: andi sp, sp, -64 ; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; ZVBB-RV64-NEXT: vmv2r.v v26, v20 -; ZVBB-RV64-NEXT: addi a0, sp, 64 -; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: vmv2r.v v20, v14 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: vmv2r.v v24, v10 ; ZVBB-RV64-NEXT: csrr a1, vlenb -; ZVBB-RV64-NEXT: slli a2, a1, 3 -; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: li a0, 6 +; ZVBB-RV64-NEXT: mul a1, a1, a0 ; ZVBB-RV64-NEXT: add a1, sp, a1 ; ZVBB-RV64-NEXT: addi a1, a1, 64 -; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: vmv1r.v v10, v25 +; ZVBB-RV64-NEXT: vmv1r.v v11, v23 +; ZVBB-RV64-NEXT: vmv1r.v v12, v21 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv1r.v v13, v17 ; ZVBB-RV64-NEXT: csrr a2, vlenb -; ZVBB-RV64-NEXT: vmv2r.v v20, v8 -; ZVBB-RV64-NEXT: vmv1r.v v1, v20 -; ZVBB-RV64-NEXT: vmv1r.v v3, v22 -; ZVBB-RV64-NEXT: vmv1r.v v5, v24 -; ZVBB-RV64-NEXT: vmv1r.v v7, v26 +; ZVBB-RV64-NEXT: vmv1r.v v14, v19 +; ZVBB-RV64-NEXT: vsseg6e8.v v9, (a1) +; ZVBB-RV64-NEXT: vmv1r.v v9, v24 +; ZVBB-RV64-NEXT: add a5, a1, a2 +; ZVBB-RV64-NEXT: vmv1r.v v10, v22 ; ZVBB-RV64-NEXT: add a3, a0, a2 -; ZVBB-RV64-NEXT: vmv1r.v v2, v10 -; ZVBB-RV64-NEXT: add a4, a1, a2 -; ZVBB-RV64-NEXT: slli a5, a2, 2 -; ZVBB-RV64-NEXT: vmv1r.v v4, v14 -; ZVBB-RV64-NEXT: slli a6, a2, 4 -; ZVBB-RV64-NEXT: add a7, a4, a2 -; ZVBB-RV64-NEXT: vmv1r.v v6, v18 -; ZVBB-RV64-NEXT: sub a5, a6, a5 -; ZVBB-RV64-NEXT: vmv1r.v v22, v11 -; ZVBB-RV64-NEXT: add a6, a7, a2 -; ZVBB-RV64-NEXT: vmv1r.v v24, v15 -; ZVBB-RV64-NEXT: vsseg7e8.v v1, (a0) -; ZVBB-RV64-NEXT: vmv1r.v v26, v19 -; ZVBB-RV64-NEXT: vsseg7e8.v v21, (a1) -; ZVBB-RV64-NEXT: vl1r.v v18, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v19, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v20, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v21, (a6) -; ZVBB-RV64-NEXT: add a6, a3, a2 -; ZVBB-RV64-NEXT: vl1r.v v10, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1r.v v11, (a6) -; ZVBB-RV64-NEXT: vl1r.v v8, (a0) -; ZVBB-RV64-NEXT: vl1r.v v16, (a4) -; ZVBB-RV64-NEXT: vl1r.v v9, (a3) -; ZVBB-RV64-NEXT: vl1r.v v17, (a7) -; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: li a3, 14 -; ZVBB-RV64-NEXT: mul a0, a0, a3 -; ZVBB-RV64-NEXT: add a0, sp, a0 -; ZVBB-RV64-NEXT: addi a0, a0, 64 -; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vmv1r.v v11, v20 +; ZVBB-RV64-NEXT: add a4, a3, a2 +; ZVBB-RV64-NEXT: vmv1r.v v12, v16 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v13, v18 +; ZVBB-RV64-NEXT: vsseg6e8.v v8, (a0) +; ZVBB-RV64-NEXT: vl1r.v v14, (a1) +; ZVBB-RV64-NEXT: add a1, 
a6, a2 +; ZVBB-RV64-NEXT: vl1r.v v15, (a5) +; ZVBB-RV64-NEXT: add a5, a1, a2 +; ZVBB-RV64-NEXT: vl1r.v v18, (a5) +; ZVBB-RV64-NEXT: add a5, a5, a2 +; ZVBB-RV64-NEXT: vl1r.v v19, (a5) +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vl1r.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a5, a2 ; ZVBB-RV64-NEXT: vl1r.v v12, (a6) ; ZVBB-RV64-NEXT: add a6, a6, a2 ; ZVBB-RV64-NEXT: vl1r.v v13, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: csrr a6, vlenb +; ZVBB-RV64-NEXT: li a7, 12 +; ZVBB-RV64-NEXT: mul a6, a6, a7 +; ZVBB-RV64-NEXT: add a6, sp, a6 +; ZVBB-RV64-NEXT: addi a6, a6, 64 +; ZVBB-RV64-NEXT: vl1r.v v17, (a1) +; ZVBB-RV64-NEXT: vl1r.v v10, (a4) +; ZVBB-RV64-NEXT: vl1r.v v11, (a5) +; ZVBB-RV64-NEXT: vl1r.v v8, (a0) +; ZVBB-RV64-NEXT: vl1r.v v9, (a3) ; ZVBB-RV64-NEXT: slli a2, a2, 3 -; ZVBB-RV64-NEXT: add a2, a0, a2 -; ZVBB-RV64-NEXT: vl1r.v v14, (a6) -; ZVBB-RV64-NEXT: vl1r.v v15, (a1) -; ZVBB-RV64-NEXT: add a5, a0, a5 -; ZVBB-RV64-NEXT: vs2r.v v20, (a5) +; ZVBB-RV64-NEXT: add a2, a6, a2 ; ZVBB-RV64-NEXT: vs4r.v v16, (a2) -; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vs8r.v v8, (a6) ; ZVBB-RV64-NEXT: vl8r.v v16, (a2) -; ZVBB-RV64-NEXT: vl8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8r.v v8, (a6) ; ZVBB-RV64-NEXT: addi sp, s0, -80 ; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: addi sp, sp, 80 ; ZVBB-RV64-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv112i8_nxv16i8: +; ZIP-LABEL: vector_interleave_nxv96i8_nxv16i8: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -80 ; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; ZIP-NEXT: addi s0, sp, 80 ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: li a1, 28 +; ZIP-NEXT: mul a0, a0, a1 ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: andi sp, sp, -64 ; ZIP-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; ZIP-NEXT: vmv2r.v v26, v20 -; ZIP-NEXT: addi a0, sp, 64 -; ZIP-NEXT: vmv2r.v v24, v16 +; ZIP-NEXT: vmv2r.v v20, v14 +; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: vmv2r.v v24, v10 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a2, a1, 3 -; ZIP-NEXT: sub a1, a2, a1 +; ZIP-NEXT: li a0, 6 +; ZIP-NEXT: mul a1, a1, a0 ; ZIP-NEXT: add a1, sp, a1 ; ZIP-NEXT: addi a1, a1, 64 -; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: vmv1r.v v10, v25 +; ZIP-NEXT: vmv1r.v v11, v23 +; ZIP-NEXT: vmv1r.v v12, v21 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv1r.v v13, v17 ; ZIP-NEXT: csrr a2, vlenb -; ZIP-NEXT: vmv2r.v v20, v8 -; ZIP-NEXT: vmv1r.v v1, v20 -; ZIP-NEXT: vmv1r.v v3, v22 -; ZIP-NEXT: vmv1r.v v5, v24 -; ZIP-NEXT: vmv1r.v v7, v26 +; ZIP-NEXT: vmv1r.v v14, v19 +; ZIP-NEXT: vsseg6e8.v v9, (a1) +; ZIP-NEXT: vmv1r.v v9, v24 +; ZIP-NEXT: add a5, a1, a2 +; ZIP-NEXT: vmv1r.v v10, v22 ; ZIP-NEXT: add a3, a0, a2 -; ZIP-NEXT: vmv1r.v v2, v10 -; ZIP-NEXT: add a4, a1, a2 -; ZIP-NEXT: slli a5, a2, 2 -; ZIP-NEXT: vmv1r.v v4, v14 -; ZIP-NEXT: slli a6, a2, 4 -; ZIP-NEXT: add a7, a4, a2 -; ZIP-NEXT: vmv1r.v v6, v18 -; ZIP-NEXT: sub a5, a6, a5 -; ZIP-NEXT: vmv1r.v v22, v11 -; ZIP-NEXT: add a6, a7, a2 -; ZIP-NEXT: vmv1r.v v24, v15 -; ZIP-NEXT: vsseg7e8.v v1, (a0) -; ZIP-NEXT: vmv1r.v v26, v19 -; ZIP-NEXT: vsseg7e8.v v21, (a1) -; ZIP-NEXT: vl1r.v v18, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1r.v v19, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1r.v v20, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1r.v v21, (a6) -; ZIP-NEXT: add a6, a3, a2 -; ZIP-NEXT: vl1r.v v10, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1r.v v11, (a6) -; ZIP-NEXT: vl1r.v v8, (a0) 
-; ZIP-NEXT: vl1r.v v16, (a4) -; ZIP-NEXT: vl1r.v v9, (a3) -; ZIP-NEXT: vl1r.v v17, (a7) -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a3, 14 -; ZIP-NEXT: mul a0, a0, a3 -; ZIP-NEXT: add a0, sp, a0 -; ZIP-NEXT: addi a0, a0, 64 -; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vmv1r.v v11, v20 +; ZIP-NEXT: add a4, a3, a2 +; ZIP-NEXT: vmv1r.v v12, v16 +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vmv1r.v v13, v18 +; ZIP-NEXT: vsseg6e8.v v8, (a0) +; ZIP-NEXT: vl1r.v v14, (a1) +; ZIP-NEXT: add a1, a6, a2 +; ZIP-NEXT: vl1r.v v15, (a5) +; ZIP-NEXT: add a5, a1, a2 +; ZIP-NEXT: vl1r.v v18, (a5) +; ZIP-NEXT: add a5, a5, a2 +; ZIP-NEXT: vl1r.v v19, (a5) +; ZIP-NEXT: add a5, a4, a2 +; ZIP-NEXT: vl1r.v v16, (a6) +; ZIP-NEXT: add a6, a5, a2 ; ZIP-NEXT: vl1r.v v12, (a6) ; ZIP-NEXT: add a6, a6, a2 ; ZIP-NEXT: vl1r.v v13, (a6) -; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: csrr a6, vlenb +; ZIP-NEXT: li a7, 12 +; ZIP-NEXT: mul a6, a6, a7 +; ZIP-NEXT: add a6, sp, a6 +; ZIP-NEXT: addi a6, a6, 64 +; ZIP-NEXT: vl1r.v v17, (a1) +; ZIP-NEXT: vl1r.v v10, (a4) +; ZIP-NEXT: vl1r.v v11, (a5) +; ZIP-NEXT: vl1r.v v8, (a0) +; ZIP-NEXT: vl1r.v v9, (a3) ; ZIP-NEXT: slli a2, a2, 3 -; ZIP-NEXT: add a2, a0, a2 -; ZIP-NEXT: vl1r.v v14, (a6) -; ZIP-NEXT: vl1r.v v15, (a1) -; ZIP-NEXT: add a5, a0, a5 -; ZIP-NEXT: vs2r.v v20, (a5) +; ZIP-NEXT: add a2, a6, a2 ; ZIP-NEXT: vs4r.v v16, (a2) -; ZIP-NEXT: vs8r.v v8, (a0) +; ZIP-NEXT: vs8r.v v8, (a6) ; ZIP-NEXT: vl8r.v v16, (a2) -; ZIP-NEXT: vl8r.v v8, (a0) +; ZIP-NEXT: vl8r.v v8, (a6) ; ZIP-NEXT: addi sp, s0, -80 ; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZIP-NEXT: addi sp, sp, 80 ; ZIP-NEXT: ret - %res = call @llvm.vector.interleave7.nxv112i8( %a, %b, %c, %d, %e, %f, %g) - ret %res + %res = call @llvm.vector.interleave6.nxv96i8( %a, %b, %c, %d, %e, %f) + ret %res } - -define @vector_interleave_nxv56i16_nxv8i16( %a, %b, %c, %d, %e, %f, %g) nounwind { +define @vector_interleave_nxv48i8_nxv8i8( %a, %b, %c, %d, %e, %f) nounwind { +; CHECK-LABEL: vector_interleave_nxv48i8_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg6e8.v v8, (a0) +; CHECK-NEXT: vl1r.v v10, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1r.v v11, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1r.v v8, (a0) +; CHECK-NEXT: vl1r.v v9, (a2) +; CHECK-NEXT: vl1r.v v12, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1r.v v13, (a1) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; RV32-LABEL: vector_interleave_nxv56i16_nxv8i16: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -80 -; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; RV32-NEXT: addi s0, sp, 80 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; RV32-NEXT: vmv2r.v v26, v20 -; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: vmv2r.v v24, v16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 64 -; RV32-NEXT: vmv2r.v v22, v12 -; RV32-NEXT: csrr a2, 
vlenb -; RV32-NEXT: vmv2r.v v20, v8 -; RV32-NEXT: vmv1r.v v1, v20 -; RV32-NEXT: vmv1r.v v3, v22 -; RV32-NEXT: vmv1r.v v5, v24 -; RV32-NEXT: vmv1r.v v7, v26 -; RV32-NEXT: add a3, a0, a2 -; RV32-NEXT: vmv1r.v v2, v10 -; RV32-NEXT: add a4, a1, a2 -; RV32-NEXT: slli a5, a2, 2 -; RV32-NEXT: vmv1r.v v4, v14 -; RV32-NEXT: slli a6, a2, 4 -; RV32-NEXT: add a7, a4, a2 -; RV32-NEXT: vmv1r.v v6, v18 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: vmv1r.v v22, v11 -; RV32-NEXT: add a6, a7, a2 -; RV32-NEXT: vmv1r.v v24, v15 -; RV32-NEXT: vsseg7e16.v v1, (a0) -; RV32-NEXT: vmv1r.v v26, v19 -; RV32-NEXT: vsseg7e16.v v21, (a1) -; RV32-NEXT: vl1re16.v v18, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re16.v v19, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re16.v v20, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re16.v v21, (a6) -; RV32-NEXT: add a6, a3, a2 -; RV32-NEXT: vl1re16.v v10, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re16.v v11, (a6) -; RV32-NEXT: vl1re16.v v8, (a0) -; RV32-NEXT: vl1re16.v v16, (a4) -; RV32-NEXT: vl1re16.v v9, (a3) -; RV32-NEXT: vl1re16.v v17, (a7) +; ZVBB-LABEL: vector_interleave_nxv48i8_nxv8i8: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg6e8.v v8, (a0) +; ZVBB-NEXT: vl1r.v v10, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1r.v v11, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1r.v v8, (a0) +; ZVBB-NEXT: vl1r.v v9, (a2) +; ZVBB-NEXT: vl1r.v v12, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1r.v v13, (a1) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave6.nxv48i8( %a, %b, %c, %d, %e, %f) + ret %res +} + +define @vector_interleave_nxv24i32_nxv4i32( %a, %b, %c, %d, %e, %f) nounwind { +; +; RV32-LABEL: vector_interleave_nxv24i32_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 64 -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re16.v v12, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re16.v v13, (a6) +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v14 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: vmv2r.v v24, v10 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a0, 6 +; RV32-NEXT: mul a1, a1, a0 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv1r.v v10, v25 +; RV32-NEXT: vmv1r.v v11, v23 +; RV32-NEXT: vmv1r.v v12, v21 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv1r.v v13, v17 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv1r.v v14, v19 +; RV32-NEXT: vsseg6e32.v v9, (a1) +; RV32-NEXT: vmv1r.v v9, v24 +; RV32-NEXT: add a5, a1, a2 +; RV32-NEXT: vmv1r.v v10, v22 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v11, v20 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vmv1r.v v12, v16 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v13, v18 +; RV32-NEXT: vsseg6e32.v v8, (a0) +; 
RV32-NEXT: vl1re32.v v14, (a1) +; RV32-NEXT: add a1, a6, a2 +; RV32-NEXT: vl1re32.v v15, (a5) +; RV32-NEXT: add a5, a1, a2 +; RV32-NEXT: vl1re32.v v18, (a5) +; RV32-NEXT: add a5, a5, a2 +; RV32-NEXT: vl1re32.v v19, (a5) +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vl1re32.v v16, (a6) +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vl1re32.v v12, (a6) ; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v13, (a6) +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 12 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 64 +; RV32-NEXT: vl1re32.v v17, (a1) +; RV32-NEXT: vl1re32.v v10, (a4) +; RV32-NEXT: vl1re32.v v11, (a5) +; RV32-NEXT: vl1re32.v v8, (a0) +; RV32-NEXT: vl1re32.v v9, (a3) ; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, a0, a2 -; RV32-NEXT: vl1re16.v v14, (a6) -; RV32-NEXT: vl1re16.v v15, (a1) -; RV32-NEXT: add a5, a0, a5 -; RV32-NEXT: vs2r.v v20, (a5) +; RV32-NEXT: add a2, a6, a2 ; RV32-NEXT: vs4r.v v16, (a2) -; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: vl8re16.v v16, (a2) -; RV32-NEXT: vl8re16.v v8, (a0) +; RV32-NEXT: vs8r.v v8, (a6) +; RV32-NEXT: vl8re32.v v16, (a2) +; RV32-NEXT: vl8re32.v v8, (a6) ; RV32-NEXT: addi sp, s0, -80 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 80 ; RV32-NEXT: ret ; -; RV64-LABEL: vector_interleave_nxv56i16_nxv8i16: +; RV64-LABEL: vector_interleave_nxv24i32_nxv4i32: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -80 ; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 80 ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: sub sp, sp, a0 ; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; RV64-NEXT: vmv2r.v v26, v20 -; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v14 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: vmv2r.v v24, v10 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: li a0, 6 +; RV64-NEXT: mul a1, a1, a0 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 64 -; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: vmv1r.v v10, v25 +; RV64-NEXT: vmv1r.v v11, v23 +; RV64-NEXT: vmv1r.v v12, v21 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv1r.v v13, v17 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: vmv2r.v v20, v8 -; RV64-NEXT: vmv1r.v v1, v20 -; RV64-NEXT: vmv1r.v v3, v22 -; RV64-NEXT: vmv1r.v v5, v24 -; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: vmv1r.v v14, v19 +; RV64-NEXT: vsseg6e32.v v9, (a1) +; RV64-NEXT: vmv1r.v v9, v24 +; RV64-NEXT: add a5, a1, a2 +; RV64-NEXT: vmv1r.v v10, v22 ; RV64-NEXT: add a3, a0, a2 -; RV64-NEXT: vmv1r.v v2, v10 -; RV64-NEXT: add a4, a1, a2 -; RV64-NEXT: slli a5, a2, 2 -; RV64-NEXT: vmv1r.v v4, v14 -; RV64-NEXT: slli a6, a2, 4 -; RV64-NEXT: add a7, a4, a2 -; RV64-NEXT: vmv1r.v v6, v18 -; RV64-NEXT: sub a5, a6, a5 -; RV64-NEXT: vmv1r.v v22, v11 -; RV64-NEXT: add a6, a7, a2 -; RV64-NEXT: vmv1r.v v24, v15 -; RV64-NEXT: vsseg7e16.v v1, (a0) -; RV64-NEXT: vmv1r.v v26, v19 -; RV64-NEXT: vsseg7e16.v v21, (a1) -; RV64-NEXT: vl1re16.v v18, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re16.v v19, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re16.v v20, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re16.v v21, (a6) -; RV64-NEXT: add a6, a3, a2 -; RV64-NEXT: vl1re16.v v10, (a6) -; RV64-NEXT: add a6, a6, a2 -; 
RV64-NEXT: vl1re16.v v11, (a6) -; RV64-NEXT: vl1re16.v v8, (a0) -; RV64-NEXT: vl1re16.v v16, (a4) -; RV64-NEXT: vl1re16.v v9, (a3) -; RV64-NEXT: vl1re16.v v17, (a7) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a3, 14 -; RV64-NEXT: mul a0, a0, a3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 64 -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re16.v v12, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re16.v v13, (a6) +; RV64-NEXT: vmv1r.v v11, v20 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vmv1r.v v12, v16 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v13, v18 +; RV64-NEXT: vsseg6e32.v v8, (a0) +; RV64-NEXT: vl1re32.v v14, (a1) +; RV64-NEXT: add a1, a6, a2 +; RV64-NEXT: vl1re32.v v15, (a5) +; RV64-NEXT: add a5, a1, a2 +; RV64-NEXT: vl1re32.v v18, (a5) +; RV64-NEXT: add a5, a5, a2 +; RV64-NEXT: vl1re32.v v19, (a5) +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vl1re32.v v16, (a6) +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vl1re32.v v12, (a6) ; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v13, (a6) +; RV64-NEXT: csrr a6, vlenb +; RV64-NEXT: li a7, 12 +; RV64-NEXT: mul a6, a6, a7 +; RV64-NEXT: add a6, sp, a6 +; RV64-NEXT: addi a6, a6, 64 +; RV64-NEXT: vl1re32.v v17, (a1) +; RV64-NEXT: vl1re32.v v10, (a4) +; RV64-NEXT: vl1re32.v v11, (a5) +; RV64-NEXT: vl1re32.v v8, (a0) +; RV64-NEXT: vl1re32.v v9, (a3) ; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, a0, a2 -; RV64-NEXT: vl1re16.v v14, (a6) -; RV64-NEXT: vl1re16.v v15, (a1) -; RV64-NEXT: add a5, a0, a5 -; RV64-NEXT: vs2r.v v20, (a5) +; RV64-NEXT: add a2, a6, a2 ; RV64-NEXT: vs4r.v v16, (a2) -; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: vl8re16.v v16, (a2) -; RV64-NEXT: vl8re16.v v8, (a0) +; RV64-NEXT: vs8r.v v8, (a6) +; RV64-NEXT: vl8re32.v v16, (a2) +; RV64-NEXT: vl8re32.v v8, (a6) ; RV64-NEXT: addi sp, s0, -80 ; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret ; -; ZVBB-RV32-LABEL: vector_interleave_nxv56i16_nxv8i16: +; ZVBB-RV32-LABEL: vector_interleave_nxv24i32_nxv4i32: ; ZVBB-RV32: # %bb.0: ; ZVBB-RV32-NEXT: addi sp, sp, -80 ; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill ; ZVBB-RV32-NEXT: addi s0, sp, 80 ; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 ; ZVBB-RV32-NEXT: sub sp, sp, a0 ; ZVBB-RV32-NEXT: andi sp, sp, -64 -; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVBB-RV32-NEXT: vmv2r.v v26, v20 -; ZVBB-RV32-NEXT: addi a0, sp, 64 -; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v14 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: vmv2r.v v24, v10 ; ZVBB-RV32-NEXT: csrr a1, vlenb -; ZVBB-RV32-NEXT: slli a2, a1, 3 -; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: li a0, 6 +; ZVBB-RV32-NEXT: mul a1, a1, a0 ; ZVBB-RV32-NEXT: add a1, sp, a1 ; ZVBB-RV32-NEXT: addi a1, a1, 64 -; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: vmv1r.v v10, v25 +; ZVBB-RV32-NEXT: vmv1r.v v11, v23 +; ZVBB-RV32-NEXT: vmv1r.v v12, v21 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv1r.v v13, v17 ; ZVBB-RV32-NEXT: csrr a2, vlenb -; ZVBB-RV32-NEXT: vmv2r.v v20, v8 -; ZVBB-RV32-NEXT: vmv1r.v v1, v20 -; ZVBB-RV32-NEXT: vmv1r.v v3, v22 -; ZVBB-RV32-NEXT: vmv1r.v v5, v24 -; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: vmv1r.v v14, v19 +; ZVBB-RV32-NEXT: vsseg6e32.v v9, (a1) +; ZVBB-RV32-NEXT: vmv1r.v v9, v24 +; 
ZVBB-RV32-NEXT: add a5, a1, a2 +; ZVBB-RV32-NEXT: vmv1r.v v10, v22 ; ZVBB-RV32-NEXT: add a3, a0, a2 -; ZVBB-RV32-NEXT: vmv1r.v v2, v10 -; ZVBB-RV32-NEXT: add a4, a1, a2 -; ZVBB-RV32-NEXT: slli a5, a2, 2 -; ZVBB-RV32-NEXT: vmv1r.v v4, v14 -; ZVBB-RV32-NEXT: slli a6, a2, 4 -; ZVBB-RV32-NEXT: add a7, a4, a2 -; ZVBB-RV32-NEXT: vmv1r.v v6, v18 -; ZVBB-RV32-NEXT: sub a5, a6, a5 -; ZVBB-RV32-NEXT: vmv1r.v v22, v11 -; ZVBB-RV32-NEXT: add a6, a7, a2 -; ZVBB-RV32-NEXT: vmv1r.v v24, v15 -; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0) -; ZVBB-RV32-NEXT: vmv1r.v v26, v19 -; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1) -; ZVBB-RV32-NEXT: vl1re16.v v18, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re16.v v19, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re16.v v20, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re16.v v21, (a6) -; ZVBB-RV32-NEXT: add a6, a3, a2 -; ZVBB-RV32-NEXT: vl1re16.v v10, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re16.v v11, (a6) -; ZVBB-RV32-NEXT: vl1re16.v v8, (a0) -; ZVBB-RV32-NEXT: vl1re16.v v16, (a4) -; ZVBB-RV32-NEXT: vl1re16.v v9, (a3) -; ZVBB-RV32-NEXT: vl1re16.v v17, (a7) -; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: li a3, 14 -; ZVBB-RV32-NEXT: mul a0, a0, a3 -; ZVBB-RV32-NEXT: add a0, sp, a0 -; ZVBB-RV32-NEXT: addi a0, a0, 64 -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re16.v v12, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV32-NEXT: vmv1r.v v11, v20 +; ZVBB-RV32-NEXT: add a4, a3, a2 +; ZVBB-RV32-NEXT: vmv1r.v v12, v16 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v13, v18 +; ZVBB-RV32-NEXT: vsseg6e32.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re32.v v14, (a1) +; ZVBB-RV32-NEXT: add a1, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v15, (a5) +; ZVBB-RV32-NEXT: add a5, a1, a2 +; ZVBB-RV32-NEXT: vl1re32.v v18, (a5) +; ZVBB-RV32-NEXT: add a5, a5, a2 +; ZVBB-RV32-NEXT: vl1re32.v v19, (a5) +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vl1re32.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vl1re32.v v12, (a6) ; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v13, (a6) +; ZVBB-RV32-NEXT: csrr a6, vlenb +; ZVBB-RV32-NEXT: li a7, 12 +; ZVBB-RV32-NEXT: mul a6, a6, a7 +; ZVBB-RV32-NEXT: add a6, sp, a6 +; ZVBB-RV32-NEXT: addi a6, a6, 64 +; ZVBB-RV32-NEXT: vl1re32.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re32.v v10, (a4) +; ZVBB-RV32-NEXT: vl1re32.v v11, (a5) +; ZVBB-RV32-NEXT: vl1re32.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re32.v v9, (a3) ; ZVBB-RV32-NEXT: slli a2, a2, 3 -; ZVBB-RV32-NEXT: add a2, a0, a2 -; ZVBB-RV32-NEXT: vl1re16.v v14, (a6) -; ZVBB-RV32-NEXT: vl1re16.v v15, (a1) -; ZVBB-RV32-NEXT: add a5, a0, a5 -; ZVBB-RV32-NEXT: vs2r.v v20, (a5) +; ZVBB-RV32-NEXT: add a2, a6, a2 ; ZVBB-RV32-NEXT: vs4r.v v16, (a2) -; ZVBB-RV32-NEXT: vs8r.v v8, (a0) -; ZVBB-RV32-NEXT: vl8re16.v v16, (a2) -; ZVBB-RV32-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV32-NEXT: vs8r.v v8, (a6) +; ZVBB-RV32-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re32.v v8, (a6) ; ZVBB-RV32-NEXT: addi sp, s0, -80 ; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: addi sp, sp, 80 ; ZVBB-RV32-NEXT: ret ; -; ZVBB-RV64-LABEL: vector_interleave_nxv56i16_nxv8i16: +; ZVBB-RV64-LABEL: vector_interleave_nxv24i32_nxv4i32: ; ZVBB-RV64: # %bb.0: ; ZVBB-RV64-NEXT: addi sp, sp, -80 ; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; ZVBB-RV64-NEXT: addi s0, sp, 80 ; 
ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 ; ZVBB-RV64-NEXT: sub sp, sp, a0 ; ZVBB-RV64-NEXT: andi sp, sp, -64 -; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVBB-RV64-NEXT: vmv2r.v v26, v20 -; ZVBB-RV64-NEXT: addi a0, sp, 64 -; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v14 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: vmv2r.v v24, v10 ; ZVBB-RV64-NEXT: csrr a1, vlenb -; ZVBB-RV64-NEXT: slli a2, a1, 3 -; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: li a0, 6 +; ZVBB-RV64-NEXT: mul a1, a1, a0 ; ZVBB-RV64-NEXT: add a1, sp, a1 ; ZVBB-RV64-NEXT: addi a1, a1, 64 -; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: vmv1r.v v10, v25 +; ZVBB-RV64-NEXT: vmv1r.v v11, v23 +; ZVBB-RV64-NEXT: vmv1r.v v12, v21 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv1r.v v13, v17 ; ZVBB-RV64-NEXT: csrr a2, vlenb -; ZVBB-RV64-NEXT: vmv2r.v v20, v8 -; ZVBB-RV64-NEXT: vmv1r.v v1, v20 -; ZVBB-RV64-NEXT: vmv1r.v v3, v22 -; ZVBB-RV64-NEXT: vmv1r.v v5, v24 -; ZVBB-RV64-NEXT: vmv1r.v v7, v26 +; ZVBB-RV64-NEXT: vmv1r.v v14, v19 +; ZVBB-RV64-NEXT: vsseg6e32.v v9, (a1) +; ZVBB-RV64-NEXT: vmv1r.v v9, v24 +; ZVBB-RV64-NEXT: add a5, a1, a2 +; ZVBB-RV64-NEXT: vmv1r.v v10, v22 ; ZVBB-RV64-NEXT: add a3, a0, a2 -; ZVBB-RV64-NEXT: vmv1r.v v2, v10 -; ZVBB-RV64-NEXT: add a4, a1, a2 -; ZVBB-RV64-NEXT: slli a5, a2, 2 -; ZVBB-RV64-NEXT: vmv1r.v v4, v14 -; ZVBB-RV64-NEXT: slli a6, a2, 4 -; ZVBB-RV64-NEXT: add a7, a4, a2 -; ZVBB-RV64-NEXT: vmv1r.v v6, v18 -; ZVBB-RV64-NEXT: sub a5, a6, a5 -; ZVBB-RV64-NEXT: vmv1r.v v22, v11 -; ZVBB-RV64-NEXT: add a6, a7, a2 -; ZVBB-RV64-NEXT: vmv1r.v v24, v15 -; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0) -; ZVBB-RV64-NEXT: vmv1r.v v26, v19 -; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1) -; ZVBB-RV64-NEXT: vl1re16.v v18, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re16.v v19, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re16.v v20, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re16.v v21, (a6) -; ZVBB-RV64-NEXT: add a6, a3, a2 -; ZVBB-RV64-NEXT: vl1re16.v v10, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re16.v v11, (a6) -; ZVBB-RV64-NEXT: vl1re16.v v8, (a0) -; ZVBB-RV64-NEXT: vl1re16.v v16, (a4) -; ZVBB-RV64-NEXT: vl1re16.v v9, (a3) -; ZVBB-RV64-NEXT: vl1re16.v v17, (a7) -; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: li a3, 14 -; ZVBB-RV64-NEXT: mul a0, a0, a3 -; ZVBB-RV64-NEXT: add a0, sp, a0 -; ZVBB-RV64-NEXT: addi a0, a0, 64 -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re16.v v12, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV64-NEXT: vmv1r.v v11, v20 +; ZVBB-RV64-NEXT: add a4, a3, a2 +; ZVBB-RV64-NEXT: vmv1r.v v12, v16 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v13, v18 +; ZVBB-RV64-NEXT: vsseg6e32.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re32.v v14, (a1) +; ZVBB-RV64-NEXT: add a1, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v15, (a5) +; ZVBB-RV64-NEXT: add a5, a1, a2 +; ZVBB-RV64-NEXT: vl1re32.v v18, (a5) +; ZVBB-RV64-NEXT: add a5, a5, a2 +; ZVBB-RV64-NEXT: vl1re32.v v19, (a5) +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vl1re32.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vl1re32.v v12, (a6) ; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v13, (a6) +; ZVBB-RV64-NEXT: csrr a6, vlenb +; ZVBB-RV64-NEXT: li a7, 12 +; ZVBB-RV64-NEXT: mul a6, a6, a7 +; 
ZVBB-RV64-NEXT: add a6, sp, a6 +; ZVBB-RV64-NEXT: addi a6, a6, 64 +; ZVBB-RV64-NEXT: vl1re32.v v17, (a1) +; ZVBB-RV64-NEXT: vl1re32.v v10, (a4) +; ZVBB-RV64-NEXT: vl1re32.v v11, (a5) +; ZVBB-RV64-NEXT: vl1re32.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re32.v v9, (a3) ; ZVBB-RV64-NEXT: slli a2, a2, 3 -; ZVBB-RV64-NEXT: add a2, a0, a2 -; ZVBB-RV64-NEXT: vl1re16.v v14, (a6) -; ZVBB-RV64-NEXT: vl1re16.v v15, (a1) -; ZVBB-RV64-NEXT: add a5, a0, a5 -; ZVBB-RV64-NEXT: vs2r.v v20, (a5) +; ZVBB-RV64-NEXT: add a2, a6, a2 ; ZVBB-RV64-NEXT: vs4r.v v16, (a2) -; ZVBB-RV64-NEXT: vs8r.v v8, (a0) -; ZVBB-RV64-NEXT: vl8re16.v v16, (a2) -; ZVBB-RV64-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV64-NEXT: vs8r.v v8, (a6) +; ZVBB-RV64-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re32.v v8, (a6) ; ZVBB-RV64-NEXT: addi sp, s0, -80 ; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: addi sp, sp, 80 ; ZVBB-RV64-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv56i16_nxv8i16: +; ZIP-LABEL: vector_interleave_nxv24i32_nxv4i32: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -80 ; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; ZIP-NEXT: addi s0, sp, 80 ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: li a1, 28 +; ZIP-NEXT: mul a0, a0, a1 ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: andi sp, sp, -64 -; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZIP-NEXT: vmv2r.v v26, v20 -; ZIP-NEXT: addi a0, sp, 64 -; ZIP-NEXT: vmv2r.v v24, v16 +; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZIP-NEXT: vmv2r.v v20, v14 +; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: vmv2r.v v24, v10 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a2, a1, 3 -; ZIP-NEXT: sub a1, a2, a1 +; ZIP-NEXT: li a0, 6 +; ZIP-NEXT: mul a1, a1, a0 ; ZIP-NEXT: add a1, sp, a1 ; ZIP-NEXT: addi a1, a1, 64 -; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: vmv1r.v v10, v25 +; ZIP-NEXT: vmv1r.v v11, v23 +; ZIP-NEXT: vmv1r.v v12, v21 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv1r.v v13, v17 ; ZIP-NEXT: csrr a2, vlenb -; ZIP-NEXT: vmv2r.v v20, v8 -; ZIP-NEXT: vmv1r.v v1, v20 -; ZIP-NEXT: vmv1r.v v3, v22 -; ZIP-NEXT: vmv1r.v v5, v24 -; ZIP-NEXT: vmv1r.v v7, v26 +; ZIP-NEXT: vmv1r.v v14, v19 +; ZIP-NEXT: vsseg6e32.v v9, (a1) +; ZIP-NEXT: vmv1r.v v9, v24 +; ZIP-NEXT: add a5, a1, a2 +; ZIP-NEXT: vmv1r.v v10, v22 ; ZIP-NEXT: add a3, a0, a2 -; ZIP-NEXT: vmv1r.v v2, v10 -; ZIP-NEXT: add a4, a1, a2 -; ZIP-NEXT: slli a5, a2, 2 -; ZIP-NEXT: vmv1r.v v4, v14 -; ZIP-NEXT: slli a6, a2, 4 -; ZIP-NEXT: add a7, a4, a2 -; ZIP-NEXT: vmv1r.v v6, v18 -; ZIP-NEXT: sub a5, a6, a5 -; ZIP-NEXT: vmv1r.v v22, v11 -; ZIP-NEXT: add a6, a7, a2 -; ZIP-NEXT: vmv1r.v v24, v15 -; ZIP-NEXT: vsseg7e16.v v1, (a0) -; ZIP-NEXT: vmv1r.v v26, v19 -; ZIP-NEXT: vsseg7e16.v v21, (a1) -; ZIP-NEXT: vl1re16.v v18, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re16.v v19, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re16.v v20, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re16.v v21, (a6) -; ZIP-NEXT: add a6, a3, a2 -; ZIP-NEXT: vl1re16.v v10, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re16.v v11, (a6) -; ZIP-NEXT: vl1re16.v v8, (a0) -; ZIP-NEXT: vl1re16.v v16, (a4) -; ZIP-NEXT: vl1re16.v v9, (a3) -; ZIP-NEXT: vl1re16.v v17, (a7) -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a3, 14 -; ZIP-NEXT: mul a0, a0, a3 -; ZIP-NEXT: add a0, sp, a0 -; ZIP-NEXT: addi a0, a0, 64 -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re16.v v12, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re16.v v13, (a6) +; ZIP-NEXT: vmv1r.v 
v11, v20 +; ZIP-NEXT: add a4, a3, a2 +; ZIP-NEXT: vmv1r.v v12, v16 +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vmv1r.v v13, v18 +; ZIP-NEXT: vsseg6e32.v v8, (a0) +; ZIP-NEXT: vl1re32.v v14, (a1) +; ZIP-NEXT: add a1, a6, a2 +; ZIP-NEXT: vl1re32.v v15, (a5) +; ZIP-NEXT: add a5, a1, a2 +; ZIP-NEXT: vl1re32.v v18, (a5) +; ZIP-NEXT: add a5, a5, a2 +; ZIP-NEXT: vl1re32.v v19, (a5) +; ZIP-NEXT: add a5, a4, a2 +; ZIP-NEXT: vl1re32.v v16, (a6) +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vl1re32.v v12, (a6) ; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v13, (a6) +; ZIP-NEXT: csrr a6, vlenb +; ZIP-NEXT: li a7, 12 +; ZIP-NEXT: mul a6, a6, a7 +; ZIP-NEXT: add a6, sp, a6 +; ZIP-NEXT: addi a6, a6, 64 +; ZIP-NEXT: vl1re32.v v17, (a1) +; ZIP-NEXT: vl1re32.v v10, (a4) +; ZIP-NEXT: vl1re32.v v11, (a5) +; ZIP-NEXT: vl1re32.v v8, (a0) +; ZIP-NEXT: vl1re32.v v9, (a3) ; ZIP-NEXT: slli a2, a2, 3 -; ZIP-NEXT: add a2, a0, a2 -; ZIP-NEXT: vl1re16.v v14, (a6) -; ZIP-NEXT: vl1re16.v v15, (a1) -; ZIP-NEXT: add a5, a0, a5 -; ZIP-NEXT: vs2r.v v20, (a5) +; ZIP-NEXT: add a2, a6, a2 ; ZIP-NEXT: vs4r.v v16, (a2) -; ZIP-NEXT: vs8r.v v8, (a0) -; ZIP-NEXT: vl8re16.v v16, (a2) -; ZIP-NEXT: vl8re16.v v8, (a0) +; ZIP-NEXT: vs8r.v v8, (a6) +; ZIP-NEXT: vl8re32.v v16, (a2) +; ZIP-NEXT: vl8re32.v v8, (a6) ; ZIP-NEXT: addi sp, s0, -80 ; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZIP-NEXT: addi sp, sp, 80 ; ZIP-NEXT: ret - %res = call @llvm.vector.interleave7.nxv56i16( %a, %b, %c, %d, %e, %f, %g) - ret %res + %res = call @llvm.vector.interleave6.nxv4i32( %a, %b, %c, %d, %e, %f) + ret %res } - -define @vector_interleave_nxv28i32_nxv4i32( %a, %b, %c, %d, %e, %f, %g) nounwind { +define @vector_interleave_nxv12i64_nxv2i64( %a, %b, %c, %d, %e, %f) nounwind { ; -; RV32-LABEL: vector_interleave_nxv28i32_nxv4i32: +; RV32-LABEL: vector_interleave_nxv12i64_nxv2i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill ; RV32-NEXT: addi s0, sp, 80 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; RV32-NEXT: vmv2r.v v26, v20 -; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: vmv2r.v v24, v16 +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v14 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: vmv2r.v v24, v10 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: li a0, 6 +; RV32-NEXT: mul a1, a1, a0 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 64 -; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: vmv1r.v v10, v25 +; RV32-NEXT: vmv1r.v v11, v23 +; RV32-NEXT: vmv1r.v v12, v21 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv1r.v v13, v17 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vmv2r.v v20, v8 -; RV32-NEXT: vmv1r.v v1, v20 -; RV32-NEXT: vmv1r.v v3, v22 -; RV32-NEXT: vmv1r.v v5, v24 -; RV32-NEXT: vmv1r.v v7, v26 +; RV32-NEXT: vmv1r.v v14, v19 +; RV32-NEXT: vsseg6e64.v v9, (a1) +; RV32-NEXT: vmv1r.v v9, v24 +; RV32-NEXT: add a5, a1, a2 +; RV32-NEXT: vmv1r.v v10, v22 ; RV32-NEXT: add a3, a0, a2 -; RV32-NEXT: vmv1r.v v2, v10 -; RV32-NEXT: add a4, a1, a2 -; RV32-NEXT: slli a5, a2, 2 -; RV32-NEXT: vmv1r.v v4, v14 -; RV32-NEXT: slli a6, a2, 4 -; RV32-NEXT: add a7, a4, a2 -; RV32-NEXT: vmv1r.v v6, v18 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: vmv1r.v v22, v11 -; RV32-NEXT: add a6, a7, a2 
-; RV32-NEXT: vmv1r.v v24, v15 -; RV32-NEXT: vsseg7e32.v v1, (a0) -; RV32-NEXT: vmv1r.v v26, v19 -; RV32-NEXT: vsseg7e32.v v21, (a1) -; RV32-NEXT: vl1re32.v v18, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re32.v v19, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re32.v v20, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re32.v v21, (a6) -; RV32-NEXT: add a6, a3, a2 -; RV32-NEXT: vl1re32.v v10, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re32.v v11, (a6) -; RV32-NEXT: vl1re32.v v8, (a0) -; RV32-NEXT: vl1re32.v v16, (a4) -; RV32-NEXT: vl1re32.v v9, (a3) -; RV32-NEXT: vl1re32.v v17, (a7) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 64 -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re32.v v12, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re32.v v13, (a6) +; RV32-NEXT: vmv1r.v v11, v20 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vmv1r.v v12, v16 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v13, v18 +; RV32-NEXT: vsseg6e64.v v8, (a0) +; RV32-NEXT: vl1re64.v v14, (a1) +; RV32-NEXT: add a1, a6, a2 +; RV32-NEXT: vl1re64.v v15, (a5) +; RV32-NEXT: add a5, a1, a2 +; RV32-NEXT: vl1re64.v v18, (a5) +; RV32-NEXT: add a5, a5, a2 +; RV32-NEXT: vl1re64.v v19, (a5) +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vl1re64.v v16, (a6) +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vl1re64.v v12, (a6) ; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v13, (a6) +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 12 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 64 +; RV32-NEXT: vl1re64.v v17, (a1) +; RV32-NEXT: vl1re64.v v10, (a4) +; RV32-NEXT: vl1re64.v v11, (a5) +; RV32-NEXT: vl1re64.v v8, (a0) +; RV32-NEXT: vl1re64.v v9, (a3) ; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, a0, a2 -; RV32-NEXT: vl1re32.v v14, (a6) -; RV32-NEXT: vl1re32.v v15, (a1) -; RV32-NEXT: add a5, a0, a5 -; RV32-NEXT: vs2r.v v20, (a5) +; RV32-NEXT: add a2, a6, a2 ; RV32-NEXT: vs4r.v v16, (a2) -; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: vl8re32.v v16, (a2) -; RV32-NEXT: vl8re32.v v8, (a0) +; RV32-NEXT: vs8r.v v8, (a6) +; RV32-NEXT: vl8re64.v v16, (a2) +; RV32-NEXT: vl8re64.v v8, (a6) ; RV32-NEXT: addi sp, s0, -80 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 80 ; RV32-NEXT: ret ; -; RV64-LABEL: vector_interleave_nxv28i32_nxv4i32: +; RV64-LABEL: vector_interleave_nxv12i64_nxv2i64: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -80 ; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 80 ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: sub sp, sp, a0 ; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; RV64-NEXT: vmv2r.v v26, v20 -; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v14 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: vmv2r.v v24, v10 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: li a0, 6 +; RV64-NEXT: mul a1, a1, a0 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 64 -; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: vmv1r.v v10, v25 +; RV64-NEXT: vmv1r.v v11, v23 +; RV64-NEXT: vmv1r.v v12, v21 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv1r.v v13, v17 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: 
vmv2r.v v20, v8 -; RV64-NEXT: vmv1r.v v1, v20 -; RV64-NEXT: vmv1r.v v3, v22 -; RV64-NEXT: vmv1r.v v5, v24 -; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: vmv1r.v v14, v19 +; RV64-NEXT: vsseg6e64.v v9, (a1) +; RV64-NEXT: vmv1r.v v9, v24 +; RV64-NEXT: add a5, a1, a2 +; RV64-NEXT: vmv1r.v v10, v22 ; RV64-NEXT: add a3, a0, a2 -; RV64-NEXT: vmv1r.v v2, v10 -; RV64-NEXT: add a4, a1, a2 -; RV64-NEXT: slli a5, a2, 2 -; RV64-NEXT: vmv1r.v v4, v14 -; RV64-NEXT: slli a6, a2, 4 -; RV64-NEXT: add a7, a4, a2 -; RV64-NEXT: vmv1r.v v6, v18 -; RV64-NEXT: sub a5, a6, a5 -; RV64-NEXT: vmv1r.v v22, v11 -; RV64-NEXT: add a6, a7, a2 -; RV64-NEXT: vmv1r.v v24, v15 -; RV64-NEXT: vsseg7e32.v v1, (a0) -; RV64-NEXT: vmv1r.v v26, v19 -; RV64-NEXT: vsseg7e32.v v21, (a1) -; RV64-NEXT: vl1re32.v v18, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re32.v v19, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re32.v v20, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re32.v v21, (a6) -; RV64-NEXT: add a6, a3, a2 -; RV64-NEXT: vl1re32.v v10, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re32.v v11, (a6) -; RV64-NEXT: vl1re32.v v8, (a0) -; RV64-NEXT: vl1re32.v v16, (a4) -; RV64-NEXT: vl1re32.v v9, (a3) -; RV64-NEXT: vl1re32.v v17, (a7) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a3, 14 -; RV64-NEXT: mul a0, a0, a3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 64 -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re32.v v12, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re32.v v13, (a6) +; RV64-NEXT: vmv1r.v v11, v20 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vmv1r.v v12, v16 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v13, v18 +; RV64-NEXT: vsseg6e64.v v8, (a0) +; RV64-NEXT: vl1re64.v v14, (a1) +; RV64-NEXT: add a1, a6, a2 +; RV64-NEXT: vl1re64.v v15, (a5) +; RV64-NEXT: add a5, a1, a2 +; RV64-NEXT: vl1re64.v v18, (a5) +; RV64-NEXT: add a5, a5, a2 +; RV64-NEXT: vl1re64.v v19, (a5) +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vl1re64.v v16, (a6) +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vl1re64.v v12, (a6) ; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v13, (a6) +; RV64-NEXT: csrr a6, vlenb +; RV64-NEXT: li a7, 12 +; RV64-NEXT: mul a6, a6, a7 +; RV64-NEXT: add a6, sp, a6 +; RV64-NEXT: addi a6, a6, 64 +; RV64-NEXT: vl1re64.v v17, (a1) +; RV64-NEXT: vl1re64.v v10, (a4) +; RV64-NEXT: vl1re64.v v11, (a5) +; RV64-NEXT: vl1re64.v v8, (a0) +; RV64-NEXT: vl1re64.v v9, (a3) ; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, a0, a2 -; RV64-NEXT: vl1re32.v v14, (a6) -; RV64-NEXT: vl1re32.v v15, (a1) -; RV64-NEXT: add a5, a0, a5 -; RV64-NEXT: vs2r.v v20, (a5) +; RV64-NEXT: add a2, a6, a2 ; RV64-NEXT: vs4r.v v16, (a2) -; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: vl8re32.v v16, (a2) -; RV64-NEXT: vl8re32.v v8, (a0) +; RV64-NEXT: vs8r.v v8, (a6) +; RV64-NEXT: vl8re64.v v16, (a2) +; RV64-NEXT: vl8re64.v v8, (a6) ; RV64-NEXT: addi sp, s0, -80 ; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret ; -; ZVBB-RV32-LABEL: vector_interleave_nxv28i32_nxv4i32: +; ZVBB-RV32-LABEL: vector_interleave_nxv12i64_nxv2i64: ; ZVBB-RV32: # %bb.0: ; ZVBB-RV32-NEXT: addi sp, sp, -80 ; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill ; ZVBB-RV32-NEXT: addi s0, sp, 80 ; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 ; ZVBB-RV32-NEXT: sub sp, sp, a0 ; ZVBB-RV32-NEXT: andi sp, sp, -64 -; ZVBB-RV32-NEXT: 
vsetvli a0, zero, e32, m1, ta, ma -; ZVBB-RV32-NEXT: vmv2r.v v26, v20 -; ZVBB-RV32-NEXT: addi a0, sp, 64 -; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v14 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: vmv2r.v v24, v10 ; ZVBB-RV32-NEXT: csrr a1, vlenb -; ZVBB-RV32-NEXT: slli a2, a1, 3 -; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: li a0, 6 +; ZVBB-RV32-NEXT: mul a1, a1, a0 ; ZVBB-RV32-NEXT: add a1, sp, a1 ; ZVBB-RV32-NEXT: addi a1, a1, 64 -; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: vmv1r.v v10, v25 +; ZVBB-RV32-NEXT: vmv1r.v v11, v23 +; ZVBB-RV32-NEXT: vmv1r.v v12, v21 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv1r.v v13, v17 ; ZVBB-RV32-NEXT: csrr a2, vlenb -; ZVBB-RV32-NEXT: vmv2r.v v20, v8 -; ZVBB-RV32-NEXT: vmv1r.v v1, v20 -; ZVBB-RV32-NEXT: vmv1r.v v3, v22 -; ZVBB-RV32-NEXT: vmv1r.v v5, v24 -; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: vmv1r.v v14, v19 +; ZVBB-RV32-NEXT: vsseg6e64.v v9, (a1) +; ZVBB-RV32-NEXT: vmv1r.v v9, v24 +; ZVBB-RV32-NEXT: add a5, a1, a2 +; ZVBB-RV32-NEXT: vmv1r.v v10, v22 ; ZVBB-RV32-NEXT: add a3, a0, a2 -; ZVBB-RV32-NEXT: vmv1r.v v2, v10 -; ZVBB-RV32-NEXT: add a4, a1, a2 -; ZVBB-RV32-NEXT: slli a5, a2, 2 -; ZVBB-RV32-NEXT: vmv1r.v v4, v14 -; ZVBB-RV32-NEXT: slli a6, a2, 4 -; ZVBB-RV32-NEXT: add a7, a4, a2 -; ZVBB-RV32-NEXT: vmv1r.v v6, v18 -; ZVBB-RV32-NEXT: sub a5, a6, a5 -; ZVBB-RV32-NEXT: vmv1r.v v22, v11 -; ZVBB-RV32-NEXT: add a6, a7, a2 -; ZVBB-RV32-NEXT: vmv1r.v v24, v15 -; ZVBB-RV32-NEXT: vsseg7e32.v v1, (a0) -; ZVBB-RV32-NEXT: vmv1r.v v26, v19 -; ZVBB-RV32-NEXT: vsseg7e32.v v21, (a1) -; ZVBB-RV32-NEXT: vl1re32.v v18, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re32.v v19, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re32.v v20, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re32.v v21, (a6) -; ZVBB-RV32-NEXT: add a6, a3, a2 -; ZVBB-RV32-NEXT: vl1re32.v v10, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re32.v v11, (a6) -; ZVBB-RV32-NEXT: vl1re32.v v8, (a0) -; ZVBB-RV32-NEXT: vl1re32.v v16, (a4) -; ZVBB-RV32-NEXT: vl1re32.v v9, (a3) -; ZVBB-RV32-NEXT: vl1re32.v v17, (a7) -; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: li a3, 14 -; ZVBB-RV32-NEXT: mul a0, a0, a3 -; ZVBB-RV32-NEXT: add a0, sp, a0 -; ZVBB-RV32-NEXT: addi a0, a0, 64 -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re32.v v12, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re32.v v13, (a6) +; ZVBB-RV32-NEXT: vmv1r.v v11, v20 +; ZVBB-RV32-NEXT: add a4, a3, a2 +; ZVBB-RV32-NEXT: vmv1r.v v12, v16 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v13, v18 +; ZVBB-RV32-NEXT: vsseg6e64.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re64.v v14, (a1) +; ZVBB-RV32-NEXT: add a1, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v15, (a5) +; ZVBB-RV32-NEXT: add a5, a1, a2 +; ZVBB-RV32-NEXT: vl1re64.v v18, (a5) +; ZVBB-RV32-NEXT: add a5, a5, a2 +; ZVBB-RV32-NEXT: vl1re64.v v19, (a5) +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vl1re64.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vl1re64.v v12, (a6) ; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v13, (a6) +; ZVBB-RV32-NEXT: csrr a6, vlenb +; ZVBB-RV32-NEXT: li a7, 12 +; ZVBB-RV32-NEXT: mul a6, a6, a7 +; ZVBB-RV32-NEXT: add a6, sp, a6 +; ZVBB-RV32-NEXT: addi a6, a6, 64 +; ZVBB-RV32-NEXT: vl1re64.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re64.v v10, (a4) +; ZVBB-RV32-NEXT: vl1re64.v v11, (a5) +; ZVBB-RV32-NEXT: vl1re64.v v8, (a0) +; 
ZVBB-RV32-NEXT: vl1re64.v v9, (a3) ; ZVBB-RV32-NEXT: slli a2, a2, 3 -; ZVBB-RV32-NEXT: add a2, a0, a2 -; ZVBB-RV32-NEXT: vl1re32.v v14, (a6) -; ZVBB-RV32-NEXT: vl1re32.v v15, (a1) -; ZVBB-RV32-NEXT: add a5, a0, a5 -; ZVBB-RV32-NEXT: vs2r.v v20, (a5) +; ZVBB-RV32-NEXT: add a2, a6, a2 ; ZVBB-RV32-NEXT: vs4r.v v16, (a2) -; ZVBB-RV32-NEXT: vs8r.v v8, (a0) -; ZVBB-RV32-NEXT: vl8re32.v v16, (a2) -; ZVBB-RV32-NEXT: vl8re32.v v8, (a0) +; ZVBB-RV32-NEXT: vs8r.v v8, (a6) +; ZVBB-RV32-NEXT: vl8re64.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re64.v v8, (a6) ; ZVBB-RV32-NEXT: addi sp, s0, -80 ; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: addi sp, sp, 80 ; ZVBB-RV32-NEXT: ret ; -; ZVBB-RV64-LABEL: vector_interleave_nxv28i32_nxv4i32: +; ZVBB-RV64-LABEL: vector_interleave_nxv12i64_nxv2i64: ; ZVBB-RV64: # %bb.0: ; ZVBB-RV64-NEXT: addi sp, sp, -80 ; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; ZVBB-RV64-NEXT: addi s0, sp, 80 ; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 ; ZVBB-RV64-NEXT: sub sp, sp, a0 ; ZVBB-RV64-NEXT: andi sp, sp, -64 -; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVBB-RV64-NEXT: vmv2r.v v26, v20 -; ZVBB-RV64-NEXT: addi a0, sp, 64 -; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v14 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: vmv2r.v v24, v10 ; ZVBB-RV64-NEXT: csrr a1, vlenb -; ZVBB-RV64-NEXT: slli a2, a1, 3 -; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: li a0, 6 +; ZVBB-RV64-NEXT: mul a1, a1, a0 ; ZVBB-RV64-NEXT: add a1, sp, a1 ; ZVBB-RV64-NEXT: addi a1, a1, 64 -; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: vmv1r.v v10, v25 +; ZVBB-RV64-NEXT: vmv1r.v v11, v23 +; ZVBB-RV64-NEXT: vmv1r.v v12, v21 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv1r.v v13, v17 ; ZVBB-RV64-NEXT: csrr a2, vlenb -; ZVBB-RV64-NEXT: vmv2r.v v20, v8 -; ZVBB-RV64-NEXT: vmv1r.v v1, v20 -; ZVBB-RV64-NEXT: vmv1r.v v3, v22 -; ZVBB-RV64-NEXT: vmv1r.v v5, v24 -; ZVBB-RV64-NEXT: vmv1r.v v7, v26 +; ZVBB-RV64-NEXT: vmv1r.v v14, v19 +; ZVBB-RV64-NEXT: vsseg6e64.v v9, (a1) +; ZVBB-RV64-NEXT: vmv1r.v v9, v24 +; ZVBB-RV64-NEXT: add a5, a1, a2 +; ZVBB-RV64-NEXT: vmv1r.v v10, v22 ; ZVBB-RV64-NEXT: add a3, a0, a2 -; ZVBB-RV64-NEXT: vmv1r.v v2, v10 -; ZVBB-RV64-NEXT: add a4, a1, a2 -; ZVBB-RV64-NEXT: slli a5, a2, 2 -; ZVBB-RV64-NEXT: vmv1r.v v4, v14 -; ZVBB-RV64-NEXT: slli a6, a2, 4 -; ZVBB-RV64-NEXT: add a7, a4, a2 -; ZVBB-RV64-NEXT: vmv1r.v v6, v18 -; ZVBB-RV64-NEXT: sub a5, a6, a5 -; ZVBB-RV64-NEXT: vmv1r.v v22, v11 -; ZVBB-RV64-NEXT: add a6, a7, a2 -; ZVBB-RV64-NEXT: vmv1r.v v24, v15 -; ZVBB-RV64-NEXT: vsseg7e32.v v1, (a0) -; ZVBB-RV64-NEXT: vmv1r.v v26, v19 -; ZVBB-RV64-NEXT: vsseg7e32.v v21, (a1) -; ZVBB-RV64-NEXT: vl1re32.v v18, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re32.v v19, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re32.v v20, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re32.v v21, (a6) -; ZVBB-RV64-NEXT: add a6, a3, a2 -; ZVBB-RV64-NEXT: vl1re32.v v10, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re32.v v11, (a6) -; ZVBB-RV64-NEXT: vl1re32.v v8, (a0) -; ZVBB-RV64-NEXT: vl1re32.v v16, (a4) -; ZVBB-RV64-NEXT: vl1re32.v v9, (a3) -; ZVBB-RV64-NEXT: vl1re32.v v17, (a7) -; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: li 
a3, 14 -; ZVBB-RV64-NEXT: mul a0, a0, a3 -; ZVBB-RV64-NEXT: add a0, sp, a0 -; ZVBB-RV64-NEXT: addi a0, a0, 64 -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re32.v v12, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re32.v v13, (a6) +; ZVBB-RV64-NEXT: vmv1r.v v11, v20 +; ZVBB-RV64-NEXT: add a4, a3, a2 +; ZVBB-RV64-NEXT: vmv1r.v v12, v16 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v13, v18 +; ZVBB-RV64-NEXT: vsseg6e64.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re64.v v14, (a1) +; ZVBB-RV64-NEXT: add a1, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v15, (a5) +; ZVBB-RV64-NEXT: add a5, a1, a2 +; ZVBB-RV64-NEXT: vl1re64.v v18, (a5) +; ZVBB-RV64-NEXT: add a5, a5, a2 +; ZVBB-RV64-NEXT: vl1re64.v v19, (a5) +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vl1re64.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vl1re64.v v12, (a6) ; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v13, (a6) +; ZVBB-RV64-NEXT: csrr a6, vlenb +; ZVBB-RV64-NEXT: li a7, 12 +; ZVBB-RV64-NEXT: mul a6, a6, a7 +; ZVBB-RV64-NEXT: add a6, sp, a6 +; ZVBB-RV64-NEXT: addi a6, a6, 64 +; ZVBB-RV64-NEXT: vl1re64.v v17, (a1) +; ZVBB-RV64-NEXT: vl1re64.v v10, (a4) +; ZVBB-RV64-NEXT: vl1re64.v v11, (a5) +; ZVBB-RV64-NEXT: vl1re64.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re64.v v9, (a3) ; ZVBB-RV64-NEXT: slli a2, a2, 3 -; ZVBB-RV64-NEXT: add a2, a0, a2 -; ZVBB-RV64-NEXT: vl1re32.v v14, (a6) -; ZVBB-RV64-NEXT: vl1re32.v v15, (a1) -; ZVBB-RV64-NEXT: add a5, a0, a5 -; ZVBB-RV64-NEXT: vs2r.v v20, (a5) +; ZVBB-RV64-NEXT: add a2, a6, a2 ; ZVBB-RV64-NEXT: vs4r.v v16, (a2) -; ZVBB-RV64-NEXT: vs8r.v v8, (a0) -; ZVBB-RV64-NEXT: vl8re32.v v16, (a2) -; ZVBB-RV64-NEXT: vl8re32.v v8, (a0) +; ZVBB-RV64-NEXT: vs8r.v v8, (a6) +; ZVBB-RV64-NEXT: vl8re64.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re64.v v8, (a6) ; ZVBB-RV64-NEXT: addi sp, s0, -80 ; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: addi sp, sp, 80 ; ZVBB-RV64-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv28i32_nxv4i32: +; ZIP-LABEL: vector_interleave_nxv12i64_nxv2i64: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -80 ; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; ZIP-NEXT: addi s0, sp, 80 ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: li a1, 28 +; ZIP-NEXT: mul a0, a0, a1 ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: andi sp, sp, -64 -; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZIP-NEXT: vmv2r.v v26, v20 -; ZIP-NEXT: addi a0, sp, 64 -; ZIP-NEXT: vmv2r.v v24, v16 +; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZIP-NEXT: vmv2r.v v20, v14 +; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: vmv2r.v v24, v10 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a2, a1, 3 -; ZIP-NEXT: sub a1, a2, a1 +; ZIP-NEXT: li a0, 6 +; ZIP-NEXT: mul a1, a1, a0 ; ZIP-NEXT: add a1, sp, a1 ; ZIP-NEXT: addi a1, a1, 64 -; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: vmv1r.v v10, v25 +; ZIP-NEXT: vmv1r.v v11, v23 +; ZIP-NEXT: vmv1r.v v12, v21 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv1r.v v13, v17 ; ZIP-NEXT: csrr a2, vlenb -; ZIP-NEXT: vmv2r.v v20, v8 -; ZIP-NEXT: vmv1r.v v1, v20 -; ZIP-NEXT: vmv1r.v v3, v22 -; ZIP-NEXT: vmv1r.v v5, v24 -; ZIP-NEXT: vmv1r.v v7, v26 +; ZIP-NEXT: vmv1r.v v14, v19 +; ZIP-NEXT: vsseg6e64.v v9, (a1) +; ZIP-NEXT: vmv1r.v v9, v24 +; ZIP-NEXT: add a5, a1, a2 +; ZIP-NEXT: vmv1r.v v10, v22 ; ZIP-NEXT: add a3, a0, a2 -; ZIP-NEXT: vmv1r.v v2, v10 -; ZIP-NEXT: add a4, a1, a2 -; ZIP-NEXT: slli a5, a2, 2 -; ZIP-NEXT: vmv1r.v v4, v14 
-; ZIP-NEXT: slli a6, a2, 4 -; ZIP-NEXT: add a7, a4, a2 -; ZIP-NEXT: vmv1r.v v6, v18 -; ZIP-NEXT: sub a5, a6, a5 -; ZIP-NEXT: vmv1r.v v22, v11 -; ZIP-NEXT: add a6, a7, a2 -; ZIP-NEXT: vmv1r.v v24, v15 -; ZIP-NEXT: vsseg7e32.v v1, (a0) -; ZIP-NEXT: vmv1r.v v26, v19 -; ZIP-NEXT: vsseg7e32.v v21, (a1) -; ZIP-NEXT: vl1re32.v v18, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re32.v v19, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re32.v v20, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re32.v v21, (a6) -; ZIP-NEXT: add a6, a3, a2 -; ZIP-NEXT: vl1re32.v v10, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re32.v v11, (a6) -; ZIP-NEXT: vl1re32.v v8, (a0) -; ZIP-NEXT: vl1re32.v v16, (a4) -; ZIP-NEXT: vl1re32.v v9, (a3) -; ZIP-NEXT: vl1re32.v v17, (a7) -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a3, 14 -; ZIP-NEXT: mul a0, a0, a3 -; ZIP-NEXT: add a0, sp, a0 -; ZIP-NEXT: addi a0, a0, 64 -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re32.v v12, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re32.v v13, (a6) +; ZIP-NEXT: vmv1r.v v11, v20 +; ZIP-NEXT: add a4, a3, a2 +; ZIP-NEXT: vmv1r.v v12, v16 +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vmv1r.v v13, v18 +; ZIP-NEXT: vsseg6e64.v v8, (a0) +; ZIP-NEXT: vl1re64.v v14, (a1) +; ZIP-NEXT: add a1, a6, a2 +; ZIP-NEXT: vl1re64.v v15, (a5) +; ZIP-NEXT: add a5, a1, a2 +; ZIP-NEXT: vl1re64.v v18, (a5) +; ZIP-NEXT: add a5, a5, a2 +; ZIP-NEXT: vl1re64.v v19, (a5) +; ZIP-NEXT: add a5, a4, a2 +; ZIP-NEXT: vl1re64.v v16, (a6) +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vl1re64.v v12, (a6) ; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v13, (a6) +; ZIP-NEXT: csrr a6, vlenb +; ZIP-NEXT: li a7, 12 +; ZIP-NEXT: mul a6, a6, a7 +; ZIP-NEXT: add a6, sp, a6 +; ZIP-NEXT: addi a6, a6, 64 +; ZIP-NEXT: vl1re64.v v17, (a1) +; ZIP-NEXT: vl1re64.v v10, (a4) +; ZIP-NEXT: vl1re64.v v11, (a5) +; ZIP-NEXT: vl1re64.v v8, (a0) +; ZIP-NEXT: vl1re64.v v9, (a3) ; ZIP-NEXT: slli a2, a2, 3 -; ZIP-NEXT: add a2, a0, a2 -; ZIP-NEXT: vl1re32.v v14, (a6) -; ZIP-NEXT: vl1re32.v v15, (a1) -; ZIP-NEXT: add a5, a0, a5 -; ZIP-NEXT: vs2r.v v20, (a5) +; ZIP-NEXT: add a2, a6, a2 ; ZIP-NEXT: vs4r.v v16, (a2) -; ZIP-NEXT: vs8r.v v8, (a0) -; ZIP-NEXT: vl8re32.v v16, (a2) -; ZIP-NEXT: vl8re32.v v8, (a0) +; ZIP-NEXT: vs8r.v v8, (a6) +; ZIP-NEXT: vl8re64.v v16, (a2) +; ZIP-NEXT: vl8re64.v v8, (a6) ; ZIP-NEXT: addi sp, s0, -80 ; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZIP-NEXT: addi sp, sp, 80 ; ZIP-NEXT: ret - %res = call @llvm.vector.interleave7.nxv28i32( %a, %b, %c, %d, %e, %f, %g) - ret %res + %res = call @llvm.vector.interleave6.nxv12i64( %a, %b, %c, %d, %e, %f) + ret %res } -define @vector_interleave_nxv14i64_nxv2i64( %a, %b, %c, %d, %e, %f, %g) nounwind { +define @vector_interleave_nxv112i1_nxv16i1( %a, %b, %c, %d, %e, %f, %g) nounwind { +; CHECK-LABEL: vector_interleave_nxv112i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 14 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v14, 0 +; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vmerge.vim v16, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v22, v14, 1, v0 +; CHECK-NEXT: add a3, a4, a2 +; CHECK-NEXT: srli a1, a2, 2 +; CHECK-NEXT: add a5, a0, a2 +; 
CHECK-NEXT: vmv4r.v v24, v16 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v18, v14, 1, v0 +; CHECK-NEXT: add a6, a3, a2 +; CHECK-NEXT: vmv1r.v v25, v22 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v8, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v26, v18 +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vmerge.vim v20, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v27, v8 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vim v10, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v28, v20 +; CHECK-NEXT: vmv1r.v v18, v23 +; CHECK-NEXT: add a7, a6, a2 +; CHECK-NEXT: vmv1r.v v29, v10 +; CHECK-NEXT: vmv1r.v v20, v9 +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vim v30, v14, 1, v0 +; CHECK-NEXT: vmv1r.v v22, v11 +; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg7e8.v v24, (a4) +; CHECK-NEXT: vmv1r.v v23, v31 +; CHECK-NEXT: vsseg7e8.v v17, (a0) +; CHECK-NEXT: vl1r.v v8, (a6) +; CHECK-NEXT: add a6, a7, a2 +; CHECK-NEXT: vl1r.v v10, (a4) +; CHECK-NEXT: add a4, a6, a2 +; CHECK-NEXT: vl1r.v v12, (a6) +; CHECK-NEXT: add a6, a4, a2 +; CHECK-NEXT: vl1r.v v14, (a6) +; CHECK-NEXT: add a6, a5, a2 +; CHECK-NEXT: vl1r.v v16, (a5) +; CHECK-NEXT: add a5, a6, a2 +; CHECK-NEXT: vl1r.v v18, (a5) +; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: vl1r.v v9, (a7) +; CHECK-NEXT: add a7, a5, a2 +; CHECK-NEXT: vl1r.v v20, (a7) +; CHECK-NEXT: add a7, a7, a2 +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: vl1r.v v11, (a3) +; CHECK-NEXT: add a3, a1, a1 +; CHECK-NEXT: vl1r.v v13, (a4) +; CHECK-NEXT: add a4, a2, a2 +; CHECK-NEXT: vl1r.v v15, (a0) +; CHECK-NEXT: vl1r.v v19, (a5) +; CHECK-NEXT: vl1r.v v17, (a6) +; CHECK-NEXT: vl1r.v v21, (a7) +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v22, v8, 0 +; CHECK-NEXT: vmsne.vi v0, v10, 0 +; CHECK-NEXT: vmsne.vi v9, v12, 0 +; CHECK-NEXT: vmsne.vi v10, v14, 0 +; CHECK-NEXT: vmsne.vi v11, v18, 0 +; CHECK-NEXT: vmsne.vi v8, v16, 0 +; CHECK-NEXT: vmsne.vi v12, v20, 0 +; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v22, a1 +; CHECK-NEXT: vslideup.vx v9, v10, a1 +; CHECK-NEXT: vslideup.vx v8, v11, a1 +; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v9, a2 +; CHECK-NEXT: vslideup.vx v8, v12, a2 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 14 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; RV32-LABEL: vector_interleave_nxv14i64_nxv2i64: +; ZVBB-LABEL: vector_interleave_nxv112i1_nxv16i1: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 14 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmv.v.i v14, 0 +; ZVBB-NEXT: addi a4, sp, 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 3 +; ZVBB-NEXT: sub a0, a1, a0 +; ZVBB-NEXT: add a0, sp, a0 +; ZVBB-NEXT: addi a0, a0, 16 +; ZVBB-NEXT: csrr a2, vlenb +; ZVBB-NEXT: vmerge.vim v16, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v8 +; ZVBB-NEXT: vmerge.vim v22, v14, 1, v0 +; ZVBB-NEXT: add a3, a4, a2 +; ZVBB-NEXT: srli a1, a2, 2 +; ZVBB-NEXT: add a5, a0, a2 +; ZVBB-NEXT: vmv4r.v v24, v16 +; ZVBB-NEXT: vmv1r.v v0, v9 +; ZVBB-NEXT: vmerge.vim v18, v14, 1, v0 +; ZVBB-NEXT: add a6, a3, a2 +; ZVBB-NEXT: vmv1r.v v25, v22 +; ZVBB-NEXT: vmv1r.v v0, v10 +; ZVBB-NEXT: vmerge.vim v8, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v26, v18 +; ZVBB-NEXT: vmv1r.v v0, v11 +; ZVBB-NEXT: vmerge.vim v20, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v27, v8 +; ZVBB-NEXT: vmv1r.v v0, v12 +; ZVBB-NEXT: vmerge.vim v10, 
v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v28, v20 +; ZVBB-NEXT: vmv1r.v v18, v23 +; ZVBB-NEXT: add a7, a6, a2 +; ZVBB-NEXT: vmv1r.v v29, v10 +; ZVBB-NEXT: vmv1r.v v20, v9 +; ZVBB-NEXT: vmv1r.v v0, v13 +; ZVBB-NEXT: vmerge.vim v30, v14, 1, v0 +; ZVBB-NEXT: vmv1r.v v22, v11 +; ZVBB-NEXT: vsetvli t0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg7e8.v v24, (a4) +; ZVBB-NEXT: vmv1r.v v23, v31 +; ZVBB-NEXT: vsseg7e8.v v17, (a0) +; ZVBB-NEXT: vl1r.v v8, (a6) +; ZVBB-NEXT: add a6, a7, a2 +; ZVBB-NEXT: vl1r.v v10, (a4) +; ZVBB-NEXT: add a4, a6, a2 +; ZVBB-NEXT: vl1r.v v12, (a6) +; ZVBB-NEXT: add a6, a4, a2 +; ZVBB-NEXT: vl1r.v v14, (a6) +; ZVBB-NEXT: add a6, a5, a2 +; ZVBB-NEXT: vl1r.v v16, (a5) +; ZVBB-NEXT: add a5, a6, a2 +; ZVBB-NEXT: vl1r.v v18, (a5) +; ZVBB-NEXT: add a5, a5, a2 +; ZVBB-NEXT: vl1r.v v9, (a7) +; ZVBB-NEXT: add a7, a5, a2 +; ZVBB-NEXT: vl1r.v v20, (a7) +; ZVBB-NEXT: add a7, a7, a2 +; ZVBB-NEXT: srli a2, a2, 1 +; ZVBB-NEXT: vl1r.v v11, (a3) +; ZVBB-NEXT: add a3, a1, a1 +; ZVBB-NEXT: vl1r.v v13, (a4) +; ZVBB-NEXT: add a4, a2, a2 +; ZVBB-NEXT: vl1r.v v15, (a0) +; ZVBB-NEXT: vl1r.v v19, (a5) +; ZVBB-NEXT: vl1r.v v17, (a6) +; ZVBB-NEXT: vl1r.v v21, (a7) +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmsne.vi v22, v8, 0 +; ZVBB-NEXT: vmsne.vi v0, v10, 0 +; ZVBB-NEXT: vmsne.vi v9, v12, 0 +; ZVBB-NEXT: vmsne.vi v10, v14, 0 +; ZVBB-NEXT: vmsne.vi v11, v18, 0 +; ZVBB-NEXT: vmsne.vi v8, v16, 0 +; ZVBB-NEXT: vmsne.vi v12, v20, 0 +; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v22, a1 +; ZVBB-NEXT: vslideup.vx v9, v10, a1 +; ZVBB-NEXT: vslideup.vx v8, v11, a1 +; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v9, a2 +; ZVBB-NEXT: vslideup.vx v8, v12, a2 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 14 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave7.nxv112i1( %a, %b, %c, %d, %e, %f, %g) + ret %res +} + + +define @vector_interleave_nxv112i8_nxv16i8( %a, %b, %c, %d, %e, %f, %g) nounwind { +; +; RV32-LABEL: vector_interleave_nxv112i8_nxv16i8: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill @@ -3461,7 +3871,7 @@ define @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @vector_interleave_nxv14i64_nxv2i64( @llvm.vector.interleave7.nxv14i64( %a, %b, %c, %d, %e, %f, %g) - ret %res -} - -; Floats - -define @vector_interleave_nxv4bf16_nxv2bf16( %a, %b) { -; V-LABEL: vector_interleave_nxv4bf16_nxv2bf16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; V-NEXT: vwaddu.vv v10, v8, v9 -; V-NEXT: li a0, -1 -; V-NEXT: csrr a1, vlenb -; V-NEXT: vwmaccu.vx v10, a0, v9 -; V-NEXT: srli a1, a1, 2 -; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; V-NEXT: vslidedown.vx v8, v10, a1 -; V-NEXT: add a0, a1, a1 -; V-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; V-NEXT: vslideup.vx v10, v8, a1 -; V-NEXT: vmv.v.v v8, v10 -; V-NEXT: ret -; -; ZVBB-LABEL: vector_interleave_nxv4bf16_nxv2bf16: -; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVBB-NEXT: vwsll.vi v10, v9, 16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: vwaddu.wv v10, v10, v8 -; ZVBB-NEXT: srli a0, a0, 2 -; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; 
ZVBB-NEXT: vslidedown.vx v8, v10, a0 -; ZVBB-NEXT: add a1, a0, a0 -; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v10, v8, a0 -; ZVBB-NEXT: vmv.v.v v8, v10 -; ZVBB-NEXT: ret -; -; ZIP-LABEL: vector_interleave_nxv4bf16_nxv2bf16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZIP-NEXT: ri.vzip2b.vv v11, v8, v9 -; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: srli a0, a0, 2 -; ZIP-NEXT: add a1, a0, a0 -; ZIP-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vx v10, v11, a0 -; ZIP-NEXT: vmv.v.v v8, v10 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv4bf16( %a, %b) - ret %res + %res = call @llvm.vector.interleave7.nxv112i8( %a, %b, %c, %d, %e, %f, %g) + ret %res } -define @vector_interleave_nxv8bf16_nxv4bf16( %a, %b) { -; V-LABEL: vector_interleave_nxv8bf16_nxv4bf16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; V-NEXT: vmv1r.v v10, v9 -; V-NEXT: vmv1r.v v11, v8 -; V-NEXT: vwaddu.vv v8, v11, v10 -; V-NEXT: li a0, -1 -; V-NEXT: vwmaccu.vx v8, a0, v10 -; V-NEXT: ret -; -; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv4bf16: -; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVBB-NEXT: vmv1r.v v10, v9 -; ZVBB-NEXT: vmv1r.v v11, v8 -; ZVBB-NEXT: vwsll.vi v8, v10, 16 -; ZVBB-NEXT: vwaddu.wv v8, v8, v11 -; ZVBB-NEXT: ret -; -; ZIP-LABEL: vector_interleave_nxv8bf16_nxv4bf16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZIP-NEXT: vmv1r.v v10, v9 -; ZIP-NEXT: vmv1r.v v11, v8 -; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10 -; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv8bf16( %a, %b) - ret %res -} -define @vector_interleave_nxv4f16_nxv2f16( %a, %b) { -; V-LABEL: vector_interleave_nxv4f16_nxv2f16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; V-NEXT: vwaddu.vv v10, v8, v9 -; V-NEXT: li a0, -1 -; V-NEXT: csrr a1, vlenb -; V-NEXT: vwmaccu.vx v10, a0, v9 -; V-NEXT: srli a1, a1, 2 -; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; V-NEXT: vslidedown.vx v8, v10, a1 -; V-NEXT: add a0, a1, a1 -; V-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; V-NEXT: vslideup.vx v10, v8, a1 -; V-NEXT: vmv.v.v v8, v10 -; V-NEXT: ret +define @vector_interleave_nxv56i16_nxv8i16( %a, %b, %c, %d, %e, %f, %g) nounwind { +; +; RV32-LABEL: vector_interleave_nxv56i16_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV32-NEXT: vmv2r.v v26, v20 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v24, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v20, v8 +; RV32-NEXT: vmv1r.v v1, v20 +; RV32-NEXT: vmv1r.v v3, v22 +; RV32-NEXT: vmv1r.v v5, v24 +; RV32-NEXT: vmv1r.v v7, v26 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v2, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: slli a5, a2, 2 +; RV32-NEXT: vmv1r.v v4, v14 +; RV32-NEXT: slli a6, a2, 4 +; RV32-NEXT: add a7, a4, a2 +; RV32-NEXT: vmv1r.v v6, v18 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: vmv1r.v v22, v11 +; RV32-NEXT: add a6, a7, a2 +; RV32-NEXT: vmv1r.v v24, v15 +; RV32-NEXT: vsseg7e16.v v1, (a0) 
+; RV32-NEXT: vmv1r.v v26, v19 +; RV32-NEXT: vsseg7e16.v v21, (a1) +; RV32-NEXT: vl1re16.v v18, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v19, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v20, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v21, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re16.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v11, (a6) +; RV32-NEXT: vl1re16.v v8, (a0) +; RV32-NEXT: vl1re16.v v16, (a4) +; RV32-NEXT: vl1re16.v v9, (a3) +; RV32-NEXT: vl1re16.v v17, (a7) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 14 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v12, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v13, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vl1re16.v v14, (a6) +; RV32-NEXT: vl1re16.v v15, (a1) +; RV32-NEXT: add a5, a0, a5 +; RV32-NEXT: vs2r.v v20, (a5) +; RV32-NEXT: vs4r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re16.v v16, (a2) +; RV32-NEXT: vl8re16.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv56i16_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV64-NEXT: vmv2r.v v26, v20 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v20, v8 +; RV64-NEXT: vmv1r.v v1, v20 +; RV64-NEXT: vmv1r.v v3, v22 +; RV64-NEXT: vmv1r.v v5, v24 +; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v2, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: slli a5, a2, 2 +; RV64-NEXT: vmv1r.v v4, v14 +; RV64-NEXT: slli a6, a2, 4 +; RV64-NEXT: add a7, a4, a2 +; RV64-NEXT: vmv1r.v v6, v18 +; RV64-NEXT: sub a5, a6, a5 +; RV64-NEXT: vmv1r.v v22, v11 +; RV64-NEXT: add a6, a7, a2 +; RV64-NEXT: vmv1r.v v24, v15 +; RV64-NEXT: vsseg7e16.v v1, (a0) +; RV64-NEXT: vmv1r.v v26, v19 +; RV64-NEXT: vsseg7e16.v v21, (a1) +; RV64-NEXT: vl1re16.v v18, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v19, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v20, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v21, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re16.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v11, (a6) +; RV64-NEXT: vl1re16.v v8, (a0) +; RV64-NEXT: vl1re16.v v16, (a4) +; RV64-NEXT: vl1re16.v v9, (a3) +; RV64-NEXT: vl1re16.v v17, (a7) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 14 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v12, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v13, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vl1re16.v v14, (a6) +; RV64-NEXT: vl1re16.v v15, (a1) +; 
RV64-NEXT: add a5, a0, a5 +; RV64-NEXT: vs2r.v v20, (a5) +; RV64-NEXT: vs4r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re16.v v16, (a2) +; RV64-NEXT: vl8re16.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv56i16_nxv8i16: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v26, v20 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 3 +; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v20, v8 +; ZVBB-RV32-NEXT: vmv1r.v v1, v20 +; ZVBB-RV32-NEXT: vmv1r.v v3, v22 +; ZVBB-RV32-NEXT: vmv1r.v v5, v24 +; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v2, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: slli a5, a2, 2 +; ZVBB-RV32-NEXT: vmv1r.v v4, v14 +; ZVBB-RV32-NEXT: slli a6, a2, 4 +; ZVBB-RV32-NEXT: add a7, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v6, v18 +; ZVBB-RV32-NEXT: sub a5, a6, a5 +; ZVBB-RV32-NEXT: vmv1r.v v22, v11 +; ZVBB-RV32-NEXT: add a6, a7, a2 +; ZVBB-RV32-NEXT: vmv1r.v v24, v15 +; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v26, v19 +; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1) +; ZVBB-RV32-NEXT: vl1re16.v v18, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v19, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v20, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v21, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re16.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re16.v v16, (a4) +; ZVBB-RV32-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re16.v v17, (a7) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 14 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vl1re16.v v14, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v15, (a1) +; ZVBB-RV32-NEXT: add a5, a0, a5 +; ZVBB-RV32-NEXT: vs2r.v v20, (a5) +; ZVBB-RV32-NEXT: vs4r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: vector_interleave_nxv56i16_nxv8i16: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) 
# 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v26, v20 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 3 +; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v20, v8 +; ZVBB-RV64-NEXT: vmv1r.v v1, v20 +; ZVBB-RV64-NEXT: vmv1r.v v3, v22 +; ZVBB-RV64-NEXT: vmv1r.v v5, v24 +; ZVBB-RV64-NEXT: vmv1r.v v7, v26 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v2, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: slli a5, a2, 2 +; ZVBB-RV64-NEXT: vmv1r.v v4, v14 +; ZVBB-RV64-NEXT: slli a6, a2, 4 +; ZVBB-RV64-NEXT: add a7, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v6, v18 +; ZVBB-RV64-NEXT: sub a5, a6, a5 +; ZVBB-RV64-NEXT: vmv1r.v v22, v11 +; ZVBB-RV64-NEXT: add a6, a7, a2 +; ZVBB-RV64-NEXT: vmv1r.v v24, v15 +; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v26, v19 +; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1) +; ZVBB-RV64-NEXT: vl1re16.v v18, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v19, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v20, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v21, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re16.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re16.v v16, (a4) +; ZVBB-RV64-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re16.v v17, (a7) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 14 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vl1re16.v v14, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v15, (a1) +; ZVBB-RV64-NEXT: add a5, a0, a5 +; ZVBB-RV64-NEXT: vs2r.v v20, (a5) +; ZVBB-RV64-NEXT: vs4r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv56i16_nxv8i16: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -80 +; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZIP-NEXT: addi s0, sp, 80 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: andi sp, sp, -64 +; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZIP-NEXT: vmv2r.v v26, v20 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv2r.v v24, v16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a2, a1, 3 +; ZIP-NEXT: sub a1, a2, a1 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: vmv2r.v v20, v8 +; ZIP-NEXT: vmv1r.v v1, v20 +; ZIP-NEXT: vmv1r.v v3, v22 +; ZIP-NEXT: 
vmv1r.v v5, v24 +; ZIP-NEXT: vmv1r.v v7, v26 +; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: vmv1r.v v2, v10 +; ZIP-NEXT: add a4, a1, a2 +; ZIP-NEXT: slli a5, a2, 2 +; ZIP-NEXT: vmv1r.v v4, v14 +; ZIP-NEXT: slli a6, a2, 4 +; ZIP-NEXT: add a7, a4, a2 +; ZIP-NEXT: vmv1r.v v6, v18 +; ZIP-NEXT: sub a5, a6, a5 +; ZIP-NEXT: vmv1r.v v22, v11 +; ZIP-NEXT: add a6, a7, a2 +; ZIP-NEXT: vmv1r.v v24, v15 +; ZIP-NEXT: vsseg7e16.v v1, (a0) +; ZIP-NEXT: vmv1r.v v26, v19 +; ZIP-NEXT: vsseg7e16.v v21, (a1) +; ZIP-NEXT: vl1re16.v v18, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v19, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v20, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v21, (a6) +; ZIP-NEXT: add a6, a3, a2 +; ZIP-NEXT: vl1re16.v v10, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v11, (a6) +; ZIP-NEXT: vl1re16.v v8, (a0) +; ZIP-NEXT: vl1re16.v v16, (a4) +; ZIP-NEXT: vl1re16.v v9, (a3) +; ZIP-NEXT: vl1re16.v v17, (a7) +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a3, 14 +; ZIP-NEXT: mul a0, a0, a3 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 64 +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v12, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v13, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: slli a2, a2, 3 +; ZIP-NEXT: add a2, a0, a2 +; ZIP-NEXT: vl1re16.v v14, (a6) +; ZIP-NEXT: vl1re16.v v15, (a1) +; ZIP-NEXT: add a5, a0, a5 +; ZIP-NEXT: vs2r.v v20, (a5) +; ZIP-NEXT: vs4r.v v16, (a2) +; ZIP-NEXT: vs8r.v v8, (a0) +; ZIP-NEXT: vl8re16.v v16, (a2) +; ZIP-NEXT: vl8re16.v v8, (a0) +; ZIP-NEXT: addi sp, s0, -80 +; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZIP-NEXT: addi sp, sp, 80 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave7.nxv56i16( %a, %b, %c, %d, %e, %f, %g) + ret %res +} + + +define @vector_interleave_nxv28i32_nxv4i32( %a, %b, %c, %d, %e, %f, %g) nounwind { +; +; RV32-LABEL: vector_interleave_nxv28i32_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV32-NEXT: vmv2r.v v26, v20 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v24, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v20, v8 +; RV32-NEXT: vmv1r.v v1, v20 +; RV32-NEXT: vmv1r.v v3, v22 +; RV32-NEXT: vmv1r.v v5, v24 +; RV32-NEXT: vmv1r.v v7, v26 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v2, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: slli a5, a2, 2 +; RV32-NEXT: vmv1r.v v4, v14 +; RV32-NEXT: slli a6, a2, 4 +; RV32-NEXT: add a7, a4, a2 +; RV32-NEXT: vmv1r.v v6, v18 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: vmv1r.v v22, v11 +; RV32-NEXT: add a6, a7, a2 +; RV32-NEXT: vmv1r.v v24, v15 +; RV32-NEXT: vsseg7e32.v v1, (a0) +; RV32-NEXT: vmv1r.v v26, v19 +; RV32-NEXT: vsseg7e32.v v21, (a1) +; RV32-NEXT: vl1re32.v v18, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v19, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v20, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v21, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re32.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v11, (a6) 
+; RV32-NEXT: vl1re32.v v8, (a0) +; RV32-NEXT: vl1re32.v v16, (a4) +; RV32-NEXT: vl1re32.v v9, (a3) +; RV32-NEXT: vl1re32.v v17, (a7) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 14 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v12, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v13, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vl1re32.v v14, (a6) +; RV32-NEXT: vl1re32.v v15, (a1) +; RV32-NEXT: add a5, a0, a5 +; RV32-NEXT: vs2r.v v20, (a5) +; RV32-NEXT: vs4r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re32.v v16, (a2) +; RV32-NEXT: vl8re32.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv28i32_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV64-NEXT: vmv2r.v v26, v20 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v20, v8 +; RV64-NEXT: vmv1r.v v1, v20 +; RV64-NEXT: vmv1r.v v3, v22 +; RV64-NEXT: vmv1r.v v5, v24 +; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v2, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: slli a5, a2, 2 +; RV64-NEXT: vmv1r.v v4, v14 +; RV64-NEXT: slli a6, a2, 4 +; RV64-NEXT: add a7, a4, a2 +; RV64-NEXT: vmv1r.v v6, v18 +; RV64-NEXT: sub a5, a6, a5 +; RV64-NEXT: vmv1r.v v22, v11 +; RV64-NEXT: add a6, a7, a2 +; RV64-NEXT: vmv1r.v v24, v15 +; RV64-NEXT: vsseg7e32.v v1, (a0) +; RV64-NEXT: vmv1r.v v26, v19 +; RV64-NEXT: vsseg7e32.v v21, (a1) +; RV64-NEXT: vl1re32.v v18, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v19, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v20, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v21, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re32.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v11, (a6) +; RV64-NEXT: vl1re32.v v8, (a0) +; RV64-NEXT: vl1re32.v v16, (a4) +; RV64-NEXT: vl1re32.v v9, (a3) +; RV64-NEXT: vl1re32.v v17, (a7) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 14 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v12, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v13, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vl1re32.v v14, (a6) +; RV64-NEXT: vl1re32.v v15, (a1) +; RV64-NEXT: add a5, a0, a5 +; RV64-NEXT: vs2r.v v20, (a5) +; RV64-NEXT: vs4r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re32.v v16, (a2) +; RV64-NEXT: vl8re32.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: 
vector_interleave_nxv28i32_nxv4i32: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v26, v20 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 3 +; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v20, v8 +; ZVBB-RV32-NEXT: vmv1r.v v1, v20 +; ZVBB-RV32-NEXT: vmv1r.v v3, v22 +; ZVBB-RV32-NEXT: vmv1r.v v5, v24 +; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v2, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: slli a5, a2, 2 +; ZVBB-RV32-NEXT: vmv1r.v v4, v14 +; ZVBB-RV32-NEXT: slli a6, a2, 4 +; ZVBB-RV32-NEXT: add a7, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v6, v18 +; ZVBB-RV32-NEXT: sub a5, a6, a5 +; ZVBB-RV32-NEXT: vmv1r.v v22, v11 +; ZVBB-RV32-NEXT: add a6, a7, a2 +; ZVBB-RV32-NEXT: vmv1r.v v24, v15 +; ZVBB-RV32-NEXT: vsseg7e32.v v1, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v26, v19 +; ZVBB-RV32-NEXT: vsseg7e32.v v21, (a1) +; ZVBB-RV32-NEXT: vl1re32.v v18, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v19, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v20, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v21, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re32.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re32.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re32.v v16, (a4) +; ZVBB-RV32-NEXT: vl1re32.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re32.v v17, (a7) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 14 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v12, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v13, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vl1re32.v v14, (a6) +; ZVBB-RV32-NEXT: vl1re32.v v15, (a1) +; ZVBB-RV32-NEXT: add a5, a0, a5 +; ZVBB-RV32-NEXT: vs2r.v v20, (a5) +; ZVBB-RV32-NEXT: vs4r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re32.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: vector_interleave_nxv28i32_nxv4i32: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v26, v20 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: csrr a1, vlenb 
+; ZVBB-RV64-NEXT: slli a2, a1, 3 +; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v20, v8 +; ZVBB-RV64-NEXT: vmv1r.v v1, v20 +; ZVBB-RV64-NEXT: vmv1r.v v3, v22 +; ZVBB-RV64-NEXT: vmv1r.v v5, v24 +; ZVBB-RV64-NEXT: vmv1r.v v7, v26 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v2, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: slli a5, a2, 2 +; ZVBB-RV64-NEXT: vmv1r.v v4, v14 +; ZVBB-RV64-NEXT: slli a6, a2, 4 +; ZVBB-RV64-NEXT: add a7, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v6, v18 +; ZVBB-RV64-NEXT: sub a5, a6, a5 +; ZVBB-RV64-NEXT: vmv1r.v v22, v11 +; ZVBB-RV64-NEXT: add a6, a7, a2 +; ZVBB-RV64-NEXT: vmv1r.v v24, v15 +; ZVBB-RV64-NEXT: vsseg7e32.v v1, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v26, v19 +; ZVBB-RV64-NEXT: vsseg7e32.v v21, (a1) +; ZVBB-RV64-NEXT: vl1re32.v v18, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v19, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v20, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v21, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re32.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re32.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re32.v v16, (a4) +; ZVBB-RV64-NEXT: vl1re32.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re32.v v17, (a7) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 14 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v12, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v13, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vl1re32.v v14, (a6) +; ZVBB-RV64-NEXT: vl1re32.v v15, (a1) +; ZVBB-RV64-NEXT: add a5, a0, a5 +; ZVBB-RV64-NEXT: vs2r.v v20, (a5) +; ZVBB-RV64-NEXT: vs4r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re32.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv28i32_nxv4i32: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -80 +; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZIP-NEXT: addi s0, sp, 80 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: andi sp, sp, -64 +; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZIP-NEXT: vmv2r.v v26, v20 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv2r.v v24, v16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a2, a1, 3 +; ZIP-NEXT: sub a1, a2, a1 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: vmv2r.v v20, v8 +; ZIP-NEXT: vmv1r.v v1, v20 +; ZIP-NEXT: vmv1r.v v3, v22 +; ZIP-NEXT: vmv1r.v v5, v24 +; ZIP-NEXT: vmv1r.v v7, v26 +; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: vmv1r.v v2, v10 +; ZIP-NEXT: add a4, a1, a2 +; ZIP-NEXT: slli a5, a2, 2 +; ZIP-NEXT: vmv1r.v v4, v14 +; ZIP-NEXT: slli a6, a2, 4 +; ZIP-NEXT: add a7, a4, a2 +; ZIP-NEXT: vmv1r.v v6, v18 +; ZIP-NEXT: sub a5, a6, a5 +; ZIP-NEXT: vmv1r.v v22, v11 +; ZIP-NEXT: add a6, a7, a2 +; ZIP-NEXT: vmv1r.v v24, v15 +; 
ZIP-NEXT: vsseg7e32.v v1, (a0) +; ZIP-NEXT: vmv1r.v v26, v19 +; ZIP-NEXT: vsseg7e32.v v21, (a1) +; ZIP-NEXT: vl1re32.v v18, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v19, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v20, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v21, (a6) +; ZIP-NEXT: add a6, a3, a2 +; ZIP-NEXT: vl1re32.v v10, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v11, (a6) +; ZIP-NEXT: vl1re32.v v8, (a0) +; ZIP-NEXT: vl1re32.v v16, (a4) +; ZIP-NEXT: vl1re32.v v9, (a3) +; ZIP-NEXT: vl1re32.v v17, (a7) +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a3, 14 +; ZIP-NEXT: mul a0, a0, a3 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 64 +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v12, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v13, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: slli a2, a2, 3 +; ZIP-NEXT: add a2, a0, a2 +; ZIP-NEXT: vl1re32.v v14, (a6) +; ZIP-NEXT: vl1re32.v v15, (a1) +; ZIP-NEXT: add a5, a0, a5 +; ZIP-NEXT: vs2r.v v20, (a5) +; ZIP-NEXT: vs4r.v v16, (a2) +; ZIP-NEXT: vs8r.v v8, (a0) +; ZIP-NEXT: vl8re32.v v16, (a2) +; ZIP-NEXT: vl8re32.v v8, (a0) +; ZIP-NEXT: addi sp, s0, -80 +; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZIP-NEXT: addi sp, sp, 80 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave7.nxv28i32( %a, %b, %c, %d, %e, %f, %g) + ret %res +} + +define @vector_interleave_nxv14i64_nxv2i64( %a, %b, %c, %d, %e, %f, %g) nounwind { +; +; RV32-LABEL: vector_interleave_nxv14i64_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vmv2r.v v26, v20 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v24, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v20, v8 +; RV32-NEXT: vmv1r.v v1, v20 +; RV32-NEXT: vmv1r.v v3, v22 +; RV32-NEXT: vmv1r.v v5, v24 +; RV32-NEXT: vmv1r.v v7, v26 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v2, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: slli a5, a2, 2 +; RV32-NEXT: vmv1r.v v4, v14 +; RV32-NEXT: slli a6, a2, 4 +; RV32-NEXT: add a7, a4, a2 +; RV32-NEXT: vmv1r.v v6, v18 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: vmv1r.v v22, v11 +; RV32-NEXT: add a6, a7, a2 +; RV32-NEXT: vmv1r.v v24, v15 +; RV32-NEXT: vsseg7e64.v v1, (a0) +; RV32-NEXT: vmv1r.v v26, v19 +; RV32-NEXT: vsseg7e64.v v21, (a1) +; RV32-NEXT: vl1re64.v v18, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v19, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v20, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v21, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re64.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v11, (a6) +; RV32-NEXT: vl1re64.v v8, (a0) +; RV32-NEXT: vl1re64.v v16, (a4) +; RV32-NEXT: vl1re64.v v9, (a3) +; RV32-NEXT: vl1re64.v v17, (a7) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 14 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v12, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v 
v13, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vl1re64.v v14, (a6) +; RV32-NEXT: vl1re64.v v15, (a1) +; RV32-NEXT: add a5, a0, a5 +; RV32-NEXT: vs2r.v v20, (a5) +; RV32-NEXT: vs4r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re64.v v16, (a2) +; RV32-NEXT: vl8re64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv14i64_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64-NEXT: vmv2r.v v26, v20 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v24, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v20, v8 +; RV64-NEXT: vmv1r.v v1, v20 +; RV64-NEXT: vmv1r.v v3, v22 +; RV64-NEXT: vmv1r.v v5, v24 +; RV64-NEXT: vmv1r.v v7, v26 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v2, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: slli a5, a2, 2 +; RV64-NEXT: vmv1r.v v4, v14 +; RV64-NEXT: slli a6, a2, 4 +; RV64-NEXT: add a7, a4, a2 +; RV64-NEXT: vmv1r.v v6, v18 +; RV64-NEXT: sub a5, a6, a5 +; RV64-NEXT: vmv1r.v v22, v11 +; RV64-NEXT: add a6, a7, a2 +; RV64-NEXT: vmv1r.v v24, v15 +; RV64-NEXT: vsseg7e64.v v1, (a0) +; RV64-NEXT: vmv1r.v v26, v19 +; RV64-NEXT: vsseg7e64.v v21, (a1) +; RV64-NEXT: vl1re64.v v18, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v19, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v20, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v21, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re64.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v11, (a6) +; RV64-NEXT: vl1re64.v v8, (a0) +; RV64-NEXT: vl1re64.v v16, (a4) +; RV64-NEXT: vl1re64.v v9, (a3) +; RV64-NEXT: vl1re64.v v17, (a7) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 14 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v12, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v13, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vl1re64.v v14, (a6) +; RV64-NEXT: vl1re64.v v15, (a1) +; RV64-NEXT: add a5, a0, a5 +; RV64-NEXT: vs2r.v v20, (a5) +; RV64-NEXT: vs4r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re64.v v16, (a2) +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv14i64_nxv2i64: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: slli a0, a0, 5 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, 
m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v26, v20 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v24, v16 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 3 +; ZVBB-RV32-NEXT: sub a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v20, v8 +; ZVBB-RV32-NEXT: vmv1r.v v1, v20 +; ZVBB-RV32-NEXT: vmv1r.v v3, v22 +; ZVBB-RV32-NEXT: vmv1r.v v5, v24 +; ZVBB-RV32-NEXT: vmv1r.v v7, v26 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v2, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: slli a5, a2, 2 +; ZVBB-RV32-NEXT: vmv1r.v v4, v14 +; ZVBB-RV32-NEXT: slli a6, a2, 4 +; ZVBB-RV32-NEXT: add a7, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v6, v18 +; ZVBB-RV32-NEXT: sub a5, a6, a5 +; ZVBB-RV32-NEXT: vmv1r.v v22, v11 +; ZVBB-RV32-NEXT: add a6, a7, a2 +; ZVBB-RV32-NEXT: vmv1r.v v24, v15 +; ZVBB-RV32-NEXT: vsseg7e64.v v1, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v26, v19 +; ZVBB-RV32-NEXT: vsseg7e64.v v21, (a1) +; ZVBB-RV32-NEXT: vl1re64.v v18, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v19, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v20, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v21, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re64.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re64.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re64.v v16, (a4) +; ZVBB-RV32-NEXT: vl1re64.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re64.v v17, (a7) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 14 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v12, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v13, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vl1re64.v v14, (a6) +; ZVBB-RV32-NEXT: vl1re64.v v15, (a1) +; ZVBB-RV32-NEXT: add a5, a0, a5 +; ZVBB-RV32-NEXT: vs2r.v v20, (a5) +; ZVBB-RV32-NEXT: vs4r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re64.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re64.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret +; +; ZVBB-RV64-LABEL: vector_interleave_nxv14i64_nxv2i64: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: slli a0, a0, 5 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v26, v20 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v24, v16 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 3 +; ZVBB-RV64-NEXT: sub a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v20, v8 +; ZVBB-RV64-NEXT: vmv1r.v v1, v20 +; ZVBB-RV64-NEXT: vmv1r.v v3, v22 +; ZVBB-RV64-NEXT: vmv1r.v v5, v24 +; ZVBB-RV64-NEXT: vmv1r.v v7, v26 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; 
ZVBB-RV64-NEXT: vmv1r.v v2, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: slli a5, a2, 2 +; ZVBB-RV64-NEXT: vmv1r.v v4, v14 +; ZVBB-RV64-NEXT: slli a6, a2, 4 +; ZVBB-RV64-NEXT: add a7, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v6, v18 +; ZVBB-RV64-NEXT: sub a5, a6, a5 +; ZVBB-RV64-NEXT: vmv1r.v v22, v11 +; ZVBB-RV64-NEXT: add a6, a7, a2 +; ZVBB-RV64-NEXT: vmv1r.v v24, v15 +; ZVBB-RV64-NEXT: vsseg7e64.v v1, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v26, v19 +; ZVBB-RV64-NEXT: vsseg7e64.v v21, (a1) +; ZVBB-RV64-NEXT: vl1re64.v v18, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v19, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v20, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v21, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re64.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re64.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re64.v v16, (a4) +; ZVBB-RV64-NEXT: vl1re64.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re64.v v17, (a7) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 14 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v12, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v13, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vl1re64.v v14, (a6) +; ZVBB-RV64-NEXT: vl1re64.v v15, (a1) +; ZVBB-RV64-NEXT: add a5, a0, a5 +; ZVBB-RV64-NEXT: vs2r.v v20, (a5) +; ZVBB-RV64-NEXT: vs4r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re64.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re64.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv14i64_nxv2i64: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -80 +; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZIP-NEXT: addi s0, sp, 80 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: andi sp, sp, -64 +; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZIP-NEXT: vmv2r.v v26, v20 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv2r.v v24, v16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a2, a1, 3 +; ZIP-NEXT: sub a1, a2, a1 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: vmv2r.v v20, v8 +; ZIP-NEXT: vmv1r.v v1, v20 +; ZIP-NEXT: vmv1r.v v3, v22 +; ZIP-NEXT: vmv1r.v v5, v24 +; ZIP-NEXT: vmv1r.v v7, v26 +; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: vmv1r.v v2, v10 +; ZIP-NEXT: add a4, a1, a2 +; ZIP-NEXT: slli a5, a2, 2 +; ZIP-NEXT: vmv1r.v v4, v14 +; ZIP-NEXT: slli a6, a2, 4 +; ZIP-NEXT: add a7, a4, a2 +; ZIP-NEXT: vmv1r.v v6, v18 +; ZIP-NEXT: sub a5, a6, a5 +; ZIP-NEXT: vmv1r.v v22, v11 +; ZIP-NEXT: add a6, a7, a2 +; ZIP-NEXT: vmv1r.v v24, v15 +; ZIP-NEXT: vsseg7e64.v v1, (a0) +; ZIP-NEXT: vmv1r.v v26, v19 +; ZIP-NEXT: vsseg7e64.v v21, (a1) +; ZIP-NEXT: vl1re64.v v18, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v19, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v20, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v21, (a6) +; ZIP-NEXT: add a6, a3, a2 +; ZIP-NEXT: vl1re64.v v10, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: 
vl1re64.v v11, (a6) +; ZIP-NEXT: vl1re64.v v8, (a0) +; ZIP-NEXT: vl1re64.v v16, (a4) +; ZIP-NEXT: vl1re64.v v9, (a3) +; ZIP-NEXT: vl1re64.v v17, (a7) +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a3, 14 +; ZIP-NEXT: mul a0, a0, a3 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 64 +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v12, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v13, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: slli a2, a2, 3 +; ZIP-NEXT: add a2, a0, a2 +; ZIP-NEXT: vl1re64.v v14, (a6) +; ZIP-NEXT: vl1re64.v v15, (a1) +; ZIP-NEXT: add a5, a0, a5 +; ZIP-NEXT: vs2r.v v20, (a5) +; ZIP-NEXT: vs4r.v v16, (a2) +; ZIP-NEXT: vs8r.v v8, (a0) +; ZIP-NEXT: vl8re64.v v16, (a2) +; ZIP-NEXT: vl8re64.v v8, (a0) +; ZIP-NEXT: addi sp, s0, -80 +; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZIP-NEXT: addi sp, sp, 80 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave7.nxv14i64( %a, %b, %c, %d, %e, %f, %g) + ret %res +} + +define @vector_interleave_nxv128i1_nxv16i1( %a, %b, %c, %d, %e, %f, %g, %h) nounwind { +; CHECK-LABEL: vector_interleave_nxv128i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v22, 0 +; CHECK-NEXT: vmerge.vim v24, v22, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v16, v22, 1, v0 +; CHECK-NEXT: vmv1r.v v1, v24 +; CHECK-NEXT: vmv1r.v v2, v16 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v26, v22, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v18, v22, 1, v0 +; CHECK-NEXT: vmv1r.v v3, v26 +; CHECK-NEXT: vmv1r.v v4, v18 +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vmerge.vim v8, v22, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vim v20, v22, 1, v0 +; CHECK-NEXT: vmv1r.v v5, v8 +; CHECK-NEXT: vmv1r.v v6, v20 +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vim v10, v22, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v14 +; CHECK-NEXT: vmerge.vim v22, v22, 1, v0 +; CHECK-NEXT: vmv1r.v v7, v10 +; CHECK-NEXT: vmv1r.v v8, v22 +; CHECK-NEXT: vmv1r.v v16, v25 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vsseg8e8.v v1, (a2) +; CHECK-NEXT: vmv1r.v v18, v27 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add a3, a2, a0 +; CHECK-NEXT: add a4, a1, a0 +; CHECK-NEXT: add a5, a3, a0 +; CHECK-NEXT: add a6, a4, a0 +; CHECK-NEXT: add a7, a5, a0 +; CHECK-NEXT: add t0, a6, a0 +; CHECK-NEXT: add t1, a7, a0 +; CHECK-NEXT: add t2, t0, a0 +; CHECK-NEXT: vmv1r.v v20, v9 +; CHECK-NEXT: add t3, t1, a0 +; CHECK-NEXT: vmv1r.v v22, v11 +; CHECK-NEXT: vsseg8e8.v v16, (a1) +; CHECK-NEXT: vl1r.v v10, (t1) +; CHECK-NEXT: add t1, t2, a0 +; CHECK-NEXT: vl1r.v v12, (a5) +; CHECK-NEXT: add a5, t3, a0 +; CHECK-NEXT: vl1r.v v14, (a2) +; CHECK-NEXT: add a2, t1, a0 +; CHECK-NEXT: vl1r.v v16, (a5) +; CHECK-NEXT: add a5, a5, a0 +; CHECK-NEXT: vl1r.v v8, (a2) +; CHECK-NEXT: add a2, a2, a0 +; CHECK-NEXT: vl1r.v v18, (t2) +; CHECK-NEXT: vl1r.v v17, (a5) +; CHECK-NEXT: vl1r.v v11, (t3) +; CHECK-NEXT: vl1r.v v13, (a7) +; CHECK-NEXT: vl1r.v v15, (a3) +; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v20, v16, 0 +; CHECK-NEXT: vmsne.vi v16, v10, 0 +; CHECK-NEXT: vl1r.v v10, (a6) +; CHECK-NEXT: vmsne.vi v17, v12, 0 +; CHECK-NEXT: vmsne.vi v0, v14, 0 
+; CHECK-NEXT: vl1r.v v12, (a1) +; CHECK-NEXT: vl1r.v v9, (a2) +; CHECK-NEXT: vl1r.v v19, (t1) +; CHECK-NEXT: vl1r.v v11, (t0) +; CHECK-NEXT: vl1r.v v13, (a4) +; CHECK-NEXT: vmsne.vi v14, v8, 0 +; CHECK-NEXT: vmsne.vi v9, v18, 0 +; CHECK-NEXT: vmsne.vi v15, v10, 0 +; CHECK-NEXT: vmsne.vi v8, v12, 0 +; CHECK-NEXT: srli a1, a0, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v16, v20, a1 +; CHECK-NEXT: vslideup.vx v0, v17, a1 +; CHECK-NEXT: vslideup.vx v9, v14, a1 +; CHECK-NEXT: vslideup.vx v8, v15, a1 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v0, v16, a0 +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv128i1_nxv16i1: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmv.v.i v22, 0 +; ZVBB-NEXT: vmerge.vim v24, v22, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v8 +; ZVBB-NEXT: vmerge.vim v16, v22, 1, v0 +; ZVBB-NEXT: vmv1r.v v1, v24 +; ZVBB-NEXT: vmv1r.v v2, v16 +; ZVBB-NEXT: vmv1r.v v0, v9 +; ZVBB-NEXT: vmerge.vim v26, v22, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v10 +; ZVBB-NEXT: vmerge.vim v18, v22, 1, v0 +; ZVBB-NEXT: vmv1r.v v3, v26 +; ZVBB-NEXT: vmv1r.v v4, v18 +; ZVBB-NEXT: vmv1r.v v0, v11 +; ZVBB-NEXT: vmerge.vim v8, v22, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v12 +; ZVBB-NEXT: vmerge.vim v20, v22, 1, v0 +; ZVBB-NEXT: vmv1r.v v5, v8 +; ZVBB-NEXT: vmv1r.v v6, v20 +; ZVBB-NEXT: vmv1r.v v0, v13 +; ZVBB-NEXT: vmerge.vim v10, v22, 1, v0 +; ZVBB-NEXT: vmv1r.v v0, v14 +; ZVBB-NEXT: vmerge.vim v22, v22, 1, v0 +; ZVBB-NEXT: vmv1r.v v7, v10 +; ZVBB-NEXT: vmv1r.v v8, v22 +; ZVBB-NEXT: vmv1r.v v16, v25 +; ZVBB-NEXT: addi a2, sp, 16 +; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vsseg8e8.v v1, (a2) +; ZVBB-NEXT: vmv1r.v v18, v27 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: add a3, a2, a0 +; ZVBB-NEXT: add a4, a1, a0 +; ZVBB-NEXT: add a5, a3, a0 +; ZVBB-NEXT: add a6, a4, a0 +; ZVBB-NEXT: add a7, a5, a0 +; ZVBB-NEXT: add t0, a6, a0 +; ZVBB-NEXT: add t1, a7, a0 +; ZVBB-NEXT: add t2, t0, a0 +; ZVBB-NEXT: vmv1r.v v20, v9 +; ZVBB-NEXT: add t3, t1, a0 +; ZVBB-NEXT: vmv1r.v v22, v11 +; ZVBB-NEXT: vsseg8e8.v v16, (a1) +; ZVBB-NEXT: vl1r.v v10, (t1) +; ZVBB-NEXT: add t1, t2, a0 +; ZVBB-NEXT: vl1r.v v12, (a5) +; ZVBB-NEXT: add a5, t3, a0 +; ZVBB-NEXT: vl1r.v v14, (a2) +; ZVBB-NEXT: add a2, t1, a0 +; ZVBB-NEXT: vl1r.v v16, (a5) +; ZVBB-NEXT: add a5, a5, a0 +; ZVBB-NEXT: vl1r.v v8, (a2) +; ZVBB-NEXT: add a2, a2, a0 +; ZVBB-NEXT: vl1r.v v18, (t2) +; ZVBB-NEXT: vl1r.v v17, (a5) +; ZVBB-NEXT: vl1r.v v11, (t3) +; ZVBB-NEXT: vl1r.v v13, (a7) +; ZVBB-NEXT: vl1r.v v15, (a3) +; ZVBB-NEXT: vsetvli a3, zero, e8, m2, ta, ma +; ZVBB-NEXT: vmsne.vi v20, v16, 0 +; ZVBB-NEXT: vmsne.vi v16, v10, 0 +; ZVBB-NEXT: vl1r.v v10, (a6) +; ZVBB-NEXT: vmsne.vi v17, v12, 0 +; ZVBB-NEXT: vmsne.vi v0, v14, 0 +; ZVBB-NEXT: vl1r.v v12, (a1) +; ZVBB-NEXT: vl1r.v v9, (a2) +; ZVBB-NEXT: vl1r.v v19, (t1) +; ZVBB-NEXT: vl1r.v v11, (t0) +; ZVBB-NEXT: vl1r.v v13, (a4) +; ZVBB-NEXT: vmsne.vi v14, v8, 0 +; ZVBB-NEXT: vmsne.vi v9, v18, 0 +; ZVBB-NEXT: vmsne.vi v15, v10, 0 +; ZVBB-NEXT: 
vmsne.vi v8, v12, 0 +; ZVBB-NEXT: srli a1, a0, 2 +; ZVBB-NEXT: add a2, a1, a1 +; ZVBB-NEXT: vsetvli zero, a2, e8, mf2, ta, ma +; ZVBB-NEXT: vslideup.vx v16, v20, a1 +; ZVBB-NEXT: vslideup.vx v0, v17, a1 +; ZVBB-NEXT: vslideup.vx v9, v14, a1 +; ZVBB-NEXT: vslideup.vx v8, v15, a1 +; ZVBB-NEXT: srli a0, a0, 1 +; ZVBB-NEXT: add a1, a0, a0 +; ZVBB-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v0, v16, a0 +; ZVBB-NEXT: vslideup.vx v8, v9, a0 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv128i1( %a, %b, %c, %d, %e, %f, %g, %h) + ret %res +} + + +define @vector_interleave_nxv128i8_nxv16i8( %a, %b, %c, %d, %e, %f, %g, %h) nounwind { +; +; CHECK-LABEL: vector_interleave_nxv128i8_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v26, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a0, a3 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: add a5, a2, a3 +; CHECK-NEXT: vmv1r.v v1, v8 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v2, v22 +; CHECK-NEXT: add a7, a5, a3 +; CHECK-NEXT: vmv1r.v v3, v12 +; CHECK-NEXT: add t0, a6, a3 +; CHECK-NEXT: vmv1r.v v4, v24 +; CHECK-NEXT: add t1, a7, a3 +; CHECK-NEXT: vmv1r.v v5, v16 +; CHECK-NEXT: add t2, t0, a3 +; CHECK-NEXT: vmv1r.v v6, v26 +; CHECK-NEXT: add t3, t1, a3 +; CHECK-NEXT: vmv1r.v v7, v20 +; CHECK-NEXT: add t4, t2, a3 +; CHECK-NEXT: vmv1r.v v8, v28 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: add t5, t3, a3 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: add t6, t4, a3 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: vsseg8e8.v v1, (a0) +; CHECK-NEXT: vmv1r.v v28, v21 +; CHECK-NEXT: vsseg8e8.v v22, (a1) +; CHECK-NEXT: vl1r.v v14, (t5) +; CHECK-NEXT: add t5, t5, a3 +; CHECK-NEXT: add a3, t6, a3 +; CHECK-NEXT: vl1r.v v22, (t6) +; CHECK-NEXT: vl1r.v v15, (t5) +; CHECK-NEXT: vl1r.v v23, (a3) +; CHECK-NEXT: vl1r.v v12, (t1) +; CHECK-NEXT: vl1r.v v20, (t2) +; CHECK-NEXT: vl1r.v v13, (t3) +; CHECK-NEXT: vl1r.v v21, (t4) +; CHECK-NEXT: vl1r.v v10, (a5) +; CHECK-NEXT: vl1r.v v18, (a6) +; CHECK-NEXT: vl1r.v v11, (a7) +; CHECK-NEXT: vl1r.v v19, (t0) +; CHECK-NEXT: vl1r.v v8, (a0) +; CHECK-NEXT: vl1r.v v16, (a1) +; CHECK-NEXT: vl1r.v v9, (a2) +; CHECK-NEXT: vl1r.v v17, (a4) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv128i8_nxv16i8: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; ZVBB-NEXT: vmv2r.v v28, v22 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: vmv2r.v v26, v18 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a3, vlenb +; ZVBB-NEXT: add a2, a0, a3 +; ZVBB-NEXT: add a4, a1, a3 +; ZVBB-NEXT: add a5, a2, a3 +; ZVBB-NEXT: vmv1r.v v1, v8 +; ZVBB-NEXT: vmv2r.v v24, v14 +; ZVBB-NEXT: add a6, a4, a3 +; ZVBB-NEXT: vmv2r.v v22, v10 +; ZVBB-NEXT: vmv1r.v 
v2, v22 +; ZVBB-NEXT: add a7, a5, a3 +; ZVBB-NEXT: vmv1r.v v3, v12 +; ZVBB-NEXT: add t0, a6, a3 +; ZVBB-NEXT: vmv1r.v v4, v24 +; ZVBB-NEXT: add t1, a7, a3 +; ZVBB-NEXT: vmv1r.v v5, v16 +; ZVBB-NEXT: add t2, t0, a3 +; ZVBB-NEXT: vmv1r.v v6, v26 +; ZVBB-NEXT: add t3, t1, a3 +; ZVBB-NEXT: vmv1r.v v7, v20 +; ZVBB-NEXT: add t4, t2, a3 +; ZVBB-NEXT: vmv1r.v v8, v28 +; ZVBB-NEXT: vmv1r.v v22, v9 +; ZVBB-NEXT: add t5, t3, a3 +; ZVBB-NEXT: vmv1r.v v24, v13 +; ZVBB-NEXT: add t6, t4, a3 +; ZVBB-NEXT: vmv1r.v v26, v17 +; ZVBB-NEXT: vsseg8e8.v v1, (a0) +; ZVBB-NEXT: vmv1r.v v28, v21 +; ZVBB-NEXT: vsseg8e8.v v22, (a1) +; ZVBB-NEXT: vl1r.v v14, (t5) +; ZVBB-NEXT: add t5, t5, a3 +; ZVBB-NEXT: add a3, t6, a3 +; ZVBB-NEXT: vl1r.v v22, (t6) +; ZVBB-NEXT: vl1r.v v15, (t5) +; ZVBB-NEXT: vl1r.v v23, (a3) +; ZVBB-NEXT: vl1r.v v12, (t1) +; ZVBB-NEXT: vl1r.v v20, (t2) +; ZVBB-NEXT: vl1r.v v13, (t3) +; ZVBB-NEXT: vl1r.v v21, (t4) +; ZVBB-NEXT: vl1r.v v10, (a5) +; ZVBB-NEXT: vl1r.v v18, (a6) +; ZVBB-NEXT: vl1r.v v11, (a7) +; ZVBB-NEXT: vl1r.v v19, (t0) +; ZVBB-NEXT: vl1r.v v8, (a0) +; ZVBB-NEXT: vl1r.v v16, (a1) +; ZVBB-NEXT: vl1r.v v9, (a2) +; ZVBB-NEXT: vl1r.v v17, (a4) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv128i8( %a, %b, %c, %d, %e, %f, %g, %h) + ret %res +} + + +define @vector_interleave_nxv64i16_nxv8i16( %a, %b, %c, %d, %e, %f, %g, %h) nounwind { +; +; CHECK-LABEL: vector_interleave_nxv64i16_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v26, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a0, a3 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: add a5, a2, a3 +; CHECK-NEXT: vmv1r.v v1, v8 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v2, v22 +; CHECK-NEXT: add a7, a5, a3 +; CHECK-NEXT: vmv1r.v v3, v12 +; CHECK-NEXT: add t0, a6, a3 +; CHECK-NEXT: vmv1r.v v4, v24 +; CHECK-NEXT: add t1, a7, a3 +; CHECK-NEXT: vmv1r.v v5, v16 +; CHECK-NEXT: add t2, t0, a3 +; CHECK-NEXT: vmv1r.v v6, v26 +; CHECK-NEXT: add t3, t1, a3 +; CHECK-NEXT: vmv1r.v v7, v20 +; CHECK-NEXT: add t4, t2, a3 +; CHECK-NEXT: vmv1r.v v8, v28 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: add t5, t3, a3 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: add t6, t4, a3 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: vsseg8e16.v v1, (a0) +; CHECK-NEXT: vmv1r.v v28, v21 +; CHECK-NEXT: vsseg8e16.v v22, (a1) +; CHECK-NEXT: vl1re16.v v14, (t5) +; CHECK-NEXT: add t5, t5, a3 +; CHECK-NEXT: add a3, t6, a3 +; CHECK-NEXT: vl1re16.v v22, (t6) +; CHECK-NEXT: vl1re16.v v15, (t5) +; CHECK-NEXT: vl1re16.v v23, (a3) +; CHECK-NEXT: vl1re16.v v12, (t1) +; CHECK-NEXT: vl1re16.v v20, (t2) +; CHECK-NEXT: vl1re16.v v13, (t3) +; CHECK-NEXT: vl1re16.v v21, (t4) +; CHECK-NEXT: vl1re16.v v10, (a5) +; CHECK-NEXT: vl1re16.v v18, (a6) +; CHECK-NEXT: vl1re16.v v11, (a7) +; CHECK-NEXT: vl1re16.v v19, (t0) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v16, (a1) +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: vl1re16.v v17, (a4) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, 
a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv64i16_nxv8i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-NEXT: vmv2r.v v28, v22 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: vmv2r.v v26, v18 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a3, vlenb +; ZVBB-NEXT: add a2, a0, a3 +; ZVBB-NEXT: add a4, a1, a3 +; ZVBB-NEXT: add a5, a2, a3 +; ZVBB-NEXT: vmv1r.v v1, v8 +; ZVBB-NEXT: vmv2r.v v24, v14 +; ZVBB-NEXT: add a6, a4, a3 +; ZVBB-NEXT: vmv2r.v v22, v10 +; ZVBB-NEXT: vmv1r.v v2, v22 +; ZVBB-NEXT: add a7, a5, a3 +; ZVBB-NEXT: vmv1r.v v3, v12 +; ZVBB-NEXT: add t0, a6, a3 +; ZVBB-NEXT: vmv1r.v v4, v24 +; ZVBB-NEXT: add t1, a7, a3 +; ZVBB-NEXT: vmv1r.v v5, v16 +; ZVBB-NEXT: add t2, t0, a3 +; ZVBB-NEXT: vmv1r.v v6, v26 +; ZVBB-NEXT: add t3, t1, a3 +; ZVBB-NEXT: vmv1r.v v7, v20 +; ZVBB-NEXT: add t4, t2, a3 +; ZVBB-NEXT: vmv1r.v v8, v28 +; ZVBB-NEXT: vmv1r.v v22, v9 +; ZVBB-NEXT: add t5, t3, a3 +; ZVBB-NEXT: vmv1r.v v24, v13 +; ZVBB-NEXT: add t6, t4, a3 +; ZVBB-NEXT: vmv1r.v v26, v17 +; ZVBB-NEXT: vsseg8e16.v v1, (a0) +; ZVBB-NEXT: vmv1r.v v28, v21 +; ZVBB-NEXT: vsseg8e16.v v22, (a1) +; ZVBB-NEXT: vl1re16.v v14, (t5) +; ZVBB-NEXT: add t5, t5, a3 +; ZVBB-NEXT: add a3, t6, a3 +; ZVBB-NEXT: vl1re16.v v22, (t6) +; ZVBB-NEXT: vl1re16.v v15, (t5) +; ZVBB-NEXT: vl1re16.v v23, (a3) +; ZVBB-NEXT: vl1re16.v v12, (t1) +; ZVBB-NEXT: vl1re16.v v20, (t2) +; ZVBB-NEXT: vl1re16.v v13, (t3) +; ZVBB-NEXT: vl1re16.v v21, (t4) +; ZVBB-NEXT: vl1re16.v v10, (a5) +; ZVBB-NEXT: vl1re16.v v18, (a6) +; ZVBB-NEXT: vl1re16.v v11, (a7) +; ZVBB-NEXT: vl1re16.v v19, (t0) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v16, (a1) +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: vl1re16.v v17, (a4) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv64i16( %a, %b, %c, %d, %e, %f, %g, %h) + ret %res +} + + +define @vector_interleave_nxv32i32_nxv4i32( %a, %b, %c, %d, %e, %f, %g, %h) nounwind { +; +; CHECK-LABEL: vector_interleave_nxv32i32_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v26, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a0, a3 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: add a5, a2, a3 +; CHECK-NEXT: vmv1r.v v1, v8 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v2, v22 +; CHECK-NEXT: add a7, a5, a3 +; CHECK-NEXT: vmv1r.v v3, v12 +; CHECK-NEXT: add t0, a6, a3 +; CHECK-NEXT: vmv1r.v v4, v24 +; CHECK-NEXT: add t1, a7, a3 +; CHECK-NEXT: vmv1r.v v5, v16 +; CHECK-NEXT: add t2, t0, a3 +; CHECK-NEXT: vmv1r.v v6, v26 +; CHECK-NEXT: add t3, t1, a3 +; CHECK-NEXT: vmv1r.v v7, v20 +; CHECK-NEXT: add t4, t2, a3 +; CHECK-NEXT: vmv1r.v v8, v28 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: add t5, t3, a3 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: add t6, t4, a3 +; CHECK-NEXT: vmv1r.v v26, v17 +; 
CHECK-NEXT: vsseg8e32.v v1, (a0) +; CHECK-NEXT: vmv1r.v v28, v21 +; CHECK-NEXT: vsseg8e32.v v22, (a1) +; CHECK-NEXT: vl1re32.v v14, (t5) +; CHECK-NEXT: add t5, t5, a3 +; CHECK-NEXT: add a3, t6, a3 +; CHECK-NEXT: vl1re32.v v22, (t6) +; CHECK-NEXT: vl1re32.v v15, (t5) +; CHECK-NEXT: vl1re32.v v23, (a3) +; CHECK-NEXT: vl1re32.v v12, (t1) +; CHECK-NEXT: vl1re32.v v20, (t2) +; CHECK-NEXT: vl1re32.v v13, (t3) +; CHECK-NEXT: vl1re32.v v21, (t4) +; CHECK-NEXT: vl1re32.v v10, (a5) +; CHECK-NEXT: vl1re32.v v18, (a6) +; CHECK-NEXT: vl1re32.v v11, (a7) +; CHECK-NEXT: vl1re32.v v19, (t0) +; CHECK-NEXT: vl1re32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v16, (a1) +; CHECK-NEXT: vl1re32.v v9, (a2) +; CHECK-NEXT: vl1re32.v v17, (a4) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv32i32_nxv4i32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-NEXT: vmv2r.v v28, v22 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: vmv2r.v v26, v18 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a3, vlenb +; ZVBB-NEXT: add a2, a0, a3 +; ZVBB-NEXT: add a4, a1, a3 +; ZVBB-NEXT: add a5, a2, a3 +; ZVBB-NEXT: vmv1r.v v1, v8 +; ZVBB-NEXT: vmv2r.v v24, v14 +; ZVBB-NEXT: add a6, a4, a3 +; ZVBB-NEXT: vmv2r.v v22, v10 +; ZVBB-NEXT: vmv1r.v v2, v22 +; ZVBB-NEXT: add a7, a5, a3 +; ZVBB-NEXT: vmv1r.v v3, v12 +; ZVBB-NEXT: add t0, a6, a3 +; ZVBB-NEXT: vmv1r.v v4, v24 +; ZVBB-NEXT: add t1, a7, a3 +; ZVBB-NEXT: vmv1r.v v5, v16 +; ZVBB-NEXT: add t2, t0, a3 +; ZVBB-NEXT: vmv1r.v v6, v26 +; ZVBB-NEXT: add t3, t1, a3 +; ZVBB-NEXT: vmv1r.v v7, v20 +; ZVBB-NEXT: add t4, t2, a3 +; ZVBB-NEXT: vmv1r.v v8, v28 +; ZVBB-NEXT: vmv1r.v v22, v9 +; ZVBB-NEXT: add t5, t3, a3 +; ZVBB-NEXT: vmv1r.v v24, v13 +; ZVBB-NEXT: add t6, t4, a3 +; ZVBB-NEXT: vmv1r.v v26, v17 +; ZVBB-NEXT: vsseg8e32.v v1, (a0) +; ZVBB-NEXT: vmv1r.v v28, v21 +; ZVBB-NEXT: vsseg8e32.v v22, (a1) +; ZVBB-NEXT: vl1re32.v v14, (t5) +; ZVBB-NEXT: add t5, t5, a3 +; ZVBB-NEXT: add a3, t6, a3 +; ZVBB-NEXT: vl1re32.v v22, (t6) +; ZVBB-NEXT: vl1re32.v v15, (t5) +; ZVBB-NEXT: vl1re32.v v23, (a3) +; ZVBB-NEXT: vl1re32.v v12, (t1) +; ZVBB-NEXT: vl1re32.v v20, (t2) +; ZVBB-NEXT: vl1re32.v v13, (t3) +; ZVBB-NEXT: vl1re32.v v21, (t4) +; ZVBB-NEXT: vl1re32.v v10, (a5) +; ZVBB-NEXT: vl1re32.v v18, (a6) +; ZVBB-NEXT: vl1re32.v v11, (a7) +; ZVBB-NEXT: vl1re32.v v19, (t0) +; ZVBB-NEXT: vl1re32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v16, (a1) +; ZVBB-NEXT: vl1re32.v v9, (a2) +; ZVBB-NEXT: vl1re32.v v17, (a4) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv32i32( %a, %b, %c, %d, %e, %f, %g, %h) + ret %res +} + +define @vector_interleave_nxv16i64_nxv2i64( %a, %b, %c, %d, %e, %f, %g, %h) nounwind { +; +; CHECK-LABEL: vector_interleave_nxv16i64_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v26, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, 
a1, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a0, a3 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: add a5, a2, a3 +; CHECK-NEXT: vmv1r.v v1, v8 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v2, v22 +; CHECK-NEXT: add a7, a5, a3 +; CHECK-NEXT: vmv1r.v v3, v12 +; CHECK-NEXT: add t0, a6, a3 +; CHECK-NEXT: vmv1r.v v4, v24 +; CHECK-NEXT: add t1, a7, a3 +; CHECK-NEXT: vmv1r.v v5, v16 +; CHECK-NEXT: add t2, t0, a3 +; CHECK-NEXT: vmv1r.v v6, v26 +; CHECK-NEXT: add t3, t1, a3 +; CHECK-NEXT: vmv1r.v v7, v20 +; CHECK-NEXT: add t4, t2, a3 +; CHECK-NEXT: vmv1r.v v8, v28 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: add t5, t3, a3 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: add t6, t4, a3 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: vsseg8e64.v v1, (a0) +; CHECK-NEXT: vmv1r.v v28, v21 +; CHECK-NEXT: vsseg8e64.v v22, (a1) +; CHECK-NEXT: vl1re64.v v14, (t5) +; CHECK-NEXT: add t5, t5, a3 +; CHECK-NEXT: add a3, t6, a3 +; CHECK-NEXT: vl1re64.v v22, (t6) +; CHECK-NEXT: vl1re64.v v15, (t5) +; CHECK-NEXT: vl1re64.v v23, (a3) +; CHECK-NEXT: vl1re64.v v12, (t1) +; CHECK-NEXT: vl1re64.v v20, (t2) +; CHECK-NEXT: vl1re64.v v13, (t3) +; CHECK-NEXT: vl1re64.v v21, (t4) +; CHECK-NEXT: vl1re64.v v10, (a5) +; CHECK-NEXT: vl1re64.v v18, (a6) +; CHECK-NEXT: vl1re64.v v11, (a7) +; CHECK-NEXT: vl1re64.v v19, (t0) +; CHECK-NEXT: vl1re64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v16, (a1) +; CHECK-NEXT: vl1re64.v v9, (a2) +; CHECK-NEXT: vl1re64.v v17, (a4) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16i64_nxv2i64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-NEXT: vmv2r.v v28, v22 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: vmv2r.v v26, v18 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a3, vlenb +; ZVBB-NEXT: add a2, a0, a3 +; ZVBB-NEXT: add a4, a1, a3 +; ZVBB-NEXT: add a5, a2, a3 +; ZVBB-NEXT: vmv1r.v v1, v8 +; ZVBB-NEXT: vmv2r.v v24, v14 +; ZVBB-NEXT: add a6, a4, a3 +; ZVBB-NEXT: vmv2r.v v22, v10 +; ZVBB-NEXT: vmv1r.v v2, v22 +; ZVBB-NEXT: add a7, a5, a3 +; ZVBB-NEXT: vmv1r.v v3, v12 +; ZVBB-NEXT: add t0, a6, a3 +; ZVBB-NEXT: vmv1r.v v4, v24 +; ZVBB-NEXT: add t1, a7, a3 +; ZVBB-NEXT: vmv1r.v v5, v16 +; ZVBB-NEXT: add t2, t0, a3 +; ZVBB-NEXT: vmv1r.v v6, v26 +; ZVBB-NEXT: add t3, t1, a3 +; ZVBB-NEXT: vmv1r.v v7, v20 +; ZVBB-NEXT: add t4, t2, a3 +; ZVBB-NEXT: vmv1r.v v8, v28 +; ZVBB-NEXT: vmv1r.v v22, v9 +; ZVBB-NEXT: add t5, t3, a3 +; ZVBB-NEXT: vmv1r.v v24, v13 +; ZVBB-NEXT: add t6, t4, a3 +; ZVBB-NEXT: vmv1r.v v26, v17 +; ZVBB-NEXT: vsseg8e64.v v1, (a0) +; ZVBB-NEXT: vmv1r.v v28, v21 +; ZVBB-NEXT: vsseg8e64.v v22, (a1) +; ZVBB-NEXT: vl1re64.v v14, (t5) +; ZVBB-NEXT: add t5, t5, a3 +; ZVBB-NEXT: add a3, t6, a3 +; ZVBB-NEXT: vl1re64.v v22, (t6) +; ZVBB-NEXT: vl1re64.v v15, (t5) +; ZVBB-NEXT: vl1re64.v v23, (a3) +; ZVBB-NEXT: vl1re64.v v12, (t1) +; ZVBB-NEXT: vl1re64.v v20, (t2) +; ZVBB-NEXT: vl1re64.v v13, (t3) +; ZVBB-NEXT: vl1re64.v v21, (t4) +; ZVBB-NEXT: vl1re64.v v10, (a5) +; ZVBB-NEXT: vl1re64.v v18, (a6) +; ZVBB-NEXT: vl1re64.v v11, (a7) +; ZVBB-NEXT: vl1re64.v v19, (t0) +; ZVBB-NEXT: vl1re64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v16, (a1) +; ZVBB-NEXT: vl1re64.v v9, (a2) +; 
ZVBB-NEXT: vl1re64.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x i64> @llvm.vector.interleave8.nxv16i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g, <vscale x 2 x i64> %h)
+ ret <vscale x 16 x i64> %res
+}
+
+; Floats
+
+define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vwaddu.vv v10, v8, v9
+; V-NEXT: li a0, -1
+; V-NEXT: csrr a1, vlenb
+; V-NEXT: vwmaccu.vx v10, a0, v9
+; V-NEXT: srli a1, a1, 2
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vslidedown.vx v8, v10, a1
+; V-NEXT: add a0, a1, a1
+; V-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; V-NEXT: vslideup.vx v10, v8, a1
+; V-NEXT: vmv.v.v v8, v10
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vslidedown.vx v8, v10, a0
+; ZVBB-NEXT: add a1, a0, a0
+; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a0
+; ZVBB-NEXT: vmv.v.v v8, v10
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v11, v8, v9
+; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: srli a0, a0, 2
+; ZIP-NEXT: add a1, a0, a0
+; ZIP-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; ZIP-NEXT: vslideup.vx v10, v11, a0
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.interleave2.nxv4bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vmv1r.v v10, v9
+; V-NEXT: vmv1r.v v11, v8
+; V-NEXT: vwaddu.vv v8, v11, v10
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v10
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vmv1r.v v10, v9
+; ZVBB-NEXT: vmv1r.v v11, v8
+; ZVBB-NEXT: vwsll.vi v8, v10, 16
+; ZVBB-NEXT: vwaddu.wv v8, v8, v11
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv1r.v v10, v9
+; ZIP-NEXT: vmv1r.v v11, v8
+; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; V-LABEL: vector_interleave_nxv4f16_nxv2f16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vwaddu.vv v10, v8, v9
+; V-NEXT: li a0, -1
+; V-NEXT: csrr a1, vlenb
+; V-NEXT: vwmaccu.vx v10, a0, v9
+; V-NEXT: srli a1, a1, 2
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vslidedown.vx v8, v10, a1
+; V-NEXT: add a0, a1, a1
+; V-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; V-NEXT: vslideup.vx v10, v8, a1
+; V-NEXT: vmv.v.v v8, v10
+; V-NEXT: ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv4f16_nxv2f16:
 ; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: vwaddu.wv
v10, v10, v8
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vslidedown.vx v8, v10, a0
+; ZVBB-NEXT: add a1, a0, a0
+; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a0
+; ZVBB-NEXT: vmv.v.v v8, v10
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv4f16_nxv2f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v11, v8, v9
+; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: srli a0, a0, 2
+; ZIP-NEXT: add a1, a0, a0
+; ZIP-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; ZIP-NEXT: vslideup.vx v10, v11, a0
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; V-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vmv1r.v v10, v9
+; V-NEXT: vmv1r.v v11, v8
+; V-NEXT: vwaddu.vv v8, v11, v10
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v10
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vmv1r.v v10, v9
+; ZVBB-NEXT: vmv1r.v v11, v8
+; ZVBB-NEXT: vwsll.vi v8, v10, 16
+; ZVBB-NEXT: vwaddu.wv v8, v8, v11
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv1r.v v10, v9
+; ZIP-NEXT: vmv1r.v v11, v8
+; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; V-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; V-NEXT: vmv1r.v v10, v9
+; V-NEXT: vmv1r.v v11, v8
+; V-NEXT: vwaddu.vv v8, v11, v10
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v10
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vmv1r.v v10, v9
+; ZVBB-NEXT: vmv1r.v v11, v8
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vwsll.vx v8, v10, a0
+; ZVBB-NEXT: vwaddu.wv v8, v8, v11
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT: vmv1r.v v10, v9
+; ZIP-NEXT: vmv1r.v v11, v8
+; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; V-NEXT: vmv2r.v v12, v10
+; V-NEXT: vmv2r.v v14, v8
+; V-NEXT: vwaddu.vv v8, v14, v12
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v12
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vmv2r.v v12, v10
+; ZVBB-NEXT: vmv2r.v v14, v8
+; ZVBB-NEXT: vwsll.vi v8, v12, 16
+; ZVBB-NEXT: vwaddu.wv v8, v8, v14
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
+ %res = call
@llvm.vector.interleave2.nxv16bf16( %a, %b) + ret %res +} + +define @vector_interleave_nxv16f16_nxv8f16( %a, %b) { +; V-LABEL: vector_interleave_nxv16f16_nxv8f16: +; V: # %bb.0: +; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; V-NEXT: vmv2r.v v12, v10 +; V-NEXT: vmv2r.v v14, v8 +; V-NEXT: vwaddu.vv v8, v14, v12 +; V-NEXT: li a0, -1 +; V-NEXT: vwmaccu.vx v8, a0, v12 +; V-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVBB-NEXT: vmv2r.v v12, v10 +; ZVBB-NEXT: vmv2r.v v14, v8 +; ZVBB-NEXT: vwsll.vi v8, v12, 16 +; ZVBB-NEXT: vwaddu.wv v8, v8, v14 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv16f16_nxv8f16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZIP-NEXT: vmv2r.v v12, v10 +; ZIP-NEXT: vmv2r.v v14, v8 +; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12 +; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave2.nxv16f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f32_nxv4f32( %a, %b) { +; V-LABEL: vector_interleave_nxv8f32_nxv4f32: +; V: # %bb.0: +; V-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; V-NEXT: vmv2r.v v12, v10 +; V-NEXT: vmv2r.v v14, v8 +; V-NEXT: vwaddu.vv v8, v14, v12 +; V-NEXT: li a0, -1 +; V-NEXT: vwmaccu.vx v8, a0, v12 +; V-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; ZVBB-NEXT: vmv2r.v v12, v10 +; ZVBB-NEXT: vmv2r.v v14, v8 +; ZVBB-NEXT: li a0, 32 +; ZVBB-NEXT: vwsll.vx v8, v12, a0 +; ZVBB-NEXT: vwaddu.wv v8, v8, v14 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv8f32_nxv4f32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; ZIP-NEXT: vmv2r.v v12, v10 +; ZIP-NEXT: vmv2r.v v14, v8 +; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12 +; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave2.nxv8f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f64_nxv2f64( %a, %b) { +; V-LABEL: vector_interleave_nxv4f64_nxv2f64: +; V: # %bb.0: +; V-NEXT: csrr a0, vlenb +; V-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; V-NEXT: vid.v v12 +; V-NEXT: srli a0, a0, 2 +; V-NEXT: vand.vi v13, v12, 1 +; V-NEXT: vmsne.vi v0, v13, 0 +; V-NEXT: vsrl.vi v16, v12, 1 +; V-NEXT: vadd.vx v16, v16, a0, v0.t +; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; V-NEXT: vrgatherei16.vv v12, v8, v16 +; V-NEXT: vmv.v.v v8, v12 +; V-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; ZVBB-NEXT: vid.v v12 +; ZVBB-NEXT: srli a0, a0, 2 +; ZVBB-NEXT: vand.vi v13, v12, 1 +; ZVBB-NEXT: vmsne.vi v0, v13, 0 +; ZVBB-NEXT: vsrl.vi v16, v12, 1 +; ZVBB-NEXT: vadd.vx v16, v16, a0, v0.t +; ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16 +; ZVBB-NEXT: vmv.v.v v8, v12 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv4f64_nxv2f64: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; ZIP-NEXT: vmv2r.v v12, v10 +; ZIP-NEXT: vmv2r.v v14, v8 +; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12 +; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave2.nxv4f64( %a, %b) + ret %res +} + + + +define @vector_interleave_nxv64bf16_nxv32bf16( %a, %b) { +; V-LABEL: vector_interleave_nxv64bf16_nxv32bf16: +; V: # %bb.0: +; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; V-NEXT: vmv8r.v v24, v8 +; V-NEXT: vwaddu.vv v8, v24, v16 +; V-NEXT: li a0, -1 +; V-NEXT: vwaddu.vv v0, 
v28, v20 +; V-NEXT: vwmaccu.vx v8, a0, v16 +; V-NEXT: vwmaccu.vx v0, a0, v20 +; V-NEXT: vmv8r.v v16, v0 +; V-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv32bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVBB-NEXT: vwsll.vi v24, v16, 16 +; ZVBB-NEXT: vwsll.vi v0, v20, 16 +; ZVBB-NEXT: vwaddu.wv v24, v24, v8 +; ZVBB-NEXT: vwaddu.wv v0, v0, v12 +; ZVBB-NEXT: vmv8r.v v8, v24 +; ZVBB-NEXT: vmv8r.v v16, v0 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv64bf16_nxv32bf16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16 +; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20 +; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16 +; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20 +; ZIP-NEXT: vmv8r.v v8, v24 +; ZIP-NEXT: vmv8r.v v16, v0 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave2.nxv64bf16( %a, %b) + ret %res +} + +define @vector_interleave_nxv64f16_nxv32f16( %a, %b) { +; V-LABEL: vector_interleave_nxv64f16_nxv32f16: +; V: # %bb.0: +; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; V-NEXT: vmv8r.v v24, v8 +; V-NEXT: vwaddu.vv v8, v24, v16 +; V-NEXT: li a0, -1 +; V-NEXT: vwaddu.vv v0, v28, v20 +; V-NEXT: vwmaccu.vx v8, a0, v16 +; V-NEXT: vwmaccu.vx v0, a0, v20 +; V-NEXT: vmv8r.v v16, v0 +; V-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVBB-NEXT: vwsll.vi v24, v16, 16 +; ZVBB-NEXT: vwsll.vi v0, v20, 16 +; ZVBB-NEXT: vwaddu.wv v24, v24, v8 +; ZVBB-NEXT: vwaddu.wv v0, v0, v12 +; ZVBB-NEXT: vmv8r.v v8, v24 +; ZVBB-NEXT: vmv8r.v v16, v0 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv64f16_nxv32f16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16 +; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20 +; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16 +; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20 +; ZIP-NEXT: vmv8r.v v8, v24 +; ZIP-NEXT: vmv8r.v v16, v0 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave2.nxv64f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv32f32_nxv16f32( %a, %b) { +; V-LABEL: vector_interleave_nxv32f32_nxv16f32: +; V: # %bb.0: +; V-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; V-NEXT: vmv8r.v v24, v8 +; V-NEXT: vwaddu.vv v8, v24, v16 +; V-NEXT: li a0, -1 +; V-NEXT: vwaddu.vv v0, v28, v20 +; V-NEXT: vwmaccu.vx v8, a0, v16 +; V-NEXT: vwmaccu.vx v0, a0, v20 +; V-NEXT: vmv8r.v v16, v0 +; V-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv32f32_nxv16f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: li a0, 32 +; ZVBB-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; ZVBB-NEXT: vwsll.vx v24, v16, a0 +; ZVBB-NEXT: vwsll.vx v0, v20, a0 +; ZVBB-NEXT: vwaddu.wv v24, v24, v8 +; ZVBB-NEXT: vwaddu.wv v0, v0, v12 +; ZVBB-NEXT: vmv8r.v v8, v24 +; ZVBB-NEXT: vmv8r.v v16, v0 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv32f32_nxv16f32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16 +; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20 +; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16 +; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20 +; ZIP-NEXT: vmv8r.v v8, v24 +; ZIP-NEXT: vmv8r.v v16, v0 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave2.nxv32f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv16f64_nxv8f64( %a, %b) { +; V-LABEL: vector_interleave_nxv16f64_nxv8f64: +; V: # %bb.0: +; V-NEXT: csrr a0, vlenb +; V-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; V-NEXT: vid.v v6 +; V-NEXT: vmv8r.v v24, v8 +; V-NEXT: srli a0, a0, 1 +; V-NEXT: vmv4r.v v28, v16 +; V-NEXT: vmv4r.v v16, v12 +; V-NEXT: 
vand.vi v8, v6, 1 +; V-NEXT: vmsne.vi v0, v8, 0 +; V-NEXT: vsrl.vi v6, v6, 1 +; V-NEXT: vadd.vx v6, v6, a0, v0.t +; V-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; V-NEXT: vrgatherei16.vv v8, v24, v6 +; V-NEXT: vrgatherei16.vv v24, v16, v6 +; V-NEXT: vmv.v.v v16, v24 +; V-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; ZVBB-NEXT: vid.v v6 +; ZVBB-NEXT: vmv8r.v v24, v8 +; ZVBB-NEXT: srli a0, a0, 1 +; ZVBB-NEXT: vmv4r.v v28, v16 +; ZVBB-NEXT: vmv4r.v v16, v12 +; ZVBB-NEXT: vand.vi v8, v6, 1 +; ZVBB-NEXT: vmsne.vi v0, v8, 0 +; ZVBB-NEXT: vsrl.vi v6, v6, 1 +; ZVBB-NEXT: vadd.vx v6, v6, a0, v0.t +; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; ZVBB-NEXT: vrgatherei16.vv v8, v24, v6 +; ZVBB-NEXT: vrgatherei16.vv v24, v16, v6 +; ZVBB-NEXT: vmv.v.v v16, v24 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv16f64_nxv8f64: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16 +; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20 +; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16 +; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20 +; ZIP-NEXT: vmv8r.v v8, v24 +; ZIP-NEXT: vmv8r.v v16, v0 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave2.nxv16f64( %a, %b) + ret %res +} + +define @vector_interleave_nxv6f16_nxv2f16( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv6f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg3e16.v v8, (a0) +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vle16.v v9, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a0, a1, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a1 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv6f16_nxv2f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg3e16.v v8, (a0) +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vle16.v v9, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a0, a1, a1 +; ZVBB-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v9, a1 +; ZVBB-NEXT: add a2, a3, a2 +; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv6f16( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv12f16_nxv4f16( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv12f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, 
e16, m1, ta, ma +; CHECK-NEXT: vsseg3e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re16.v v9, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re16.v v10, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv12f16_nxv4f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg3e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re16.v v9, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re16.v v10, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv12f16( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv24f16_nxv8f16( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv24f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsseg3e16.v v8, (a0) +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re16.v v10, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re16.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv24f16_nxv8f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; ZVBB-NEXT: vsseg3e16.v v8, (a0) +; ZVBB-NEXT: vl2re16.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re16.v v10, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re16.v v12, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv24f16( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv6bf16_nxv2bf16( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv6bf16_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg3e16.v v8, (a0) +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vle16.v v9, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a0, a1, a1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a1 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; 
CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv6bf16_nxv2bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg3e16.v v8, (a0) +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vle16.v v9, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a0, a1, a1 +; ZVBB-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v9, a1 +; ZVBB-NEXT: add a2, a3, a2 +; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv6bf16( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv12bf16_nxv4bf16( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv12bf16_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vsseg3e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re16.v v9, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re16.v v10, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv12bf16_nxv4bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg3e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re16.v v9, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re16.v v10, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv12bf16( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv24bf16_nxv8bf16( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv24bf16_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; CHECK-NEXT: vsseg3e16.v v8, (a0) +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re16.v v10, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re16.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv8bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 
+; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma +; ZVBB-NEXT: vsseg3e16.v v8, (a0) +; ZVBB-NEXT: vl2re16.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re16.v v10, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re16.v v12, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv24bf16( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv3f32_nxv1f32( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv3f32_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vle32.v v9, (a3) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: srli a1, a1, 3 +; CHECK-NEXT: add a0, a1, a1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a1 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv3f32_nxv1f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg3e32.v v8, (a0) +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vle32.v v9, (a3) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: srli a1, a1, 3 +; ZVBB-NEXT: add a0, a1, a1 +; ZVBB-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v9, a1 +; ZVBB-NEXT: add a2, a3, a2 +; ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vle32.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv3f32( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv6f32_nxv2f32( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv6f32_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re32.v v9, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re32.v v10, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv6f32_nxv2f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; 
ZVBB-NEXT: vsseg3e32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re32.v v9, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re32.v v10, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv6f32( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv12f32_nxv4f32( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv12f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: vl2re32.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re32.v v10, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re32.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv12f32_nxv4f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; ZVBB-NEXT: vsseg3e32.v v8, (a0) +; ZVBB-NEXT: vl2re32.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re32.v v10, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re32.v v12, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv12f32( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv3f64_nxv1f64( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv3f64_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; CHECK-NEXT: vsseg3e64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re64.v v9, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re64.v v10, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv3f64_nxv1f64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; ZVBB-NEXT: vsseg3e64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re64.v v9, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re64.v v10, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv3f64( %v0, %v1, %v2) + ret %res +} + +define 
@vector_interleave_nxv6f64_nxv2f64( %v0, %v1, %v2) nounwind { +; CHECK-LABEL: vector_interleave_nxv6f64_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg3e64.v v8, (a0) +; CHECK-NEXT: vl2re64.v v8, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re64.v v10, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl2re64.v v12, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv6f64_nxv2f64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: vsetvli a2, zero, e64, m2, ta, ma +; ZVBB-NEXT: vsseg3e64.v v8, (a0) +; ZVBB-NEXT: vl2re64.v v8, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re64.v v10, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl2re64.v v12, (a0) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave3.nxv6f64( %v0, %v1, %v2) + ret %res +} + +define @vector_interleave_nxv8f16_nxv2f16( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv8f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v10, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv8f16_nxv2f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a2, a4, a2 +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: vle16.v v8, (a2) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a1, a1 +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v10, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v10, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, 
a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv8f16( %v0, %v1, %v2, %v3) + ret %res +} + +define @vector_interleave_nxv16f16_nxv4f16( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv16f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v10, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1re16.v v11, (a1) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16f16_nxv4f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v10, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1re16.v v11, (a1) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv16f16( %v0, %v1, %v2, %v3) + ret %res +} + +define @vector_interleave_nxv32f16_nxv8f16( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv32f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vl2re16.v v12, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl2re16.v v14, (a1) +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: vl2re16.v v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv32f16_nxv8f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e16, m2, ta, ma +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vl2re16.v v12, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl2re16.v v14, (a1) +; ZVBB-NEXT: vl2re16.v v8, (a0) +; ZVBB-NEXT: vl2re16.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv32f16( %v0, %v1, %v2, %v3) + ret %res +} + +define @vector_interleave_nxv8bf16_nxv2bf16( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv8bf16_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; 
CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v10, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv2bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a2, a4, a2 +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: vle16.v v8, (a2) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a1, a1 +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v10, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v10, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv8bf16( %v0, %v1, %v2, %v3) + ret %res +} + +define @vector_interleave_nxv16bf16_nxv4bf16( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv16bf16_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v10, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1re16.v v11, (a1) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv4bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: vwaddu.wv v10, v10, v8 -; ZVBB-NEXT: srli a0, a0, 2 -; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVBB-NEXT: vslidedown.vx v8, v10, a0 -; ZVBB-NEXT: add a1, a0, a0 -; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v10, v8, a0 -; ZVBB-NEXT: vmv.v.v v8, v10 +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v10, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1re16.v v11, (a1) +; ZVBB-NEXT: vl1re16.v v8, 
(a0) +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv16bf16( %v0, %v1, %v2, %v3) + ret %res +} + +define @vector_interleave_nxv32bf16_nxv8bf16( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv32bf16_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vl2re16.v v12, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl2re16.v v14, (a1) +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: vl2re16.v v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv4f16_nxv2f16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZIP-NEXT: ri.vzip2b.vv v11, v8, v9 -; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: srli a0, a0, 2 -; ZIP-NEXT: add a1, a0, a0 -; ZIP-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vx v10, v11, a0 -; ZIP-NEXT: vmv.v.v v8, v10 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv4f16( %a, %b) - ret %res +; ZVBB-LABEL: vector_interleave_nxv32bf16_nxv8bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e16, m2, ta, ma +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vl2re16.v v12, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl2re16.v v14, (a1) +; ZVBB-NEXT: vl2re16.v v8, (a0) +; ZVBB-NEXT: vl2re16.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv32bf16( %v0, %v1, %v2, %v3) + ret %res } -define @vector_interleave_nxv8f16_nxv4f16( %a, %b) { -; V-LABEL: vector_interleave_nxv8f16_nxv4f16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; V-NEXT: vmv1r.v v10, v9 -; V-NEXT: vmv1r.v v11, v8 -; V-NEXT: vwaddu.vv v8, v11, v10 -; V-NEXT: li a0, -1 -; V-NEXT: vwmaccu.vx v8, a0, v10 -; V-NEXT: ret +define @vector_interleave_nxv4f32_nxv1f32( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv4f32_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vsetvli a4, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: vle32.v v9, (a4) +; CHECK-NEXT: vle32.v v8, (a2) +; CHECK-NEXT: srli a1, a1, 3 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a4, zero, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v10, (a3) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli 
zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16: +; ZVBB-LABEL: vector_interleave_nxv4f32_nxv1f32: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZVBB-NEXT: vmv1r.v v10, v9 -; ZVBB-NEXT: vmv1r.v v11, v8 -; ZVBB-NEXT: vwsll.vi v8, v10, 16 -; ZVBB-NEXT: vwaddu.wv v8, v8, v11 +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vsetvli a4, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg4e32.v v8, (a0) +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a2, a4, a2 +; ZVBB-NEXT: vle32.v v9, (a4) +; ZVBB-NEXT: vle32.v v8, (a2) +; ZVBB-NEXT: srli a1, a1, 3 +; ZVBB-NEXT: add a2, a1, a1 +; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a4, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vle32.v v10, (a3) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v10, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv4f32( %v0, %v1, %v2, %v3) + ret %res +} + +define @vector_interleave_nxv8f32_nxv2f32( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv8f32_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v10, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1re32.v v11, (a1) +; CHECK-NEXT: vl1re32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv8f16_nxv4f16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; ZIP-NEXT: vmv1r.v v10, v9 -; ZIP-NEXT: vmv1r.v v11, v8 -; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10 -; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv8f16( %a, %b) - ret %res +; ZVBB-LABEL: vector_interleave_nxv8f32_nxv2f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; ZVBB-NEXT: vsseg4e32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v10, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1re32.v v11, (a1) +; ZVBB-NEXT: vl1re32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave4.nxv8f32( %v0, %v1, %v2, %v3) + ret %res } -define @vector_interleave_nxv4f32_nxv2f32( %a, %b) { -; V-LABEL: vector_interleave_nxv4f32_nxv2f32: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; V-NEXT: 
vmv1r.v v10, v9 -; V-NEXT: vmv1r.v v11, v8 -; V-NEXT: vwaddu.vv v8, v11, v10 -; V-NEXT: li a0, -1 -; V-NEXT: vwmaccu.vx v8, a0, v10 -; V-NEXT: ret +define @vector_interleave_nxv16f32_nxv4f32( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv16f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vl2re32.v v12, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl2re32.v v14, (a1) +; CHECK-NEXT: vl2re32.v v8, (a0) +; CHECK-NEXT: vl2re32.v v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32: +; ZVBB-LABEL: vector_interleave_nxv16f32_nxv4f32: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZVBB-NEXT: vmv1r.v v10, v9 -; ZVBB-NEXT: vmv1r.v v11, v8 -; ZVBB-NEXT: li a0, 32 -; ZVBB-NEXT: vwsll.vx v8, v10, a0 -; ZVBB-NEXT: vwaddu.wv v8, v8, v11 +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e32, m2, ta, ma +; ZVBB-NEXT: vsseg4e32.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vl2re32.v v12, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl2re32.v v14, (a1) +; ZVBB-NEXT: vl2re32.v v8, (a0) +; ZVBB-NEXT: vl2re32.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret -; -; ZIP-LABEL: vector_interleave_nxv4f32_nxv2f32: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; ZIP-NEXT: vmv1r.v v10, v9 -; ZIP-NEXT: vmv1r.v v11, v8 -; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10 -; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv4f32( %a, %b) - ret %res + %res = call @llvm.vector.interleave4.nxv16f32( %v0, %v1, %v2, %v3) + ret %res } -define @vector_interleave_nxv16bf16_nxv8bf16( %a, %b) { -; V-LABEL: vector_interleave_nxv16bf16_nxv8bf16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; V-NEXT: vmv2r.v v12, v10 -; V-NEXT: vmv2r.v v14, v8 -; V-NEXT: vwaddu.vv v8, v14, v12 -; V-NEXT: li a0, -1 -; V-NEXT: vwmaccu.vx v8, a0, v12 -; V-NEXT: ret +define @vector_interleave_nxv4f64_nxv1f64( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv4f64_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; CHECK-NEXT: vsseg4e64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v10, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1re64.v v11, (a1) +; CHECK-NEXT: vl1re64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv8bf16: +; ZVBB-LABEL: 
vector_interleave_nxv4f64_nxv1f64: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVBB-NEXT: vmv2r.v v12, v10 -; ZVBB-NEXT: vmv2r.v v14, v8 -; ZVBB-NEXT: vwsll.vi v8, v12, 16 -; ZVBB-NEXT: vwaddu.wv v8, v8, v14 +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; ZVBB-NEXT: vsseg4e64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v10, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1re64.v v11, (a1) +; ZVBB-NEXT: vl1re64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret -; -; ZIP-LABEL: vector_interleave_nxv16bf16_nxv8bf16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZIP-NEXT: vmv2r.v v12, v10 -; ZIP-NEXT: vmv2r.v v14, v8 -; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12 -; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv16bf16( %a, %b) - ret %res + %res = call @llvm.vector.interleave4.nxv4f64( %v0, %v1, %v2, %v3) + ret %res } -define @vector_interleave_nxv16f16_nxv8f16( %a, %b) { -; V-LABEL: vector_interleave_nxv16f16_nxv8f16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; V-NEXT: vmv2r.v v12, v10 -; V-NEXT: vmv2r.v v14, v8 -; V-NEXT: vwaddu.vv v8, v14, v12 -; V-NEXT: li a0, -1 -; V-NEXT: vwmaccu.vx v8, a0, v12 -; V-NEXT: ret +define @vector_interleave_nxv8f64_nxv2f64( %v0, %v1, %v2, %v3) nounwind { +; CHECK-LABEL: vector_interleave_nxv8f64_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vsetvli a3, zero, e64, m2, ta, ma +; CHECK-NEXT: vsseg4e64.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vl2re64.v v12, (a3) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl2re64.v v14, (a1) +; CHECK-NEXT: vl2re64.v v8, (a0) +; CHECK-NEXT: vl2re64.v v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16: +; ZVBB-LABEL: vector_interleave_nxv8f64_nxv2f64: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZVBB-NEXT: vmv2r.v v12, v10 -; ZVBB-NEXT: vmv2r.v v14, v8 -; ZVBB-NEXT: vwsll.vi v8, v12, 16 -; ZVBB-NEXT: vwaddu.wv v8, v8, v14 +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 1 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vsetvli a3, zero, e64, m2, ta, ma +; ZVBB-NEXT: vsseg4e64.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vl2re64.v v12, (a3) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl2re64.v v14, (a1) +; ZVBB-NEXT: vl2re64.v v8, (a0) +; ZVBB-NEXT: vl2re64.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret -; -; ZIP-LABEL: vector_interleave_nxv16f16_nxv8f16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma -; ZIP-NEXT: vmv2r.v v12, v10 -; ZIP-NEXT: vmv2r.v v14, v8 -; ZIP-NEXT: ri.vzip2b.vv 
v10, v8, v12 -; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv16f16( %a, %b) - ret %res + %res = call @llvm.vector.interleave4.nxv8f64( %v0, %v1, %v2, %v3) + ret %res } -define @vector_interleave_nxv8f32_nxv4f32( %a, %b) { -; V-LABEL: vector_interleave_nxv8f32_nxv4f32: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; V-NEXT: vmv2r.v v12, v10 -; V-NEXT: vmv2r.v v14, v8 -; V-NEXT: vwaddu.vv v8, v14, v12 -; V-NEXT: li a0, -1 -; V-NEXT: vwmaccu.vx v8, a0, v12 -; V-NEXT: ret +define @vector_interleave_nxv10f16_nxv2f16( %v0, %v1, %v2, %v3, %v4) nounwind { +; CHECK-LABEL: vector_interleave_nxv10f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg5e16.v v8, (a0) +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: vle16.v v8, (a5) +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a4, a1, a1 +; CHECK-NEXT: vle16.v v10, (a3) +; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: add a2, a5, a2 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32: +; ZVBB-LABEL: vector_interleave_nxv10f16_nxv2f16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZVBB-NEXT: vmv2r.v v12, v10 -; ZVBB-NEXT: vmv2r.v v14, v8 -; ZVBB-NEXT: li a0, 32 -; ZVBB-NEXT: vwsll.vx v8, v12, a0 -; ZVBB-NEXT: vwaddu.wv v8, v8, v14 +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg5e16.v v8, (a0) +; ZVBB-NEXT: add a5, a4, a2 +; ZVBB-NEXT: vle16.v v8, (a5) +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a4, a1, a1 +; ZVBB-NEXT: vle16.v v10, (a3) +; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v10, a1 +; ZVBB-NEXT: add a2, a5, a2 +; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave5.nxv10f16( %v0, %v1, %v2, %v3, %v4) + ret %res +} + +define @vector_interleave_nxv20f16_nxv4f16( %v0, %v1, %v2, %v3, %v4) nounwind { +; CHECK-LABEL: vector_interleave_nxv20f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; 
CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; CHECK-NEXT: vsseg5e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v10, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1re16.v v11, (a3) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1re16.v v12, (a1) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv8f32_nxv4f32: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; ZIP-NEXT: vmv2r.v v12, v10 -; ZIP-NEXT: vmv2r.v v14, v8 -; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12 -; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv8f32( %a, %b) - ret %res -} - -define @vector_interleave_nxv4f64_nxv2f64( %a, %b) { -; V-LABEL: vector_interleave_nxv4f64_nxv2f64: -; V: # %bb.0: -; V-NEXT: csrr a0, vlenb -; V-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; V-NEXT: vid.v v12 -; V-NEXT: srli a0, a0, 2 -; V-NEXT: vand.vi v13, v12, 1 -; V-NEXT: vmsne.vi v0, v13, 0 -; V-NEXT: vsrl.vi v16, v12, 1 -; V-NEXT: vadd.vx v16, v16, a0, v0.t -; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; V-NEXT: vrgatherei16.vv v12, v8, v16 -; V-NEXT: vmv.v.v v8, v12 -; V-NEXT: ret -; -; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64: +; ZVBB-LABEL: vector_interleave_nxv20f16_nxv4f16: ; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; ZVBB-NEXT: vid.v v12 -; ZVBB-NEXT: srli a0, a0, 2 -; ZVBB-NEXT: vand.vi v13, v12, 1 -; ZVBB-NEXT: vmsne.vi v0, v13, 0 -; ZVBB-NEXT: vsrl.vi v16, v12, 1 -; ZVBB-NEXT: vadd.vx v16, v16, a0, v0.t -; ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16 -; ZVBB-NEXT: vmv.v.v v8, v12 +; ZVBB-NEXT: slli a1, a0, 2 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg5e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v10, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1re16.v v11, (a3) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1re16.v v12, (a1) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 2 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret -; -; ZIP-LABEL: vector_interleave_nxv4f64_nxv2f64: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; ZIP-NEXT: vmv2r.v v12, v10 -; ZIP-NEXT: vmv2r.v v14, v8 -; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12 -; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv4f64( %a, %b) - ret %res + %res = call @llvm.vector.interleave5.nxv20f16( %v0, %v1, %v2, %v3, %v4) + ret %res } - - -define @vector_interleave_nxv64bf16_nxv32bf16( %a, %b) { -; V-LABEL: vector_interleave_nxv64bf16_nxv32bf16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; V-NEXT: vmv8r.v v24, v8 -; V-NEXT: vwaddu.vv v8, v24, v16 -; V-NEXT: li a0, -1 -; V-NEXT: vwaddu.vv v0, v28, v20 -; V-NEXT: vwmaccu.vx v8, a0, v16 -; V-NEXT: vwmaccu.vx v0, a0, v20 -; V-NEXT: vmv8r.v v16, v0 -; V-NEXT: ret +define 
@vector_interleave_nxv40f16_nxv8f16( %v0, %v1, %v2, %v3, %v4) nounwind { +; RV32-LABEL: vector_interleave_nxv40f16_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v16 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: vmv2r.v v22, v16 +; RV32-NEXT: vmv2r.v v24, v18 +; RV32-NEXT: vmv1r.v v26, v20 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v23, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vmv1r.v v25, v14 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v18, v11 +; RV32-NEXT: vsseg5e16.v v22, (a0) +; RV32-NEXT: vmv1r.v v20, v15 +; RV32-NEXT: vsseg5e16.v v17, (a1) +; RV32-NEXT: vl1re16.v v16, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v17, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re16.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v11, (a6) +; RV32-NEXT: vl1re16.v v8, (a0) +; RV32-NEXT: vl1re16.v v9, (a3) +; RV32-NEXT: vl1re16.v v14, (a4) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 10 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v15, (a5) +; RV32-NEXT: vl1re16.v v12, (a6) +; RV32-NEXT: vl1re16.v v13, (a1) +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs2r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re16.v v16, (a2) +; RV32-NEXT: vl8re16.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv40f16_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v16 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: vmv2r.v v22, v16 +; RV64-NEXT: vmv2r.v v24, v18 +; RV64-NEXT: vmv1r.v v26, v20 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v23, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vmv1r.v v25, v14 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v18, v11 +; RV64-NEXT: vsseg5e16.v v22, (a0) +; RV64-NEXT: vmv1r.v v20, v15 +; RV64-NEXT: vsseg5e16.v v17, (a1) +; RV64-NEXT: vl1re16.v v16, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v17, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re16.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v11, (a6) +; RV64-NEXT: vl1re16.v v8, (a0) +; RV64-NEXT: 
vl1re16.v v9, (a3) +; RV64-NEXT: vl1re16.v v14, (a4) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 10 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v15, (a5) +; RV64-NEXT: vl1re16.v v12, (a6) +; RV64-NEXT: vl1re16.v v13, (a1) +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs2r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re16.v v16, (a2) +; RV64-NEXT: vl8re16.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv40f16_nxv8f16: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v16 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 2 +; ZVBB-RV32-NEXT: add a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: vmv2r.v v22, v16 +; ZVBB-RV32-NEXT: vmv2r.v v24, v18 +; ZVBB-RV32-NEXT: vmv1r.v v26, v20 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v23, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v25, v14 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v18, v11 +; ZVBB-RV32-NEXT: vsseg5e16.v v22, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v20, v15 +; ZVBB-RV32-NEXT: vsseg5e16.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re16.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v17, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re16.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re16.v v14, (a4) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 10 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v15, (a5) +; ZVBB-RV32-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v13, (a1) +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vs2r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv32bf16: -; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVBB-NEXT: vwsll.vi v24, v16, 16 -; ZVBB-NEXT: vwsll.vi v0, v20, 16 -; ZVBB-NEXT: vwaddu.wv v24, v24, v8 -; ZVBB-NEXT: vwaddu.wv v0, v0, v12 -; ZVBB-NEXT: vmv8r.v v8, v24 -; ZVBB-NEXT: vmv8r.v v16, v0 -; ZVBB-NEXT: ret +; ZVBB-RV64-LABEL: vector_interleave_nxv40f16_nxv8f16: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, 
sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v16 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 2 +; ZVBB-RV64-NEXT: add a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: vmv2r.v v22, v16 +; ZVBB-RV64-NEXT: vmv2r.v v24, v18 +; ZVBB-RV64-NEXT: vmv1r.v v26, v20 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v23, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v25, v14 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v18, v11 +; ZVBB-RV64-NEXT: vsseg5e16.v v22, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v20, v15 +; ZVBB-RV64-NEXT: vsseg5e16.v v17, (a1) +; ZVBB-RV64-NEXT: vl1re16.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v17, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re16.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re16.v v14, (a4) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 10 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v15, (a5) +; ZVBB-RV64-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v13, (a1) +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vs2r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv64bf16_nxv32bf16: +; ZIP-LABEL: vector_interleave_nxv40f16_nxv8f16: ; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16 -; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20 -; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16 -; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20 -; ZIP-NEXT: vmv8r.v v8, v24 -; ZIP-NEXT: vmv8r.v v16, v0 +; ZIP-NEXT: addi sp, sp, -80 +; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZIP-NEXT: addi s0, sp, 80 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a1, 28 +; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: andi sp, sp, -64 +; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZIP-NEXT: vmv2r.v v20, v16 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv2r.v v18, v12 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a2, a1, 2 +; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: vmv2r.v v16, v8 +; ZIP-NEXT: vmv2r.v v22, v16 +; ZIP-NEXT: vmv2r.v v24, v18 +; ZIP-NEXT: vmv1r.v v26, v20 +; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: vmv1r.v v23, v10 +; ZIP-NEXT: add a4, a1, a2 +; ZIP-NEXT: add a5, a4, a2 +; ZIP-NEXT: vmv1r.v v25, v14 +; 
ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vmv1r.v v18, v11 +; ZIP-NEXT: vsseg5e16.v v22, (a0) +; ZIP-NEXT: vmv1r.v v20, v15 +; ZIP-NEXT: vsseg5e16.v v17, (a1) +; ZIP-NEXT: vl1re16.v v16, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v17, (a6) +; ZIP-NEXT: add a6, a3, a2 +; ZIP-NEXT: vl1re16.v v10, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v11, (a6) +; ZIP-NEXT: vl1re16.v v8, (a0) +; ZIP-NEXT: vl1re16.v v9, (a3) +; ZIP-NEXT: vl1re16.v v14, (a4) +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a3, 10 +; ZIP-NEXT: mul a0, a0, a3 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 64 +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v15, (a5) +; ZIP-NEXT: vl1re16.v v12, (a6) +; ZIP-NEXT: vl1re16.v v13, (a1) +; ZIP-NEXT: slli a2, a2, 3 +; ZIP-NEXT: add a2, a0, a2 +; ZIP-NEXT: vs2r.v v16, (a2) +; ZIP-NEXT: vs8r.v v8, (a0) +; ZIP-NEXT: vl8re16.v v16, (a2) +; ZIP-NEXT: vl8re16.v v8, (a0) +; ZIP-NEXT: addi sp, s0, -80 +; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZIP-NEXT: addi sp, sp, 80 ; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv64bf16( %a, %b) - ret %res + %res = call @llvm.vector.interleave5.nxv40f16( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv64f16_nxv32f16( %a, %b) { -; V-LABEL: vector_interleave_nxv64f16_nxv32f16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; V-NEXT: vmv8r.v v24, v8 -; V-NEXT: vwaddu.vv v8, v24, v16 -; V-NEXT: li a0, -1 -; V-NEXT: vwaddu.vv v0, v28, v20 -; V-NEXT: vwmaccu.vx v8, a0, v16 -; V-NEXT: vwmaccu.vx v0, a0, v20 -; V-NEXT: vmv8r.v v16, v0 -; V-NEXT: ret +define @vector_interleave_nxv10bf16_nxv2bf16( %v0, %v1, %v2, %v3, %v4) nounwind { +; CHECK-LABEL: vector_interleave_nxv10bf16_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg5e16.v v8, (a0) +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: vle16.v v8, (a5) +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a4, a1, a1 +; CHECK-NEXT: vle16.v v10, (a3) +; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: add a2, a5, a2 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16: +; ZVBB-LABEL: vector_interleave_nxv10bf16_nxv2bf16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVBB-NEXT: vwsll.vi v24, v16, 16 -; ZVBB-NEXT: vwsll.vi v0, v20, 16 -; ZVBB-NEXT: vwaddu.wv v24, v24, v8 -; ZVBB-NEXT: vwaddu.wv v0, v0, v12 -; ZVBB-NEXT: vmv8r.v v8, v24 -; ZVBB-NEXT: vmv8r.v v16, v0 +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: add a3, a0, a2 +; 
ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg5e16.v v8, (a0) +; ZVBB-NEXT: add a5, a4, a2 +; ZVBB-NEXT: vle16.v v8, (a5) +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a4, a1, a1 +; ZVBB-NEXT: vle16.v v10, (a3) +; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v10, a1 +; ZVBB-NEXT: add a2, a5, a2 +; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v10, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret -; -; ZIP-LABEL: vector_interleave_nxv64f16_nxv32f16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16 -; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20 -; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16 -; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20 -; ZIP-NEXT: vmv8r.v v8, v24 -; ZIP-NEXT: vmv8r.v v16, v0 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv64f16( %a, %b) - ret %res + %res = call @llvm.vector.interleave5.nxv10bf16( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv32f32_nxv16f32( %a, %b) { -; V-LABEL: vector_interleave_nxv32f32_nxv16f32: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; V-NEXT: vmv8r.v v24, v8 -; V-NEXT: vwaddu.vv v8, v24, v16 -; V-NEXT: li a0, -1 -; V-NEXT: vwaddu.vv v0, v28, v20 -; V-NEXT: vwmaccu.vx v8, a0, v16 -; V-NEXT: vwmaccu.vx v0, a0, v20 -; V-NEXT: vmv8r.v v16, v0 -; V-NEXT: ret +define @vector_interleave_nxv20bf16_nxv4bf16( %v0, %v1, %v2, %v3, %v4) nounwind { +; CHECK-LABEL: vector_interleave_nxv20bf16_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; CHECK-NEXT: vsseg5e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v10, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1re16.v v11, (a3) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1re16.v v12, (a1) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv32f32_nxv16f32: +; ZVBB-LABEL: vector_interleave_nxv20bf16_nxv4bf16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: li a0, 32 -; ZVBB-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; ZVBB-NEXT: vwsll.vx v24, v16, a0 -; ZVBB-NEXT: vwsll.vx v0, v20, a0 -; ZVBB-NEXT: vwaddu.wv v24, v24, v8 -; ZVBB-NEXT: vwaddu.wv v0, v0, v12 -; ZVBB-NEXT: vmv8r.v v8, v24 -; ZVBB-NEXT: vmv8r.v v16, v0 +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 2 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg5e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v10, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1re16.v v11, (a3) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: 
add a1, a3, a1 +; ZVBB-NEXT: vl1re16.v v12, (a1) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a1, a0, 2 +; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret -; -; ZIP-LABEL: vector_interleave_nxv32f32_nxv16f32: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16 -; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20 -; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16 -; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20 -; ZIP-NEXT: vmv8r.v v8, v24 -; ZIP-NEXT: vmv8r.v v16, v0 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv32f32( %a, %b) - ret %res + %res = call @llvm.vector.interleave5.nxv20bf16( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv16f64_nxv8f64( %a, %b) { -; V-LABEL: vector_interleave_nxv16f64_nxv8f64: -; V: # %bb.0: -; V-NEXT: csrr a0, vlenb -; V-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; V-NEXT: vid.v v6 -; V-NEXT: vmv8r.v v24, v8 -; V-NEXT: srli a0, a0, 1 -; V-NEXT: vmv4r.v v28, v16 -; V-NEXT: vmv4r.v v16, v12 -; V-NEXT: vand.vi v8, v6, 1 -; V-NEXT: vmsne.vi v0, v8, 0 -; V-NEXT: vsrl.vi v6, v6, 1 -; V-NEXT: vadd.vx v6, v6, a0, v0.t -; V-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; V-NEXT: vrgatherei16.vv v8, v24, v6 -; V-NEXT: vrgatherei16.vv v24, v16, v6 -; V-NEXT: vmv.v.v v16, v24 -; V-NEXT: ret +define @vector_interleave_nxv40bf16_nxv8bf16( %v0, %v1, %v2, %v3, %v4) nounwind { +; RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v16 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: vmv2r.v v22, v16 +; RV32-NEXT: vmv2r.v v24, v18 +; RV32-NEXT: vmv1r.v v26, v20 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v23, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vmv1r.v v25, v14 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v18, v11 +; RV32-NEXT: vsseg5e16.v v22, (a0) +; RV32-NEXT: vmv1r.v v20, v15 +; RV32-NEXT: vsseg5e16.v v17, (a1) +; RV32-NEXT: vl1re16.v v16, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v17, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re16.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v11, (a6) +; RV32-NEXT: vl1re16.v v8, (a0) +; RV32-NEXT: vl1re16.v v9, (a3) +; RV32-NEXT: vl1re16.v v14, (a4) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 10 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v15, (a5) +; RV32-NEXT: vl1re16.v v12, (a6) +; RV32-NEXT: vl1re16.v v13, (a1) +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs2r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re16.v v16, (a2) +; RV32-NEXT: vl8re16.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16: 
+; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v16 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: vmv2r.v v22, v16 +; RV64-NEXT: vmv2r.v v24, v18 +; RV64-NEXT: vmv1r.v v26, v20 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v23, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vmv1r.v v25, v14 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v18, v11 +; RV64-NEXT: vsseg5e16.v v22, (a0) +; RV64-NEXT: vmv1r.v v20, v15 +; RV64-NEXT: vsseg5e16.v v17, (a1) +; RV64-NEXT: vl1re16.v v16, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v17, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re16.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v11, (a6) +; RV64-NEXT: vl1re16.v v8, (a0) +; RV64-NEXT: vl1re16.v v9, (a3) +; RV64-NEXT: vl1re16.v v14, (a4) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 10 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v15, (a5) +; RV64-NEXT: vl1re16.v v12, (a6) +; RV64-NEXT: vl1re16.v v13, (a1) +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs2r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re16.v v16, (a2) +; RV64-NEXT: vl8re16.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64: -; ZVBB: # %bb.0: -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; ZVBB-NEXT: vid.v v6 -; ZVBB-NEXT: vmv8r.v v24, v8 -; ZVBB-NEXT: srli a0, a0, 1 -; ZVBB-NEXT: vmv4r.v v28, v16 -; ZVBB-NEXT: vmv4r.v v16, v12 -; ZVBB-NEXT: vand.vi v8, v6, 1 -; ZVBB-NEXT: vmsne.vi v0, v8, 0 -; ZVBB-NEXT: vsrl.vi v6, v6, 1 -; ZVBB-NEXT: vadd.vx v6, v6, a0, v0.t -; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; ZVBB-NEXT: vrgatherei16.vv v8, v24, v6 -; ZVBB-NEXT: vrgatherei16.vv v24, v16, v6 -; ZVBB-NEXT: vmv.v.v v16, v24 -; ZVBB-NEXT: ret +; ZVBB-RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v16 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 2 +; ZVBB-RV32-NEXT: add a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: vmv2r.v v22, v16 +; ZVBB-RV32-NEXT: vmv2r.v v24, v18 +; 
ZVBB-RV32-NEXT: vmv1r.v v26, v20 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v23, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v25, v14 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v18, v11 +; ZVBB-RV32-NEXT: vsseg5e16.v v22, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v20, v15 +; ZVBB-RV32-NEXT: vsseg5e16.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re16.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v17, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re16.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re16.v v14, (a4) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 10 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v15, (a5) +; ZVBB-RV32-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v13, (a1) +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vs2r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv16f64_nxv8f64: +; ZVBB-RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v16 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 2 +; ZVBB-RV64-NEXT: add a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: vmv2r.v v22, v16 +; ZVBB-RV64-NEXT: vmv2r.v v24, v18 +; ZVBB-RV64-NEXT: vmv1r.v v26, v20 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v23, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v25, v14 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v18, v11 +; ZVBB-RV64-NEXT: vsseg5e16.v v22, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v20, v15 +; ZVBB-RV64-NEXT: vsseg5e16.v v17, (a1) +; ZVBB-RV64-NEXT: vl1re16.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v17, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re16.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re16.v v14, (a4) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 10 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v15, (a5) +; ZVBB-RV64-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v13, (a1) +; 
ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vs2r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re16.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv40bf16_nxv8bf16: ; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16 -; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20 -; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16 -; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20 -; ZIP-NEXT: vmv8r.v v8, v24 -; ZIP-NEXT: vmv8r.v v16, v0 +; ZIP-NEXT: addi sp, sp, -80 +; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZIP-NEXT: addi s0, sp, 80 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a1, 28 +; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: andi sp, sp, -64 +; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZIP-NEXT: vmv2r.v v20, v16 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv2r.v v18, v12 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a2, a1, 2 +; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: vmv2r.v v16, v8 +; ZIP-NEXT: vmv2r.v v22, v16 +; ZIP-NEXT: vmv2r.v v24, v18 +; ZIP-NEXT: vmv1r.v v26, v20 +; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: vmv1r.v v23, v10 +; ZIP-NEXT: add a4, a1, a2 +; ZIP-NEXT: add a5, a4, a2 +; ZIP-NEXT: vmv1r.v v25, v14 +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vmv1r.v v18, v11 +; ZIP-NEXT: vsseg5e16.v v22, (a0) +; ZIP-NEXT: vmv1r.v v20, v15 +; ZIP-NEXT: vsseg5e16.v v17, (a1) +; ZIP-NEXT: vl1re16.v v16, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v17, (a6) +; ZIP-NEXT: add a6, a3, a2 +; ZIP-NEXT: vl1re16.v v10, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v11, (a6) +; ZIP-NEXT: vl1re16.v v8, (a0) +; ZIP-NEXT: vl1re16.v v9, (a3) +; ZIP-NEXT: vl1re16.v v14, (a4) +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a3, 10 +; ZIP-NEXT: mul a0, a0, a3 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 64 +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v15, (a5) +; ZIP-NEXT: vl1re16.v v12, (a6) +; ZIP-NEXT: vl1re16.v v13, (a1) +; ZIP-NEXT: slli a2, a2, 3 +; ZIP-NEXT: add a2, a0, a2 +; ZIP-NEXT: vs2r.v v16, (a2) +; ZIP-NEXT: vs8r.v v8, (a0) +; ZIP-NEXT: vl8re16.v v16, (a2) +; ZIP-NEXT: vl8re16.v v8, (a0) +; ZIP-NEXT: addi sp, s0, -80 +; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZIP-NEXT: addi sp, sp, 80 ; ZIP-NEXT: ret - %res = call @llvm.vector.interleave2.nxv16f64( %a, %b) - ret %res + %res = call @llvm.vector.interleave5.nxv40bf16( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv6f16_nxv2f16( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv6f16_nxv2f16: +define @vector_interleave_nxv5f32_nxv1f32( %v0, %v1, %v2, %v3, %v4) nounwind { +; CHECK-LABEL: vector_interleave_nxv5f32_nxv1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 1 -; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsseg3e16.v v8, (a0) ; CHECK-NEXT: add a3, a0, a2 -; CHECK-NEXT: vle16.v v9, (a3) -; 
CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: add a0, a1, a1 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a1 -; CHECK-NEXT: add a2, a3, a2 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a0) +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: vle32.v v8, (a5) +; CHECK-NEXT: vle32.v v9, (a4) +; CHECK-NEXT: srli a1, a1, 3 +; CHECK-NEXT: add a4, a1, a1 +; CHECK-NEXT: vle32.v v10, (a3) +; CHECK-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: add a2, a5, a2 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v10, (a2) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv6f16_nxv2f16: +; ZVBB-LABEL: vector_interleave_nxv5f32_nxv1f32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: srli a2, a1, 1 -; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma -; ZVBB-NEXT: vsseg3e16.v v8, (a0) ; ZVBB-NEXT: add a3, a0, a2 -; ZVBB-NEXT: vle16.v v9, (a3) -; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: srli a1, a1, 2 -; ZVBB-NEXT: add a0, a1, a1 -; ZVBB-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v8, v9, a1 -; ZVBB-NEXT: add a2, a3, a2 -; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVBB-NEXT: vle16.v v9, (a2) +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vsseg5e32.v v8, (a0) +; ZVBB-NEXT: add a5, a4, a2 +; ZVBB-NEXT: vle32.v v8, (a5) +; ZVBB-NEXT: vle32.v v9, (a4) +; ZVBB-NEXT: srli a1, a1, 3 +; ZVBB-NEXT: add a4, a1, a1 +; ZVBB-NEXT: vle32.v v10, (a3) +; ZVBB-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a4, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v10, a1 +; ZVBB-NEXT: add a2, a5, a2 +; ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vle32.v v10, (a2) ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: add a0, a1, a0 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv6f16( %v0, %v1, %v2) - ret %res + %res = call @llvm.vector.interleave5.nxv5f32( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv12f16_nxv4f16( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv12f16_nxv4f16: +define @vector_interleave_nxv10f32_nxv2f32( %v0, %v1, %v2, %v3, %v4) nounwind { +; CHECK-LABEL: vector_interleave_nxv10f32_nxv2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: slli a1, a0, 2 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vsseg3e16.v v8, 
(a0) -; CHECK-NEXT: vl1re16.v v8, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl1re16.v v9, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl1re16.v v10, (a0) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v10, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1re32.v v11, (a3) +; CHECK-NEXT: vl1re32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v9, (a2) +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vl1re32.v v12, (a1) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: slli a1, a0, 2 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv12f16_nxv4f16: +; ZVBB-LABEL: vector_interleave_nxv10f32_nxv2f32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: slli a1, a0, 2 ; ZVBB-NEXT: add a0, a1, a0 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVBB-NEXT: vsseg3e16.v v8, (a0) -; ZVBB-NEXT: vl1re16.v v8, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl1re16.v v9, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl1re16.v v10, (a0) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; ZVBB-NEXT: vsseg5e32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v10, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1re32.v v11, (a3) +; ZVBB-NEXT: vl1re32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v9, (a2) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1re32.v v12, (a1) ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: slli a1, a0, 2 ; ZVBB-NEXT: add a0, a1, a0 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv12f16( %v0, %v1, %v2) - ret %res + %res = call @llvm.vector.interleave5.nxv10f32( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv24f16_nxv8f16( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv24f16_nxv8f16: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vsseg3e16.v v8, (a0) -; CHECK-NEXT: vl2re16.v v8, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl2re16.v v10, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl2re16.v v12, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +define @vector_interleave_nxv20f32_nxv4f32( %v0, %v1, %v2, %v3, %v4) nounwind { +; RV32-LABEL: vector_interleave_nxv20f32_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v16 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; 
RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: vmv2r.v v22, v16 +; RV32-NEXT: vmv2r.v v24, v18 +; RV32-NEXT: vmv1r.v v26, v20 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v23, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vmv1r.v v25, v14 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v18, v11 +; RV32-NEXT: vsseg5e32.v v22, (a0) +; RV32-NEXT: vmv1r.v v20, v15 +; RV32-NEXT: vsseg5e32.v v17, (a1) +; RV32-NEXT: vl1re32.v v16, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v17, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re32.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v11, (a6) +; RV32-NEXT: vl1re32.v v8, (a0) +; RV32-NEXT: vl1re32.v v9, (a3) +; RV32-NEXT: vl1re32.v v14, (a4) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 10 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re32.v v15, (a5) +; RV32-NEXT: vl1re32.v v12, (a6) +; RV32-NEXT: vl1re32.v v13, (a1) +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs2r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re32.v v16, (a2) +; RV32-NEXT: vl8re32.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv20f32_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v16 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: vmv2r.v v22, v16 +; RV64-NEXT: vmv2r.v v24, v18 +; RV64-NEXT: vmv1r.v v26, v20 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v23, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vmv1r.v v25, v14 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v18, v11 +; RV64-NEXT: vsseg5e32.v v22, (a0) +; RV64-NEXT: vmv1r.v v20, v15 +; RV64-NEXT: vsseg5e32.v v17, (a1) +; RV64-NEXT: vl1re32.v v16, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v17, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re32.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v11, (a6) +; RV64-NEXT: vl1re32.v v8, (a0) +; RV64-NEXT: vl1re32.v v9, (a3) +; RV64-NEXT: vl1re32.v v14, (a4) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 10 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re32.v v15, (a5) +; RV64-NEXT: vl1re32.v v12, (a6) +; RV64-NEXT: vl1re32.v v13, (a1) +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs2r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re32.v v16, (a2) +; RV64-NEXT: vl8re32.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret ; -; 
ZVBB-LABEL: vector_interleave_nxv24f16_nxv8f16: -; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 6 -; ZVBB-NEXT: mul a0, a0, a1 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: slli a1, a1, 1 -; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVBB-NEXT: vsseg3e16.v v8, (a0) -; ZVBB-NEXT: vl2re16.v v8, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl2re16.v v10, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl2re16.v v12, (a0) -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 6 -; ZVBB-NEXT: mul a0, a0, a1 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv24f16( %v0, %v1, %v2) - ret %res -} - -define @vector_interleave_nxv6bf16_nxv2bf16( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv6bf16_nxv2bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 1 -; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsseg3e16.v v8, (a0) -; CHECK-NEXT: add a3, a0, a2 -; CHECK-NEXT: vle16.v v9, (a3) -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: add a0, a1, a1 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a1 -; CHECK-NEXT: add a2, a3, a2 -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a2) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; ZVBB-RV32-LABEL: vector_interleave_nxv20f32_nxv4f32: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v16 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 2 +; ZVBB-RV32-NEXT: add a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: vmv2r.v v22, v16 +; ZVBB-RV32-NEXT: vmv2r.v v24, v18 +; ZVBB-RV32-NEXT: vmv1r.v v26, v20 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v23, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v25, v14 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v18, v11 +; ZVBB-RV32-NEXT: vsseg5e32.v v22, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v20, v15 +; ZVBB-RV32-NEXT: vsseg5e32.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re32.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v17, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re32.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re32.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re32.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re32.v v14, (a4) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 10 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; 
ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re32.v v15, (a5) +; ZVBB-RV32-NEXT: vl1re32.v v12, (a6) +; ZVBB-RV32-NEXT: vl1re32.v v13, (a1) +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vs2r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re32.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv6bf16_nxv2bf16: -; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 1 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: srli a2, a1, 1 -; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma -; ZVBB-NEXT: vsseg3e16.v v8, (a0) -; ZVBB-NEXT: add a3, a0, a2 -; ZVBB-NEXT: vle16.v v9, (a3) -; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: srli a1, a1, 2 -; ZVBB-NEXT: add a0, a1, a1 -; ZVBB-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v8, v9, a1 -; ZVBB-NEXT: add a2, a3, a2 -; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; ZVBB-NEXT: vle16.v v9, (a2) -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 1 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv6bf16( %v0, %v1, %v2) - ret %res +; ZVBB-RV64-LABEL: vector_interleave_nxv20f32_nxv4f32: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v16 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 2 +; ZVBB-RV64-NEXT: add a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: vmv2r.v v22, v16 +; ZVBB-RV64-NEXT: vmv2r.v v24, v18 +; ZVBB-RV64-NEXT: vmv1r.v v26, v20 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v23, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v25, v14 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v18, v11 +; ZVBB-RV64-NEXT: vsseg5e32.v v22, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v20, v15 +; ZVBB-RV64-NEXT: vsseg5e32.v v17, (a1) +; ZVBB-RV64-NEXT: vl1re32.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v17, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re32.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re32.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re32.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re32.v v14, (a4) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 10 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re32.v v15, (a5) +; ZVBB-RV64-NEXT: vl1re32.v v12, (a6) +; ZVBB-RV64-NEXT: vl1re32.v v13, (a1) +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; 
ZVBB-RV64-NEXT: vs2r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re32.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re32.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv20f32_nxv4f32: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -80 +; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZIP-NEXT: addi s0, sp, 80 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a1, 28 +; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: andi sp, sp, -64 +; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZIP-NEXT: vmv2r.v v20, v16 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv2r.v v18, v12 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a2, a1, 2 +; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: vmv2r.v v16, v8 +; ZIP-NEXT: vmv2r.v v22, v16 +; ZIP-NEXT: vmv2r.v v24, v18 +; ZIP-NEXT: vmv1r.v v26, v20 +; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: vmv1r.v v23, v10 +; ZIP-NEXT: add a4, a1, a2 +; ZIP-NEXT: add a5, a4, a2 +; ZIP-NEXT: vmv1r.v v25, v14 +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vmv1r.v v18, v11 +; ZIP-NEXT: vsseg5e32.v v22, (a0) +; ZIP-NEXT: vmv1r.v v20, v15 +; ZIP-NEXT: vsseg5e32.v v17, (a1) +; ZIP-NEXT: vl1re32.v v16, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v17, (a6) +; ZIP-NEXT: add a6, a3, a2 +; ZIP-NEXT: vl1re32.v v10, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v11, (a6) +; ZIP-NEXT: vl1re32.v v8, (a0) +; ZIP-NEXT: vl1re32.v v9, (a3) +; ZIP-NEXT: vl1re32.v v14, (a4) +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a3, 10 +; ZIP-NEXT: mul a0, a0, a3 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 64 +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re32.v v15, (a5) +; ZIP-NEXT: vl1re32.v v12, (a6) +; ZIP-NEXT: vl1re32.v v13, (a1) +; ZIP-NEXT: slli a2, a2, 3 +; ZIP-NEXT: add a2, a0, a2 +; ZIP-NEXT: vs2r.v v16, (a2) +; ZIP-NEXT: vs8r.v v8, (a0) +; ZIP-NEXT: vl8re32.v v16, (a2) +; ZIP-NEXT: vl8re32.v v8, (a0) +; ZIP-NEXT: addi sp, s0, -80 +; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZIP-NEXT: addi sp, sp, 80 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave5.nxv20f32( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv12bf16_nxv4bf16( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv12bf16_nxv4bf16: +define @vector_interleave_nxv5f64_nxv1f64( %v0, %v1, %v2, %v3, %v4) nounwind { +; CHECK-LABEL: vector_interleave_nxv5f64_nxv1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: slli a1, a0, 2 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vsseg3e16.v v8, (a0) -; CHECK-NEXT: vl1re16.v v8, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl1re16.v v9, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl1re16.v v10, (a0) +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; CHECK-NEXT: vsseg5e64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v10, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1re64.v v11, (a3) +; CHECK-NEXT: vl1re64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v9, (a2) +; CHECK-NEXT: add 
a1, a3, a1 +; CHECK-NEXT: vl1re64.v v12, (a1) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: slli a1, a0, 2 ; CHECK-NEXT: add a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv12bf16_nxv4bf16: +; ZVBB-LABEL: vector_interleave_nxv5f64_nxv1f64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: slli a1, a0, 2 ; ZVBB-NEXT: add a0, a1, a0 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVBB-NEXT: vsseg3e16.v v8, (a0) -; ZVBB-NEXT: vl1re16.v v8, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl1re16.v v9, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl1re16.v v10, (a0) +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; ZVBB-NEXT: vsseg5e64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v10, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1re64.v v11, (a3) +; ZVBB-NEXT: vl1re64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v9, (a2) +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vl1re64.v v12, (a1) ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 1 +; ZVBB-NEXT: slli a1, a0, 2 ; ZVBB-NEXT: add a0, a1, a0 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv12bf16( %v0, %v1, %v2) - ret %res + %res = call @llvm.vector.interleave5.nxv5f64( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv24bf16_nxv8bf16( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv24bf16_nxv8bf16: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; CHECK-NEXT: vsseg3e16.v v8, (a0) -; CHECK-NEXT: vl2re16.v v8, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl2re16.v v10, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl2re16.v v12, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +define @vector_interleave_nxv10f64_nxv2f64( %v0, %v1, %v2, %v3, %v4) nounwind { +; RV32-LABEL: vector_interleave_nxv10f64_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v16 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv2r.v v16, v8 +; RV32-NEXT: vmv2r.v v22, v16 +; RV32-NEXT: vmv2r.v v24, v18 +; RV32-NEXT: vmv1r.v v26, v20 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v23, v10 +; RV32-NEXT: add a4, a1, a2 +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vmv1r.v v25, v14 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v18, v11 +; RV32-NEXT: vsseg5e64.v v22, (a0) +; RV32-NEXT: vmv1r.v v20, v15 +; RV32-NEXT: vsseg5e64.v v17, (a1) +; 
RV32-NEXT: vl1re64.v v16, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v17, (a6) +; RV32-NEXT: add a6, a3, a2 +; RV32-NEXT: vl1re64.v v10, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v11, (a6) +; RV32-NEXT: vl1re64.v v8, (a0) +; RV32-NEXT: vl1re64.v v9, (a3) +; RV32-NEXT: vl1re64.v v14, (a4) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a3, 10 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re64.v v15, (a5) +; RV32-NEXT: vl1re64.v v12, (a6) +; RV32-NEXT: vl1re64.v v13, (a1) +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs2r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vl8re64.v v16, (a2) +; RV32-NEXT: vl8re64.v v8, (a0) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_nxv10f64_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v16 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv2r.v v16, v8 +; RV64-NEXT: vmv2r.v v22, v16 +; RV64-NEXT: vmv2r.v v24, v18 +; RV64-NEXT: vmv1r.v v26, v20 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v23, v10 +; RV64-NEXT: add a4, a1, a2 +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vmv1r.v v25, v14 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v18, v11 +; RV64-NEXT: vsseg5e64.v v22, (a0) +; RV64-NEXT: vmv1r.v v20, v15 +; RV64-NEXT: vsseg5e64.v v17, (a1) +; RV64-NEXT: vl1re64.v v16, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v17, (a6) +; RV64-NEXT: add a6, a3, a2 +; RV64-NEXT: vl1re64.v v10, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v11, (a6) +; RV64-NEXT: vl1re64.v v8, (a0) +; RV64-NEXT: vl1re64.v v9, (a3) +; RV64-NEXT: vl1re64.v v14, (a4) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a3, 10 +; RV64-NEXT: mul a0, a0, a3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re64.v v15, (a5) +; RV64-NEXT: vl1re64.v v12, (a6) +; RV64-NEXT: vl1re64.v v13, (a1) +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs2r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vl8re64.v v16, (a2) +; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv10f64_nxv2f64: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-RV32-NEXT: 
vmv2r.v v20, v16 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: slli a2, a1, 2 +; ZVBB-RV32-NEXT: add a1, a2, a1 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv2r.v v16, v8 +; ZVBB-RV32-NEXT: vmv2r.v v22, v16 +; ZVBB-RV32-NEXT: vmv2r.v v24, v18 +; ZVBB-RV32-NEXT: vmv1r.v v26, v20 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v23, v10 +; ZVBB-RV32-NEXT: add a4, a1, a2 +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v25, v14 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v18, v11 +; ZVBB-RV32-NEXT: vsseg5e64.v v22, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v20, v15 +; ZVBB-RV32-NEXT: vsseg5e64.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re64.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v17, (a6) +; ZVBB-RV32-NEXT: add a6, a3, a2 +; ZVBB-RV32-NEXT: vl1re64.v v10, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re64.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re64.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re64.v v14, (a4) +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a3, 10 +; ZVBB-RV32-NEXT: mul a0, a0, a3 +; ZVBB-RV32-NEXT: add a0, sp, a0 +; ZVBB-RV32-NEXT: addi a0, a0, 64 +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re64.v v15, (a5) +; ZVBB-RV32-NEXT: vl1re64.v v12, (a6) +; ZVBB-RV32-NEXT: vl1re64.v v13, (a1) +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a0, a2 +; ZVBB-RV32-NEXT: vs2r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re64.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re64.v v8, (a0) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv8bf16: -; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 6 -; ZVBB-NEXT: mul a0, a0, a1 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: slli a1, a1, 1 -; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma -; ZVBB-NEXT: vsseg3e16.v v8, (a0) -; ZVBB-NEXT: vl2re16.v v8, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl2re16.v v10, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl2re16.v v12, (a0) -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 6 -; ZVBB-NEXT: mul a0, a0, a1 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv24bf16( %v0, %v1, %v2) - ret %res -} - -define @vector_interleave_nxv3f32_nxv1f32( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv3f32_nxv1f32: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 1 -; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; CHECK-NEXT: vsseg3e32.v v8, (a0) -; CHECK-NEXT: add a3, a0, a2 -; CHECK-NEXT: vle32.v v9, (a3) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: srli a1, a1, 3 -; CHECK-NEXT: add a0, a1, a1 -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v9, a1 -; CHECK-NEXT: add a2, a3, a2 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a2) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; 
CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; ZVBB-RV64-LABEL: vector_interleave_nxv10f64_nxv2f64: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v16 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: slli a2, a1, 2 +; ZVBB-RV64-NEXT: add a1, a2, a1 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv2r.v v16, v8 +; ZVBB-RV64-NEXT: vmv2r.v v22, v16 +; ZVBB-RV64-NEXT: vmv2r.v v24, v18 +; ZVBB-RV64-NEXT: vmv1r.v v26, v20 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v23, v10 +; ZVBB-RV64-NEXT: add a4, a1, a2 +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v25, v14 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v18, v11 +; ZVBB-RV64-NEXT: vsseg5e64.v v22, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v20, v15 +; ZVBB-RV64-NEXT: vsseg5e64.v v17, (a1) +; ZVBB-RV64-NEXT: vl1re64.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v17, (a6) +; ZVBB-RV64-NEXT: add a6, a3, a2 +; ZVBB-RV64-NEXT: vl1re64.v v10, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re64.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re64.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re64.v v14, (a4) +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a3, 10 +; ZVBB-RV64-NEXT: mul a0, a0, a3 +; ZVBB-RV64-NEXT: add a0, sp, a0 +; ZVBB-RV64-NEXT: addi a0, a0, 64 +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re64.v v15, (a5) +; ZVBB-RV64-NEXT: vl1re64.v v12, (a6) +; ZVBB-RV64-NEXT: vl1re64.v v13, (a1) +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a0, a2 +; ZVBB-RV64-NEXT: vs2r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re64.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re64.v v8, (a0) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv3f32_nxv1f32: -; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 1 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: srli a2, a1, 1 -; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vsseg3e32.v v8, (a0) -; ZVBB-NEXT: add a3, a0, a2 -; ZVBB-NEXT: vle32.v v9, (a3) -; ZVBB-NEXT: vle32.v v8, (a0) -; ZVBB-NEXT: srli a1, a1, 3 -; ZVBB-NEXT: add a0, a1, a1 -; ZVBB-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v8, v9, a1 -; ZVBB-NEXT: add a2, a3, a2 -; ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vle32.v v9, (a2) -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 1 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv3f32( %v0, %v1, %v2) - ret %res +; ZIP-LABEL: vector_interleave_nxv10f64_nxv2f64: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -80 +; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; 
ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZIP-NEXT: addi s0, sp, 80 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a1, 28 +; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: andi sp, sp, -64 +; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZIP-NEXT: vmv2r.v v20, v16 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv2r.v v18, v12 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a2, a1, 2 +; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: vmv2r.v v16, v8 +; ZIP-NEXT: vmv2r.v v22, v16 +; ZIP-NEXT: vmv2r.v v24, v18 +; ZIP-NEXT: vmv1r.v v26, v20 +; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: vmv1r.v v23, v10 +; ZIP-NEXT: add a4, a1, a2 +; ZIP-NEXT: add a5, a4, a2 +; ZIP-NEXT: vmv1r.v v25, v14 +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vmv1r.v v18, v11 +; ZIP-NEXT: vsseg5e64.v v22, (a0) +; ZIP-NEXT: vmv1r.v v20, v15 +; ZIP-NEXT: vsseg5e64.v v17, (a1) +; ZIP-NEXT: vl1re64.v v16, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v17, (a6) +; ZIP-NEXT: add a6, a3, a2 +; ZIP-NEXT: vl1re64.v v10, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v11, (a6) +; ZIP-NEXT: vl1re64.v v8, (a0) +; ZIP-NEXT: vl1re64.v v9, (a3) +; ZIP-NEXT: vl1re64.v v14, (a4) +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a3, 10 +; ZIP-NEXT: mul a0, a0, a3 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 64 +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re64.v v15, (a5) +; ZIP-NEXT: vl1re64.v v12, (a6) +; ZIP-NEXT: vl1re64.v v13, (a1) +; ZIP-NEXT: slli a2, a2, 3 +; ZIP-NEXT: add a2, a0, a2 +; ZIP-NEXT: vs2r.v v16, (a2) +; ZIP-NEXT: vs8r.v v8, (a0) +; ZIP-NEXT: vl8re64.v v16, (a2) +; ZIP-NEXT: vl8re64.v v8, (a0) +; ZIP-NEXT: addi sp, s0, -80 +; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZIP-NEXT: addi sp, sp, 80 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave5.nxv10f64( %v0, %v1, %v2, %v3, %v4) + ret %res } -define @vector_interleave_nxv6f32_nxv2f32( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv6f32_nxv2f32: +define @vector_interleave_nxv12f16_nxv2f16( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; CHECK-LABEL: vector_interleave_nxv12f16_nxv2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -4757,13 +9785,30 @@ define @vector_interleave_nxv6f32_nxv2f32( @vector_interleave_nxv6f32_nxv2f32( @vector_interleave_nxv6f32_nxv2f32( @llvm.vector.interleave3.nxv6f32( %v0, %v1, %v2) - ret %res + %res = call @llvm.vector.interleave6.nxv12f16( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv12f32_nxv4f32( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv12f32_nxv4f32: +define @vector_interleave_nxv24f16_nxv4f16( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; CHECK-LABEL: vector_interleave_nxv24f16_nxv4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -4807,14 +9869,19 @@ define @vector_interleave_nxv12f32_nxv4f32( @vector_interleave_nxv12f32_nxv4f32( @vector_interleave_nxv12f32_nxv4f32( @llvm.vector.interleave3.nxv12f32( %v0, %v1, %v2) - ret %res + %res = call @llvm.vector.interleave6.nxv24f16( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv3f64_nxv1f64( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv3f64_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; 
CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; CHECK-NEXT: vsseg3e64.v v8, (a0) -; CHECK-NEXT: vl1re64.v v8, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl1re64.v v9, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl1re64.v v10, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +define @vector_interleave_nxv48f16_nxv8f16( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; RV32-LABEL: vector_interleave_nxv48f16_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -80 +; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32-NEXT: addi s0, sp, 80 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 28 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: andi sp, sp, -64 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV32-NEXT: vmv2r.v v20, v14 +; RV32-NEXT: vmv2r.v v22, v12 +; RV32-NEXT: vmv2r.v v24, v10 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a0, 6 +; RV32-NEXT: mul a1, a1, a0 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv1r.v v10, v25 +; RV32-NEXT: vmv1r.v v11, v23 +; RV32-NEXT: vmv1r.v v12, v21 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: vmv1r.v v13, v17 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vmv1r.v v14, v19 +; RV32-NEXT: vsseg6e16.v v9, (a1) +; RV32-NEXT: vmv1r.v v9, v24 +; RV32-NEXT: add a5, a1, a2 +; RV32-NEXT: vmv1r.v v10, v22 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: vmv1r.v v11, v20 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vmv1r.v v12, v16 +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vmv1r.v v13, v18 +; RV32-NEXT: vsseg6e16.v v8, (a0) +; RV32-NEXT: vl1re16.v v14, (a1) +; RV32-NEXT: add a1, a6, a2 +; RV32-NEXT: vl1re16.v v15, (a5) +; RV32-NEXT: add a5, a1, a2 +; RV32-NEXT: vl1re16.v v18, (a5) +; RV32-NEXT: add a5, a5, a2 +; RV32-NEXT: vl1re16.v v19, (a5) +; RV32-NEXT: add a5, a4, a2 +; RV32-NEXT: vl1re16.v v16, (a6) +; RV32-NEXT: add a6, a5, a2 +; RV32-NEXT: vl1re16.v v12, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v13, (a6) +; RV32-NEXT: csrr a6, vlenb +; RV32-NEXT: li a7, 12 +; RV32-NEXT: mul a6, a6, a7 +; RV32-NEXT: add a6, sp, a6 +; RV32-NEXT: addi a6, a6, 64 +; RV32-NEXT: vl1re16.v v17, (a1) +; RV32-NEXT: vl1re16.v v10, (a4) +; RV32-NEXT: vl1re16.v v11, (a5) +; RV32-NEXT: vl1re16.v v8, (a0) +; RV32-NEXT: vl1re16.v v9, (a3) +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, a6, a2 +; RV32-NEXT: vs4r.v v16, (a2) +; RV32-NEXT: vs8r.v v8, (a6) +; RV32-NEXT: vl8re16.v v16, (a2) +; RV32-NEXT: vl8re16.v v8, (a6) +; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 80 +; RV32-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv3f64_nxv1f64: -; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 1 -; ZVBB-NEXT: add a0, a1, a0 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: vsetvli a2, zero, e64, m1, ta, ma -; ZVBB-NEXT: vsseg3e64.v v8, (a0) -; ZVBB-NEXT: vl1re64.v v8, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl1re64.v v9, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl1re64.v v10, (a0) -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 1 -; ZVBB-NEXT: add a0, a1, a0 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv3f64( 
%v0, %v1, %v2) - ret %res -} - -define @vector_interleave_nxv6f64_nxv2f64( %v0, %v1, %v2) nounwind { -; CHECK-LABEL: vector_interleave_nxv6f64_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, ma -; CHECK-NEXT: vsseg3e64.v v8, (a0) -; CHECK-NEXT: vl2re64.v v8, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl2re64.v v10, (a0) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vl2re64.v v12, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; RV64-LABEL: vector_interleave_nxv48f16_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -80 +; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64-NEXT: addi s0, sp, 80 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 28 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV64-NEXT: vmv2r.v v20, v14 +; RV64-NEXT: vmv2r.v v22, v12 +; RV64-NEXT: vmv2r.v v24, v10 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a0, 6 +; RV64-NEXT: mul a1, a1, a0 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: vmv1r.v v10, v25 +; RV64-NEXT: vmv1r.v v11, v23 +; RV64-NEXT: vmv1r.v v12, v21 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: vmv1r.v v13, v17 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vmv1r.v v14, v19 +; RV64-NEXT: vsseg6e16.v v9, (a1) +; RV64-NEXT: vmv1r.v v9, v24 +; RV64-NEXT: add a5, a1, a2 +; RV64-NEXT: vmv1r.v v10, v22 +; RV64-NEXT: add a3, a0, a2 +; RV64-NEXT: vmv1r.v v11, v20 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vmv1r.v v12, v16 +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vmv1r.v v13, v18 +; RV64-NEXT: vsseg6e16.v v8, (a0) +; RV64-NEXT: vl1re16.v v14, (a1) +; RV64-NEXT: add a1, a6, a2 +; RV64-NEXT: vl1re16.v v15, (a5) +; RV64-NEXT: add a5, a1, a2 +; RV64-NEXT: vl1re16.v v18, (a5) +; RV64-NEXT: add a5, a5, a2 +; RV64-NEXT: vl1re16.v v19, (a5) +; RV64-NEXT: add a5, a4, a2 +; RV64-NEXT: vl1re16.v v16, (a6) +; RV64-NEXT: add a6, a5, a2 +; RV64-NEXT: vl1re16.v v12, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v13, (a6) +; RV64-NEXT: csrr a6, vlenb +; RV64-NEXT: li a7, 12 +; RV64-NEXT: mul a6, a6, a7 +; RV64-NEXT: add a6, sp, a6 +; RV64-NEXT: addi a6, a6, 64 +; RV64-NEXT: vl1re16.v v17, (a1) +; RV64-NEXT: vl1re16.v v10, (a4) +; RV64-NEXT: vl1re16.v v11, (a5) +; RV64-NEXT: vl1re16.v v8, (a0) +; RV64-NEXT: vl1re16.v v9, (a3) +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: add a2, a6, a2 +; RV64-NEXT: vs4r.v v16, (a2) +; RV64-NEXT: vs8r.v v8, (a6) +; RV64-NEXT: vl8re16.v v16, (a2) +; RV64-NEXT: vl8re16.v v8, (a6) +; RV64-NEXT: addi sp, s0, -80 +; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 80 +; RV64-NEXT: ret +; +; ZVBB-RV32-LABEL: vector_interleave_nxv48f16_nxv8f16: +; ZVBB-RV32: # %bb.0: +; ZVBB-RV32-NEXT: addi sp, sp, -80 +; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; ZVBB-RV32-NEXT: addi s0, sp, 80 +; ZVBB-RV32-NEXT: csrr a0, vlenb +; ZVBB-RV32-NEXT: li a1, 28 +; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: sub sp, sp, a0 +; ZVBB-RV32-NEXT: andi sp, sp, -64 +; 
ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v20, v14 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 +; ZVBB-RV32-NEXT: vmv2r.v v24, v10 +; ZVBB-RV32-NEXT: csrr a1, vlenb +; ZVBB-RV32-NEXT: li a0, 6 +; ZVBB-RV32-NEXT: mul a1, a1, a0 +; ZVBB-RV32-NEXT: add a1, sp, a1 +; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: vmv1r.v v10, v25 +; ZVBB-RV32-NEXT: vmv1r.v v11, v23 +; ZVBB-RV32-NEXT: vmv1r.v v12, v21 +; ZVBB-RV32-NEXT: addi a0, sp, 64 +; ZVBB-RV32-NEXT: vmv1r.v v13, v17 +; ZVBB-RV32-NEXT: csrr a2, vlenb +; ZVBB-RV32-NEXT: vmv1r.v v14, v19 +; ZVBB-RV32-NEXT: vsseg6e16.v v9, (a1) +; ZVBB-RV32-NEXT: vmv1r.v v9, v24 +; ZVBB-RV32-NEXT: add a5, a1, a2 +; ZVBB-RV32-NEXT: vmv1r.v v10, v22 +; ZVBB-RV32-NEXT: add a3, a0, a2 +; ZVBB-RV32-NEXT: vmv1r.v v11, v20 +; ZVBB-RV32-NEXT: add a4, a3, a2 +; ZVBB-RV32-NEXT: vmv1r.v v12, v16 +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vmv1r.v v13, v18 +; ZVBB-RV32-NEXT: vsseg6e16.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re16.v v14, (a1) +; ZVBB-RV32-NEXT: add a1, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v15, (a5) +; ZVBB-RV32-NEXT: add a5, a1, a2 +; ZVBB-RV32-NEXT: vl1re16.v v18, (a5) +; ZVBB-RV32-NEXT: add a5, a5, a2 +; ZVBB-RV32-NEXT: vl1re16.v v19, (a5) +; ZVBB-RV32-NEXT: add a5, a4, a2 +; ZVBB-RV32-NEXT: vl1re16.v v16, (a6) +; ZVBB-RV32-NEXT: add a6, a5, a2 +; ZVBB-RV32-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV32-NEXT: csrr a6, vlenb +; ZVBB-RV32-NEXT: li a7, 12 +; ZVBB-RV32-NEXT: mul a6, a6, a7 +; ZVBB-RV32-NEXT: add a6, sp, a6 +; ZVBB-RV32-NEXT: addi a6, a6, 64 +; ZVBB-RV32-NEXT: vl1re16.v v17, (a1) +; ZVBB-RV32-NEXT: vl1re16.v v10, (a4) +; ZVBB-RV32-NEXT: vl1re16.v v11, (a5) +; ZVBB-RV32-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV32-NEXT: slli a2, a2, 3 +; ZVBB-RV32-NEXT: add a2, a6, a2 +; ZVBB-RV32-NEXT: vs4r.v v16, (a2) +; ZVBB-RV32-NEXT: vs8r.v v8, (a6) +; ZVBB-RV32-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re16.v v8, (a6) +; ZVBB-RV32-NEXT: addi sp, s0, -80 +; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; ZVBB-RV32-NEXT: addi sp, sp, 80 +; ZVBB-RV32-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv6f64_nxv2f64: -; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 6 -; ZVBB-NEXT: mul a0, a0, a1 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: slli a1, a1, 1 -; ZVBB-NEXT: vsetvli a2, zero, e64, m2, ta, ma -; ZVBB-NEXT: vsseg3e64.v v8, (a0) -; ZVBB-NEXT: vl2re64.v v8, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl2re64.v v10, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl2re64.v v12, (a0) -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: li a1, 6 -; ZVBB-NEXT: mul a0, a0, a1 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave3.nxv6f64( %v0, %v1, %v2) - ret %res +; ZVBB-RV64-LABEL: vector_interleave_nxv48f16_nxv8f16: +; ZVBB-RV64: # %bb.0: +; ZVBB-RV64-NEXT: addi sp, sp, -80 +; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZVBB-RV64-NEXT: addi s0, sp, 80 +; ZVBB-RV64-NEXT: csrr a0, vlenb +; ZVBB-RV64-NEXT: li a1, 28 +; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: sub sp, sp, a0 +; ZVBB-RV64-NEXT: andi sp, sp, -64 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v20, v14 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 +; 
ZVBB-RV64-NEXT: vmv2r.v v24, v10 +; ZVBB-RV64-NEXT: csrr a1, vlenb +; ZVBB-RV64-NEXT: li a0, 6 +; ZVBB-RV64-NEXT: mul a1, a1, a0 +; ZVBB-RV64-NEXT: add a1, sp, a1 +; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: vmv1r.v v10, v25 +; ZVBB-RV64-NEXT: vmv1r.v v11, v23 +; ZVBB-RV64-NEXT: vmv1r.v v12, v21 +; ZVBB-RV64-NEXT: addi a0, sp, 64 +; ZVBB-RV64-NEXT: vmv1r.v v13, v17 +; ZVBB-RV64-NEXT: csrr a2, vlenb +; ZVBB-RV64-NEXT: vmv1r.v v14, v19 +; ZVBB-RV64-NEXT: vsseg6e16.v v9, (a1) +; ZVBB-RV64-NEXT: vmv1r.v v9, v24 +; ZVBB-RV64-NEXT: add a5, a1, a2 +; ZVBB-RV64-NEXT: vmv1r.v v10, v22 +; ZVBB-RV64-NEXT: add a3, a0, a2 +; ZVBB-RV64-NEXT: vmv1r.v v11, v20 +; ZVBB-RV64-NEXT: add a4, a3, a2 +; ZVBB-RV64-NEXT: vmv1r.v v12, v16 +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vmv1r.v v13, v18 +; ZVBB-RV64-NEXT: vsseg6e16.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re16.v v14, (a1) +; ZVBB-RV64-NEXT: add a1, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v15, (a5) +; ZVBB-RV64-NEXT: add a5, a1, a2 +; ZVBB-RV64-NEXT: vl1re16.v v18, (a5) +; ZVBB-RV64-NEXT: add a5, a5, a2 +; ZVBB-RV64-NEXT: vl1re16.v v19, (a5) +; ZVBB-RV64-NEXT: add a5, a4, a2 +; ZVBB-RV64-NEXT: vl1re16.v v16, (a6) +; ZVBB-RV64-NEXT: add a6, a5, a2 +; ZVBB-RV64-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV64-NEXT: csrr a6, vlenb +; ZVBB-RV64-NEXT: li a7, 12 +; ZVBB-RV64-NEXT: mul a6, a6, a7 +; ZVBB-RV64-NEXT: add a6, sp, a6 +; ZVBB-RV64-NEXT: addi a6, a6, 64 +; ZVBB-RV64-NEXT: vl1re16.v v17, (a1) +; ZVBB-RV64-NEXT: vl1re16.v v10, (a4) +; ZVBB-RV64-NEXT: vl1re16.v v11, (a5) +; ZVBB-RV64-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV64-NEXT: slli a2, a2, 3 +; ZVBB-RV64-NEXT: add a2, a6, a2 +; ZVBB-RV64-NEXT: vs4r.v v16, (a2) +; ZVBB-RV64-NEXT: vs8r.v v8, (a6) +; ZVBB-RV64-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re16.v v8, (a6) +; ZVBB-RV64-NEXT: addi sp, s0, -80 +; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZVBB-RV64-NEXT: addi sp, sp, 80 +; ZVBB-RV64-NEXT: ret +; +; ZIP-LABEL: vector_interleave_nxv48f16_nxv8f16: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -80 +; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; ZIP-NEXT: addi s0, sp, 80 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a1, 28 +; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: andi sp, sp, -64 +; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZIP-NEXT: vmv2r.v v20, v14 +; ZIP-NEXT: vmv2r.v v22, v12 +; ZIP-NEXT: vmv2r.v v24, v10 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: li a0, 6 +; ZIP-NEXT: mul a1, a1, a0 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: vmv1r.v v10, v25 +; ZIP-NEXT: vmv1r.v v11, v23 +; ZIP-NEXT: vmv1r.v v12, v21 +; ZIP-NEXT: addi a0, sp, 64 +; ZIP-NEXT: vmv1r.v v13, v17 +; ZIP-NEXT: csrr a2, vlenb +; ZIP-NEXT: vmv1r.v v14, v19 +; ZIP-NEXT: vsseg6e16.v v9, (a1) +; ZIP-NEXT: vmv1r.v v9, v24 +; ZIP-NEXT: add a5, a1, a2 +; ZIP-NEXT: vmv1r.v v10, v22 +; ZIP-NEXT: add a3, a0, a2 +; ZIP-NEXT: vmv1r.v v11, v20 +; ZIP-NEXT: add a4, a3, a2 +; ZIP-NEXT: vmv1r.v v12, v16 +; ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vmv1r.v v13, v18 +; ZIP-NEXT: vsseg6e16.v v8, (a0) +; ZIP-NEXT: vl1re16.v v14, (a1) +; ZIP-NEXT: add a1, a6, a2 +; ZIP-NEXT: vl1re16.v v15, (a5) +; ZIP-NEXT: add a5, a1, a2 +; ZIP-NEXT: vl1re16.v v18, (a5) +; ZIP-NEXT: add a5, a5, a2 +; ZIP-NEXT: vl1re16.v v19, (a5) +; ZIP-NEXT: add a5, a4, a2 +; ZIP-NEXT: vl1re16.v v16, (a6) +; 
ZIP-NEXT: add a6, a5, a2 +; ZIP-NEXT: vl1re16.v v12, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v13, (a6) +; ZIP-NEXT: csrr a6, vlenb +; ZIP-NEXT: li a7, 12 +; ZIP-NEXT: mul a6, a6, a7 +; ZIP-NEXT: add a6, sp, a6 +; ZIP-NEXT: addi a6, a6, 64 +; ZIP-NEXT: vl1re16.v v17, (a1) +; ZIP-NEXT: vl1re16.v v10, (a4) +; ZIP-NEXT: vl1re16.v v11, (a5) +; ZIP-NEXT: vl1re16.v v8, (a0) +; ZIP-NEXT: vl1re16.v v9, (a3) +; ZIP-NEXT: slli a2, a2, 3 +; ZIP-NEXT: add a2, a6, a2 +; ZIP-NEXT: vs4r.v v16, (a2) +; ZIP-NEXT: vs8r.v v8, (a6) +; ZIP-NEXT: vl8re16.v v16, (a2) +; ZIP-NEXT: vl8re16.v v8, (a6) +; ZIP-NEXT: addi sp, s0, -80 +; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; ZIP-NEXT: addi sp, sp, 80 +; ZIP-NEXT: ret + %res = call @llvm.vector.interleave6.nxv48f16( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv10f16_nxv2f16( %v0, %v1, %v2, %v3, %v4) nounwind { -; CHECK-LABEL: vector_interleave_nxv10f16_nxv2f16: +define @vector_interleave_nxv12bf16_nxv2bf16( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; CHECK-LABEL: vector_interleave_nxv12bf16_nxv2bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -4964,23 +10303,27 @@ define @vector_interleave_nxv10f16_nxv2f16( @vector_interleave_nxv10f16_nxv2f16( @vector_interleave_nxv10f16_nxv2f16( @llvm.vector.interleave5.nxv10f16( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave6.nxv12bf16( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv20f16_nxv4f16( %v0, %v1, %v2, %v3, %v4) nounwind { -; CHECK-LABEL: vector_interleave_nxv20f16_nxv4f16: +define @vector_interleave_nxv24bf16_nxv4bf16( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; CHECK-LABEL: vector_interleave_nxv24bf16_nxv4bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: add a3, a2, a1 ; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma -; CHECK-NEXT: vsseg5e16.v v8, (a0) +; CHECK-NEXT: vsseg6e16.v v8, (a0) ; CHECK-NEXT: vl1re16.v v10, (a3) ; CHECK-NEXT: add a3, a3, a1 ; CHECK-NEXT: vl1re16.v v11, (a3) +; CHECK-NEXT: add a3, a3, a1 ; CHECK-NEXT: vl1re16.v v8, (a0) ; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: vl1re16.v v12, (a3) ; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vl1re16.v v12, (a1) +; CHECK-NEXT: vl1re16.v v13, (a1) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv20f16_nxv4f16: +; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv4bf16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 2 -; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: add a3, a2, a1 ; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma -; ZVBB-NEXT: vsseg5e16.v v8, (a0) +; ZVBB-NEXT: vsseg6e16.v v8, (a0) ; ZVBB-NEXT: vl1re16.v v10, (a3) ; ZVBB-NEXT: add a3, a3, a1 ; ZVBB-NEXT: vl1re16.v v11, (a3) +; ZVBB-NEXT: add a3, a3, a1 ; ZVBB-NEXT: vl1re16.v v8, (a0) ; ZVBB-NEXT: vl1re16.v v9, (a2) +; 
ZVBB-NEXT: vl1re16.v v12, (a3) ; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vl1re16.v v12, (a1) +; ZVBB-NEXT: vl1re16.v v13, (a1) ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 2 -; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave5.nxv20f16( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave6.nxv24bf16( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv40f16_nxv8f16( %v0, %v1, %v2, %v3, %v4) nounwind { -; RV32-LABEL: vector_interleave_nxv40f16_nxv8f16: +define @vector_interleave_nxv48bf16_nxv8bf16( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; RV32-LABEL: vector_interleave_nxv48bf16_nxv8bf16: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill @@ -5098,61 +10449,68 @@ define @vector_interleave_nxv40f16_nxv8f16( @vector_interleave_nxv40f16_nxv8f16( @vector_interleave_nxv40f16_nxv8f16( @vector_interleave_nxv40f16_nxv8f16( @vector_interleave_nxv40f16_nxv8f16( @llvm.vector.interleave5.nxv40f16( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave6.nxv48bf16( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv10bf16_nxv2bf16( %v0, %v1, %v2, %v3, %v4) nounwind { -; CHECK-LABEL: vector_interleave_nxv10bf16_nxv2bf16: +define @vector_interleave_nxv6f32_nxv1f32( %v0, %v1, %v2, %v3, %v4, %v6) nounwind { +; CHECK-LABEL: vector_interleave_nxv6f32_nxv1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -5432,23 +10818,27 @@ define @vector_interleave_nxv10bf16_nxv2bf16( @vector_interleave_nxv10bf16_nxv2bf16( @vector_interleave_nxv10bf16_nxv2bf16( @llvm.vector.interleave5.nxv10bf16( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave6.nxv6f32( %v0, %v1, %v2, %v3, %v4, %v6) + ret %res } -define @vector_interleave_nxv20bf16_nxv4bf16( %v0, %v1, %v2, %v3, %v4) nounwind { -; CHECK-LABEL: vector_interleave_nxv20bf16_nxv4bf16: +define @vector_interleave_nxv12f32_nxv2f32( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; CHECK-LABEL: vector_interleave_nxv12f32_nxv2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma -; CHECK-NEXT: vsseg5e16.v v8, (a0) -; CHECK-NEXT: vl1re16.v v10, (a3) +; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg6e32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v10, (a3) ; CHECK-NEXT: add a3, a3, a1 -; CHECK-NEXT: vl1re16.v v11, (a3) -; CHECK-NEXT: vl1re16.v v8, (a0) -; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: vl1re32.v v11, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1re32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v9, (a2) +; CHECK-NEXT: vl1re32.v v12, (a3) ; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vl1re16.v v12, (a1) +; CHECK-NEXT: vl1re32.v v13, (a1) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv20bf16_nxv4bf16: +; ZVBB-LABEL: vector_interleave_nxv12f32_nxv2f32: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 
; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 2 -; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma -; ZVBB-NEXT: vsseg5e16.v v8, (a0) -; ZVBB-NEXT: vl1re16.v v10, (a3) +; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; ZVBB-NEXT: vsseg6e32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v10, (a3) ; ZVBB-NEXT: add a3, a3, a1 -; ZVBB-NEXT: vl1re16.v v11, (a3) -; ZVBB-NEXT: vl1re16.v v8, (a0) -; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: vl1re32.v v11, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1re32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v9, (a2) +; ZVBB-NEXT: vl1re32.v v12, (a3) ; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vl1re16.v v12, (a1) +; ZVBB-NEXT: vl1re32.v v13, (a1) ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 2 -; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave5.nxv20bf16( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave6.nxv12f32( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv40bf16_nxv8bf16( %v0, %v1, %v2, %v3, %v4) nounwind { -; RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16: +define @vector_interleave_nxv24f32_nxv4f32( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; RV32-LABEL: vector_interleave_nxv24f32_nxv4f32: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill @@ -5565,62 +10963,69 @@ define @vector_interleave_nxv40bf16_nxv8bf16( @vector_interleave_nxv40bf16_nxv8bf16( @vector_interleave_nxv40bf16_nxv8bf16( @vector_interleave_nxv40bf16_nxv8bf16( @vector_interleave_nxv40bf16_nxv8bf16( @llvm.vector.interleave5.nxv40bf16( %v0, %v1, %v2, %v3, %v4) - ret %res -} - -define @vector_interleave_nxv5f32_nxv1f32( %v0, %v1, %v2, %v3, %v4) nounwind { -; CHECK-LABEL: vector_interleave_nxv5f32_nxv1f32: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 1 -; CHECK-NEXT: add a3, a0, a2 -; CHECK-NEXT: add a4, a3, a2 -; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma -; CHECK-NEXT: vsseg5e32.v v8, (a0) -; CHECK-NEXT: add a5, a4, a2 -; CHECK-NEXT: vle32.v v8, (a5) -; CHECK-NEXT: vle32.v v9, (a4) -; CHECK-NEXT: srli a1, a1, 3 -; CHECK-NEXT: add a4, a1, a1 -; CHECK-NEXT: vle32.v v10, (a3) -; CHECK-NEXT: vsetvli zero, a4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vx v9, v8, a1 -; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a4, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v10, a1 -; CHECK-NEXT: add a2, a5, a2 -; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v10, (a2) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret -; -; ZVBB-LABEL: vector_interleave_nxv5f32_nxv1f32: -; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 1 -; ZVBB-NEXT: add a0, a1, a0 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: srli a2, a1, 1 -; ZVBB-NEXT: add 
a3, a0, a2 -; ZVBB-NEXT: add a4, a3, a2 -; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vsseg5e32.v v8, (a0) -; ZVBB-NEXT: add a5, a4, a2 -; ZVBB-NEXT: vle32.v v8, (a5) -; ZVBB-NEXT: vle32.v v9, (a4) -; ZVBB-NEXT: srli a1, a1, 3 -; ZVBB-NEXT: add a4, a1, a1 -; ZVBB-NEXT: vle32.v v10, (a3) -; ZVBB-NEXT: vsetvli zero, a4, e32, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v9, v8, a1 -; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vle32.v v8, (a0) -; ZVBB-NEXT: vsetvli zero, a4, e32, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v8, v10, a1 -; ZVBB-NEXT: add a2, a5, a2 -; ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vle32.v v10, (a2) -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 1 -; ZVBB-NEXT: add a0, a1, a0 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave5.nxv5f32( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave6.nxv24f32( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv10f32_nxv2f32( %v0, %v1, %v2, %v3, %v4) nounwind { -; CHECK-LABEL: vector_interleave_nxv10f32_nxv2f32: +define @vector_interleave_nxv6f64_nxv1f64( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; CHECK-LABEL: vector_interleave_nxv6f64_nxv1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg5e32.v v8, (a0) -; CHECK-NEXT: vl1re32.v v10, (a3) +; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; CHECK-NEXT: vsseg6e64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v10, (a3) ; CHECK-NEXT: add a3, a3, a1 -; CHECK-NEXT: vl1re32.v v11, (a3) -; CHECK-NEXT: vl1re32.v v8, (a0) -; CHECK-NEXT: vl1re32.v v9, (a2) +; CHECK-NEXT: vl1re64.v v11, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1re64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v9, (a2) +; CHECK-NEXT: vl1re64.v v12, (a3) ; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vl1re32.v v12, (a1) +; CHECK-NEXT: vl1re64.v v13, (a1) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: li a1, 6 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv10f32_nxv2f32: +; ZVBB-LABEL: vector_interleave_nxv6f64_nxv1f64: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 2 -; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; ZVBB-NEXT: vsseg5e32.v v8, (a0) -; ZVBB-NEXT: vl1re32.v v10, (a3) +; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma +; ZVBB-NEXT: vsseg6e64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v10, (a3) ; ZVBB-NEXT: add a3, a3, a1 -; ZVBB-NEXT: vl1re32.v v11, (a3) -; ZVBB-NEXT: vl1re32.v v8, (a0) -; ZVBB-NEXT: vl1re32.v v9, (a2) +; ZVBB-NEXT: vl1re64.v v11, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1re64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v9, (a2) +; ZVBB-NEXT: vl1re64.v v12, (a3) ; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vl1re32.v v12, (a1) +; ZVBB-NEXT: vl1re64.v v13, (a1) ; ZVBB-NEXT: csrr a0, vlenb -; 
ZVBB-NEXT: slli a1, a0, 2 -; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: li a1, 6 +; ZVBB-NEXT: mul a0, a0, a1 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave5.nxv10f32( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave6.nxv6f64( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res } -define @vector_interleave_nxv20f32_nxv4f32( %v0, %v1, %v2, %v3, %v4) nounwind { -; RV32-LABEL: vector_interleave_nxv20f32_nxv4f32: +define @vector_interleave_nxv12f64_nxv2f64( %v0, %v1, %v2, %v3, %v4, %v5) nounwind { +; RV32-LABEL: vector_interleave_nxv12f64_nxv2f64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill @@ -6033,62 +11394,69 @@ define @vector_interleave_nxv20f32_nxv4f32( @vector_interleave_nxv20f32_nxv4f32( @vector_interleave_nxv20f32_nxv4f32( @vector_interleave_nxv20f32_nxv4f32( @vector_interleave_nxv20f32_nxv4f32( @llvm.vector.interleave5.nxv20f32( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave6.nxv12f64( %v0, %v1, %v2, %v3, %v4, %v5) + ret %res +} + +define @vector_interleave_nxv14f16_nxv2f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; CHECK-LABEL: vector_interleave_nxv14f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: add a6, a5, a2 +; CHECK-NEXT: vsetvli a7, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg7e16.v v8, (a0) +; CHECK-NEXT: add a7, a6, a2 +; CHECK-NEXT: vle16.v v8, (a7) +; CHECK-NEXT: vle16.v v10, (a6) +; CHECK-NEXT: add a6, a1, a1 +; CHECK-NEXT: add a2, a7, a2 +; CHECK-NEXT: vle16.v v12, (a5) +; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v10, v8, a1 +; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v11, (a2) +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v12, a1 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v12, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv14f16_nxv2f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a5, a4, a2 +; ZVBB-NEXT: add a6, a5, a2 +; ZVBB-NEXT: vsetvli a7, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg7e16.v v8, (a0) +; ZVBB-NEXT: add a7, a6, a2 +; ZVBB-NEXT: vle16.v v8, (a7) +; ZVBB-NEXT: vle16.v v10, (a6) +; ZVBB-NEXT: add a6, a1, a1 +; ZVBB-NEXT: add a2, a7, a2 +; ZVBB-NEXT: vle16.v v12, (a5) +; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v10, v8, a1 +; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v11, (a2) +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v12, a1 +; ZVBB-NEXT: vsetvli a2, zero, e16, mf2, 
ta, ma +; ZVBB-NEXT: vle16.v v12, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v12, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave7.nxv14f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv5f64_nxv1f64( %v0, %v1, %v2, %v3, %v4) nounwind { -; CHECK-LABEL: vector_interleave_nxv5f64_nxv1f64: +define @vector_interleave_nxv28f16_nxv4f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; CHECK-LABEL: vector_interleave_nxv28f16_nxv4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a1, a0 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma -; CHECK-NEXT: vsseg5e64.v v8, (a0) -; CHECK-NEXT: vl1re64.v v10, (a3) +; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; CHECK-NEXT: vsseg7e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v10, (a3) ; CHECK-NEXT: add a3, a3, a1 -; CHECK-NEXT: vl1re64.v v11, (a3) -; CHECK-NEXT: vl1re64.v v8, (a0) -; CHECK-NEXT: vl1re64.v v9, (a2) -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vl1re64.v v12, (a1) +; CHECK-NEXT: vl1re16.v v11, (a3) +; CHECK-NEXT: add a3, a3, a1 +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: add a0, a3, a1 +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: vl1re16.v v12, (a3) +; CHECK-NEXT: vl1re16.v v13, (a0) +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vl1re16.v v14, (a0) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a1, a0 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv5f64_nxv1f64: +; ZVBB-LABEL: vector_interleave_nxv28f16_nxv4f16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 2 -; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: slli a1, a0, 3 +; ZVBB-NEXT: sub a0, a1, a0 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma -; ZVBB-NEXT: vsseg5e64.v v8, (a0) -; ZVBB-NEXT: vl1re64.v v10, (a3) +; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg7e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v10, (a3) ; ZVBB-NEXT: add a3, a3, a1 -; ZVBB-NEXT: vl1re64.v v11, (a3) -; ZVBB-NEXT: vl1re64.v v8, (a0) -; ZVBB-NEXT: vl1re64.v v9, (a2) -; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vl1re64.v v12, (a1) +; ZVBB-NEXT: vl1re16.v v11, (a3) +; ZVBB-NEXT: add a3, a3, a1 +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: add a0, a3, a1 +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: vl1re16.v v12, (a3) +; ZVBB-NEXT: vl1re16.v v13, (a0) +; ZVBB-NEXT: add a0, a0, a1 +; ZVBB-NEXT: vl1re16.v v14, (a0) ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 2 -; ZVBB-NEXT: add a0, a1, a0 +; ZVBB-NEXT: slli a1, a0, 3 +; ZVBB-NEXT: sub a0, a1, a0 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave5.nxv5f64( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave7.nxv28f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define 
@vector_interleave_nxv10f64_nxv2f64( %v0, %v1, %v2, %v3, %v4) nounwind { -; RV32-LABEL: vector_interleave_nxv10f64_nxv2f64: +define @vector_interleave_nxv56f16_nxv8f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; RV32-LABEL: vector_interleave_nxv56f16_nxv8f16: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill ; RV32-NEXT: addi s0, sp, 80 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 28 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: sub sp, sp, a0 ; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vmv2r.v v20, v16 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV32-NEXT: vmv2r.v v26, v20 ; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: vmv2r.v v18, v12 +; RV32-NEXT: vmv2r.v v24, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 64 +; RV32-NEXT: vmv2r.v v22, v12 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vmv2r.v v16, v8 -; RV32-NEXT: vmv2r.v v22, v16 -; RV32-NEXT: vmv2r.v v24, v18 -; RV32-NEXT: vmv1r.v v26, v20 +; RV32-NEXT: vmv2r.v v20, v8 +; RV32-NEXT: vmv1r.v v1, v20 +; RV32-NEXT: vmv1r.v v3, v22 +; RV32-NEXT: vmv1r.v v5, v24 +; RV32-NEXT: vmv1r.v v7, v26 ; RV32-NEXT: add a3, a0, a2 -; RV32-NEXT: vmv1r.v v23, v10 +; RV32-NEXT: vmv1r.v v2, v10 ; RV32-NEXT: add a4, a1, a2 -; RV32-NEXT: add a5, a4, a2 -; RV32-NEXT: vmv1r.v v25, v14 -; RV32-NEXT: add a6, a5, a2 -; RV32-NEXT: vmv1r.v v18, v11 -; RV32-NEXT: vsseg5e64.v v22, (a0) -; RV32-NEXT: vmv1r.v v20, v15 -; RV32-NEXT: vsseg5e64.v v17, (a1) -; RV32-NEXT: vl1re64.v v16, (a6) +; RV32-NEXT: slli a5, a2, 2 +; RV32-NEXT: vmv1r.v v4, v14 +; RV32-NEXT: slli a6, a2, 4 +; RV32-NEXT: add a7, a4, a2 +; RV32-NEXT: vmv1r.v v6, v18 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: vmv1r.v v22, v11 +; RV32-NEXT: add a6, a7, a2 +; RV32-NEXT: vmv1r.v v24, v15 +; RV32-NEXT: vsseg7e16.v v1, (a0) +; RV32-NEXT: vmv1r.v v26, v19 +; RV32-NEXT: vsseg7e16.v v21, (a1) +; RV32-NEXT: vl1re16.v v18, (a6) ; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v17, (a6) +; RV32-NEXT: vl1re16.v v19, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v20, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v21, (a6) ; RV32-NEXT: add a6, a3, a2 -; RV32-NEXT: vl1re64.v v10, (a6) +; RV32-NEXT: vl1re16.v v10, (a6) ; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v11, (a6) -; RV32-NEXT: vl1re64.v v8, (a0) -; RV32-NEXT: vl1re64.v v9, (a3) -; RV32-NEXT: vl1re64.v v14, (a4) +; RV32-NEXT: vl1re16.v v11, (a6) +; RV32-NEXT: vl1re16.v v8, (a0) +; RV32-NEXT: vl1re16.v v16, (a4) +; RV32-NEXT: vl1re16.v v9, (a3) +; RV32-NEXT: vl1re16.v v17, (a7) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 10 +; RV32-NEXT: li a3, 14 ; RV32-NEXT: mul a0, a0, a3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 64 ; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v15, (a5) -; RV32-NEXT: vl1re64.v v12, (a6) -; RV32-NEXT: vl1re64.v v13, (a1) +; RV32-NEXT: vl1re16.v v12, (a6) +; RV32-NEXT: add a6, a6, a2 +; RV32-NEXT: vl1re16.v v13, (a6) +; RV32-NEXT: add a6, a6, a2 ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, a0, a2 -; RV32-NEXT: vs2r.v v16, (a2) +; RV32-NEXT: vl1re16.v v14, (a6) +; RV32-NEXT: vl1re16.v v15, (a1) +; RV32-NEXT: add a5, a0, a5 +; RV32-NEXT: vs2r.v v20, (a5) +; RV32-NEXT: vs4r.v v16, (a2) ; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: vl8re64.v v16, (a2) -; RV32-NEXT: 
vl8re64.v v8, (a0) +; RV32-NEXT: vl8re16.v v16, (a2) +; RV32-NEXT: vl8re16.v v8, (a0) ; RV32-NEXT: addi sp, s0, -80 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 80 ; RV32-NEXT: ret ; -; RV64-LABEL: vector_interleave_nxv10f64_nxv2f64: +; RV64-LABEL: vector_interleave_nxv56f16_nxv8f16: ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -80 ; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 80 ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 28 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: slli a0, a0, 5 ; RV64-NEXT: sub sp, sp, a0 ; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vmv2r.v v20, v16 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV64-NEXT: vmv2r.v v26, v20 ; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: vmv2r.v v18, v12 +; RV64-NEXT: vmv2r.v v24, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 2 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 64 +; RV64-NEXT: vmv2r.v v22, v12 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: vmv2r.v v16, v8 -; RV64-NEXT: vmv2r.v v22, v16 -; RV64-NEXT: vmv2r.v v24, v18 -; RV64-NEXT: vmv1r.v v26, v20 +; RV64-NEXT: vmv2r.v v20, v8 +; RV64-NEXT: vmv1r.v v1, v20 +; RV64-NEXT: vmv1r.v v3, v22 +; RV64-NEXT: vmv1r.v v5, v24 +; RV64-NEXT: vmv1r.v v7, v26 ; RV64-NEXT: add a3, a0, a2 -; RV64-NEXT: vmv1r.v v23, v10 +; RV64-NEXT: vmv1r.v v2, v10 ; RV64-NEXT: add a4, a1, a2 -; RV64-NEXT: add a5, a4, a2 -; RV64-NEXT: vmv1r.v v25, v14 -; RV64-NEXT: add a6, a5, a2 -; RV64-NEXT: vmv1r.v v18, v11 -; RV64-NEXT: vsseg5e64.v v22, (a0) -; RV64-NEXT: vmv1r.v v20, v15 -; RV64-NEXT: vsseg5e64.v v17, (a1) -; RV64-NEXT: vl1re64.v v16, (a6) +; RV64-NEXT: slli a5, a2, 2 +; RV64-NEXT: vmv1r.v v4, v14 +; RV64-NEXT: slli a6, a2, 4 +; RV64-NEXT: add a7, a4, a2 +; RV64-NEXT: vmv1r.v v6, v18 +; RV64-NEXT: sub a5, a6, a5 +; RV64-NEXT: vmv1r.v v22, v11 +; RV64-NEXT: add a6, a7, a2 +; RV64-NEXT: vmv1r.v v24, v15 +; RV64-NEXT: vsseg7e16.v v1, (a0) +; RV64-NEXT: vmv1r.v v26, v19 +; RV64-NEXT: vsseg7e16.v v21, (a1) +; RV64-NEXT: vl1re16.v v18, (a6) ; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v17, (a6) +; RV64-NEXT: vl1re16.v v19, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v20, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v21, (a6) ; RV64-NEXT: add a6, a3, a2 -; RV64-NEXT: vl1re64.v v10, (a6) +; RV64-NEXT: vl1re16.v v10, (a6) ; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v11, (a6) -; RV64-NEXT: vl1re64.v v8, (a0) -; RV64-NEXT: vl1re64.v v9, (a3) -; RV64-NEXT: vl1re64.v v14, (a4) +; RV64-NEXT: vl1re16.v v11, (a6) +; RV64-NEXT: vl1re16.v v8, (a0) +; RV64-NEXT: vl1re16.v v16, (a4) +; RV64-NEXT: vl1re16.v v9, (a3) +; RV64-NEXT: vl1re16.v v17, (a7) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a3, 10 +; RV64-NEXT: li a3, 14 ; RV64-NEXT: mul a0, a0, a3 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 64 ; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v15, (a5) -; RV64-NEXT: vl1re64.v v12, (a6) -; RV64-NEXT: vl1re64.v v13, (a1) +; RV64-NEXT: vl1re16.v v12, (a6) +; RV64-NEXT: add a6, a6, a2 +; RV64-NEXT: vl1re16.v v13, (a6) +; RV64-NEXT: add a6, a6, a2 ; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, a0, a2 -; RV64-NEXT: vs2r.v v16, (a2) +; RV64-NEXT: vl1re16.v v14, (a6) +; RV64-NEXT: vl1re16.v v15, (a1) +; RV64-NEXT: add a5, a0, a5 +; RV64-NEXT: vs2r.v v20, (a5) +; 
RV64-NEXT: vs4r.v v16, (a2) ; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: vl8re64.v v16, (a2) -; RV64-NEXT: vl8re64.v v8, (a0) +; RV64-NEXT: vl8re16.v v16, (a2) +; RV64-NEXT: vl8re16.v v8, (a0) ; RV64-NEXT: addi sp, s0, -80 ; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret ; -; ZVBB-RV32-LABEL: vector_interleave_nxv10f64_nxv2f64: +; ZVBB-RV32-LABEL: vector_interleave_nxv56f16_nxv8f16: ; ZVBB-RV32: # %bb.0: ; ZVBB-RV32-NEXT: addi sp, sp, -80 ; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill ; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill ; ZVBB-RV32-NEXT: addi s0, sp, 80 ; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: li a1, 28 -; ZVBB-RV32-NEXT: mul a0, a0, a1 +; ZVBB-RV32-NEXT: slli a0, a0, 5 ; ZVBB-RV32-NEXT: sub sp, sp, a0 ; ZVBB-RV32-NEXT: andi sp, sp, -64 -; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; ZVBB-RV32-NEXT: vmv2r.v v20, v16 +; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV32-NEXT: vmv2r.v v26, v20 ; ZVBB-RV32-NEXT: addi a0, sp, 64 -; ZVBB-RV32-NEXT: vmv2r.v v18, v12 +; ZVBB-RV32-NEXT: vmv2r.v v24, v16 ; ZVBB-RV32-NEXT: csrr a1, vlenb -; ZVBB-RV32-NEXT: slli a2, a1, 2 -; ZVBB-RV32-NEXT: add a1, a2, a1 +; ZVBB-RV32-NEXT: slli a2, a1, 3 +; ZVBB-RV32-NEXT: sub a1, a2, a1 ; ZVBB-RV32-NEXT: add a1, sp, a1 ; ZVBB-RV32-NEXT: addi a1, a1, 64 +; ZVBB-RV32-NEXT: vmv2r.v v22, v12 ; ZVBB-RV32-NEXT: csrr a2, vlenb -; ZVBB-RV32-NEXT: vmv2r.v v16, v8 -; ZVBB-RV32-NEXT: vmv2r.v v22, v16 -; ZVBB-RV32-NEXT: vmv2r.v v24, v18 -; ZVBB-RV32-NEXT: vmv1r.v v26, v20 +; ZVBB-RV32-NEXT: vmv2r.v v20, v8 +; ZVBB-RV32-NEXT: vmv1r.v v1, v20 +; ZVBB-RV32-NEXT: vmv1r.v v3, v22 +; ZVBB-RV32-NEXT: vmv1r.v v5, v24 +; ZVBB-RV32-NEXT: vmv1r.v v7, v26 ; ZVBB-RV32-NEXT: add a3, a0, a2 -; ZVBB-RV32-NEXT: vmv1r.v v23, v10 +; ZVBB-RV32-NEXT: vmv1r.v v2, v10 ; ZVBB-RV32-NEXT: add a4, a1, a2 -; ZVBB-RV32-NEXT: add a5, a4, a2 -; ZVBB-RV32-NEXT: vmv1r.v v25, v14 -; ZVBB-RV32-NEXT: add a6, a5, a2 -; ZVBB-RV32-NEXT: vmv1r.v v18, v11 -; ZVBB-RV32-NEXT: vsseg5e64.v v22, (a0) -; ZVBB-RV32-NEXT: vmv1r.v v20, v15 -; ZVBB-RV32-NEXT: vsseg5e64.v v17, (a1) -; ZVBB-RV32-NEXT: vl1re64.v v16, (a6) +; ZVBB-RV32-NEXT: slli a5, a2, 2 +; ZVBB-RV32-NEXT: vmv1r.v v4, v14 +; ZVBB-RV32-NEXT: slli a6, a2, 4 +; ZVBB-RV32-NEXT: add a7, a4, a2 +; ZVBB-RV32-NEXT: vmv1r.v v6, v18 +; ZVBB-RV32-NEXT: sub a5, a6, a5 +; ZVBB-RV32-NEXT: vmv1r.v v22, v11 +; ZVBB-RV32-NEXT: add a6, a7, a2 +; ZVBB-RV32-NEXT: vmv1r.v v24, v15 +; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0) +; ZVBB-RV32-NEXT: vmv1r.v v26, v19 +; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1) +; ZVBB-RV32-NEXT: vl1re16.v v18, (a6) ; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v17, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v19, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v20, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v21, (a6) ; ZVBB-RV32-NEXT: add a6, a3, a2 -; ZVBB-RV32-NEXT: vl1re64.v v10, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v10, (a6) ; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v11, (a6) -; ZVBB-RV32-NEXT: vl1re64.v v8, (a0) -; ZVBB-RV32-NEXT: vl1re64.v v9, (a3) -; ZVBB-RV32-NEXT: vl1re64.v v14, (a4) +; ZVBB-RV32-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV32-NEXT: vl1re16.v v16, (a4) +; ZVBB-RV32-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV32-NEXT: vl1re16.v v17, (a7) ; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: li a3, 10 +; ZVBB-RV32-NEXT: li a3, 14 ; ZVBB-RV32-NEXT: mul a0, 
a0, a3 ; ZVBB-RV32-NEXT: add a0, sp, a0 ; ZVBB-RV32-NEXT: addi a0, a0, 64 ; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v15, (a5) -; ZVBB-RV32-NEXT: vl1re64.v v12, (a6) -; ZVBB-RV32-NEXT: vl1re64.v v13, (a1) +; ZVBB-RV32-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 +; ZVBB-RV32-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV32-NEXT: add a6, a6, a2 ; ZVBB-RV32-NEXT: slli a2, a2, 3 ; ZVBB-RV32-NEXT: add a2, a0, a2 -; ZVBB-RV32-NEXT: vs2r.v v16, (a2) +; ZVBB-RV32-NEXT: vl1re16.v v14, (a6) +; ZVBB-RV32-NEXT: vl1re16.v v15, (a1) +; ZVBB-RV32-NEXT: add a5, a0, a5 +; ZVBB-RV32-NEXT: vs2r.v v20, (a5) +; ZVBB-RV32-NEXT: vs4r.v v16, (a2) ; ZVBB-RV32-NEXT: vs8r.v v8, (a0) -; ZVBB-RV32-NEXT: vl8re64.v v16, (a2) -; ZVBB-RV32-NEXT: vl8re64.v v8, (a0) +; ZVBB-RV32-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV32-NEXT: vl8re16.v v8, (a0) ; ZVBB-RV32-NEXT: addi sp, s0, -80 ; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; ZVBB-RV32-NEXT: addi sp, sp, 80 ; ZVBB-RV32-NEXT: ret ; -; ZVBB-RV64-LABEL: vector_interleave_nxv10f64_nxv2f64: +; ZVBB-RV64-LABEL: vector_interleave_nxv56f16_nxv8f16: ; ZVBB-RV64: # %bb.0: ; ZVBB-RV64-NEXT: addi sp, sp, -80 ; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; ZVBB-RV64-NEXT: addi s0, sp, 80 ; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: li a1, 28 -; ZVBB-RV64-NEXT: mul a0, a0, a1 +; ZVBB-RV64-NEXT: slli a0, a0, 5 ; ZVBB-RV64-NEXT: sub sp, sp, a0 ; ZVBB-RV64-NEXT: andi sp, sp, -64 -; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; ZVBB-RV64-NEXT: vmv2r.v v20, v16 +; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-RV64-NEXT: vmv2r.v v26, v20 ; ZVBB-RV64-NEXT: addi a0, sp, 64 -; ZVBB-RV64-NEXT: vmv2r.v v18, v12 +; ZVBB-RV64-NEXT: vmv2r.v v24, v16 ; ZVBB-RV64-NEXT: csrr a1, vlenb -; ZVBB-RV64-NEXT: slli a2, a1, 2 -; ZVBB-RV64-NEXT: add a1, a2, a1 +; ZVBB-RV64-NEXT: slli a2, a1, 3 +; ZVBB-RV64-NEXT: sub a1, a2, a1 ; ZVBB-RV64-NEXT: add a1, sp, a1 ; ZVBB-RV64-NEXT: addi a1, a1, 64 +; ZVBB-RV64-NEXT: vmv2r.v v22, v12 ; ZVBB-RV64-NEXT: csrr a2, vlenb -; ZVBB-RV64-NEXT: vmv2r.v v16, v8 -; ZVBB-RV64-NEXT: vmv2r.v v22, v16 -; ZVBB-RV64-NEXT: vmv2r.v v24, v18 -; ZVBB-RV64-NEXT: vmv1r.v v26, v20 +; ZVBB-RV64-NEXT: vmv2r.v v20, v8 +; ZVBB-RV64-NEXT: vmv1r.v v1, v20 +; ZVBB-RV64-NEXT: vmv1r.v v3, v22 +; ZVBB-RV64-NEXT: vmv1r.v v5, v24 +; ZVBB-RV64-NEXT: vmv1r.v v7, v26 ; ZVBB-RV64-NEXT: add a3, a0, a2 -; ZVBB-RV64-NEXT: vmv1r.v v23, v10 +; ZVBB-RV64-NEXT: vmv1r.v v2, v10 ; ZVBB-RV64-NEXT: add a4, a1, a2 -; ZVBB-RV64-NEXT: add a5, a4, a2 -; ZVBB-RV64-NEXT: vmv1r.v v25, v14 -; ZVBB-RV64-NEXT: add a6, a5, a2 -; ZVBB-RV64-NEXT: vmv1r.v v18, v11 -; ZVBB-RV64-NEXT: vsseg5e64.v v22, (a0) -; ZVBB-RV64-NEXT: vmv1r.v v20, v15 -; ZVBB-RV64-NEXT: vsseg5e64.v v17, (a1) -; ZVBB-RV64-NEXT: vl1re64.v v16, (a6) +; ZVBB-RV64-NEXT: slli a5, a2, 2 +; ZVBB-RV64-NEXT: vmv1r.v v4, v14 +; ZVBB-RV64-NEXT: slli a6, a2, 4 +; ZVBB-RV64-NEXT: add a7, a4, a2 +; ZVBB-RV64-NEXT: vmv1r.v v6, v18 +; ZVBB-RV64-NEXT: sub a5, a6, a5 +; ZVBB-RV64-NEXT: vmv1r.v v22, v11 +; ZVBB-RV64-NEXT: add a6, a7, a2 +; ZVBB-RV64-NEXT: vmv1r.v v24, v15 +; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0) +; ZVBB-RV64-NEXT: vmv1r.v v26, v19 +; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1) +; ZVBB-RV64-NEXT: vl1re16.v v18, (a6) ; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v17, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v19, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v20, (a6) +; 
ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v21, (a6) ; ZVBB-RV64-NEXT: add a6, a3, a2 -; ZVBB-RV64-NEXT: vl1re64.v v10, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v10, (a6) ; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v11, (a6) -; ZVBB-RV64-NEXT: vl1re64.v v8, (a0) -; ZVBB-RV64-NEXT: vl1re64.v v9, (a3) -; ZVBB-RV64-NEXT: vl1re64.v v14, (a4) +; ZVBB-RV64-NEXT: vl1re16.v v11, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v8, (a0) +; ZVBB-RV64-NEXT: vl1re16.v v16, (a4) +; ZVBB-RV64-NEXT: vl1re16.v v9, (a3) +; ZVBB-RV64-NEXT: vl1re16.v v17, (a7) ; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: li a3, 10 +; ZVBB-RV64-NEXT: li a3, 14 ; ZVBB-RV64-NEXT: mul a0, a0, a3 ; ZVBB-RV64-NEXT: add a0, sp, a0 ; ZVBB-RV64-NEXT: addi a0, a0, 64 ; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v15, (a5) -; ZVBB-RV64-NEXT: vl1re64.v v12, (a6) -; ZVBB-RV64-NEXT: vl1re64.v v13, (a1) +; ZVBB-RV64-NEXT: vl1re16.v v12, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 +; ZVBB-RV64-NEXT: vl1re16.v v13, (a6) +; ZVBB-RV64-NEXT: add a6, a6, a2 ; ZVBB-RV64-NEXT: slli a2, a2, 3 ; ZVBB-RV64-NEXT: add a2, a0, a2 -; ZVBB-RV64-NEXT: vs2r.v v16, (a2) +; ZVBB-RV64-NEXT: vl1re16.v v14, (a6) +; ZVBB-RV64-NEXT: vl1re16.v v15, (a1) +; ZVBB-RV64-NEXT: add a5, a0, a5 +; ZVBB-RV64-NEXT: vs2r.v v20, (a5) +; ZVBB-RV64-NEXT: vs4r.v v16, (a2) ; ZVBB-RV64-NEXT: vs8r.v v8, (a0) -; ZVBB-RV64-NEXT: vl8re64.v v16, (a2) -; ZVBB-RV64-NEXT: vl8re64.v v8, (a0) +; ZVBB-RV64-NEXT: vl8re16.v v16, (a2) +; ZVBB-RV64-NEXT: vl8re16.v v8, (a0) ; ZVBB-RV64-NEXT: addi sp, s0, -80 ; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZVBB-RV64-NEXT: addi sp, sp, 80 ; ZVBB-RV64-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv10f64_nxv2f64: +; ZIP-LABEL: vector_interleave_nxv56f16_nxv8f16: ; ZIP: # %bb.0: ; ZIP-NEXT: addi sp, sp, -80 ; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill ; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill ; ZIP-NEXT: addi s0, sp, 80 ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a1, 28 -; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: slli a0, a0, 5 ; ZIP-NEXT: sub sp, sp, a0 ; ZIP-NEXT: andi sp, sp, -64 -; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; ZIP-NEXT: vmv2r.v v20, v16 +; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZIP-NEXT: vmv2r.v v26, v20 ; ZIP-NEXT: addi a0, sp, 64 -; ZIP-NEXT: vmv2r.v v18, v12 +; ZIP-NEXT: vmv2r.v v24, v16 ; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a2, a1, 2 -; ZIP-NEXT: add a1, a2, a1 +; ZIP-NEXT: slli a2, a1, 3 +; ZIP-NEXT: sub a1, a2, a1 ; ZIP-NEXT: add a1, sp, a1 ; ZIP-NEXT: addi a1, a1, 64 +; ZIP-NEXT: vmv2r.v v22, v12 ; ZIP-NEXT: csrr a2, vlenb -; ZIP-NEXT: vmv2r.v v16, v8 -; ZIP-NEXT: vmv2r.v v22, v16 -; ZIP-NEXT: vmv2r.v v24, v18 -; ZIP-NEXT: vmv1r.v v26, v20 +; ZIP-NEXT: vmv2r.v v20, v8 +; ZIP-NEXT: vmv1r.v v1, v20 +; ZIP-NEXT: vmv1r.v v3, v22 +; ZIP-NEXT: vmv1r.v v5, v24 +; ZIP-NEXT: vmv1r.v v7, v26 ; ZIP-NEXT: add a3, a0, a2 -; ZIP-NEXT: vmv1r.v v23, v10 +; ZIP-NEXT: vmv1r.v v2, v10 ; ZIP-NEXT: add a4, a1, a2 -; ZIP-NEXT: add a5, a4, a2 -; ZIP-NEXT: vmv1r.v v25, v14 -; ZIP-NEXT: add a6, a5, a2 -; ZIP-NEXT: vmv1r.v v18, v11 -; ZIP-NEXT: vsseg5e64.v v22, (a0) -; ZIP-NEXT: vmv1r.v v20, v15 -; ZIP-NEXT: vsseg5e64.v v17, (a1) -; ZIP-NEXT: vl1re64.v v16, (a6) +; ZIP-NEXT: slli a5, a2, 2 +; ZIP-NEXT: vmv1r.v v4, v14 +; ZIP-NEXT: slli a6, a2, 4 +; ZIP-NEXT: add a7, a4, a2 +; ZIP-NEXT: vmv1r.v v6, v18 +; ZIP-NEXT: sub a5, a6, a5 +; ZIP-NEXT: vmv1r.v v22, v11 +; ZIP-NEXT: add a6, a7, a2 +; ZIP-NEXT: vmv1r.v v24, v15 +; ZIP-NEXT: 
vsseg7e16.v v1, (a0) +; ZIP-NEXT: vmv1r.v v26, v19 +; ZIP-NEXT: vsseg7e16.v v21, (a1) +; ZIP-NEXT: vl1re16.v v18, (a6) ; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v17, (a6) +; ZIP-NEXT: vl1re16.v v19, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v20, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v21, (a6) ; ZIP-NEXT: add a6, a3, a2 -; ZIP-NEXT: vl1re64.v v10, (a6) +; ZIP-NEXT: vl1re16.v v10, (a6) ; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v11, (a6) -; ZIP-NEXT: vl1re64.v v8, (a0) -; ZIP-NEXT: vl1re64.v v9, (a3) -; ZIP-NEXT: vl1re64.v v14, (a4) +; ZIP-NEXT: vl1re16.v v11, (a6) +; ZIP-NEXT: vl1re16.v v8, (a0) +; ZIP-NEXT: vl1re16.v v16, (a4) +; ZIP-NEXT: vl1re16.v v9, (a3) +; ZIP-NEXT: vl1re16.v v17, (a7) ; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a3, 10 +; ZIP-NEXT: li a3, 14 ; ZIP-NEXT: mul a0, a0, a3 ; ZIP-NEXT: add a0, sp, a0 ; ZIP-NEXT: addi a0, a0, 64 ; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v15, (a5) -; ZIP-NEXT: vl1re64.v v12, (a6) -; ZIP-NEXT: vl1re64.v v13, (a1) +; ZIP-NEXT: vl1re16.v v12, (a6) +; ZIP-NEXT: add a6, a6, a2 +; ZIP-NEXT: vl1re16.v v13, (a6) +; ZIP-NEXT: add a6, a6, a2 ; ZIP-NEXT: slli a2, a2, 3 ; ZIP-NEXT: add a2, a0, a2 -; ZIP-NEXT: vs2r.v v16, (a2) +; ZIP-NEXT: vl1re16.v v14, (a6) +; ZIP-NEXT: vl1re16.v v15, (a1) +; ZIP-NEXT: add a5, a0, a5 +; ZIP-NEXT: vs2r.v v20, (a5) +; ZIP-NEXT: vs4r.v v16, (a2) ; ZIP-NEXT: vs8r.v v8, (a0) -; ZIP-NEXT: vl8re64.v v16, (a2) -; ZIP-NEXT: vl8re64.v v8, (a0) +; ZIP-NEXT: vl8re16.v v16, (a2) +; ZIP-NEXT: vl8re16.v v8, (a0) ; ZIP-NEXT: addi sp, s0, -80 ; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload ; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZIP-NEXT: addi sp, sp, 80 ; ZIP-NEXT: ret - %res = call @llvm.vector.interleave5.nxv10f64( %v0, %v1, %v2, %v3, %v4) - ret %res + %res = call @llvm.vector.interleave7.nxv56f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv14f16_nxv2f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; CHECK-LABEL: vector_interleave_nxv14f16_nxv2f16: +define @vector_interleave_nxv14bf16_nxv2bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; CHECK-LABEL: vector_interleave_nxv14bf16_nxv2bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -6788,7 +12356,7 @@ define @vector_interleave_nxv14f16_nxv2f16( @vector_interleave_nxv14f16_nxv2f16( @llvm.vector.interleave7.nxv14f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave7.nxv14bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv28f16_nxv4f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; CHECK-LABEL: vector_interleave_nxv28f16_nxv4f16: +define @vector_interleave_nxv28bf16_nxv4bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; CHECK-LABEL: vector_interleave_nxv28bf16_nxv4bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -6863,7 +12431,7 @@ define @vector_interleave_nxv28f16_nxv4f16( @vector_interleave_nxv28f16_nxv4f16( @llvm.vector.interleave7.nxv28f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res + %res = call @llvm.vector.interleave7.nxv28bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv56f16_nxv8f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; RV32-LABEL: vector_interleave_nxv56f16_nxv8f16: +define @vector_interleave_nxv56bf16_nxv8bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill @@ -6980,7 +12548,7 @@ define @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @vector_interleave_nxv56f16_nxv8f16( @llvm.vector.interleave7.nxv56f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res + %res = call @llvm.vector.interleave7.nxv56bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv14bf16_nxv2bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; CHECK-LABEL: vector_interleave_nxv14bf16_nxv2bf16: +define @vector_interleave_nxv7f32_nxv1f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; CHECK-LABEL: vector_interleave_nxv7f32_nxv1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -7321,30 +12889,30 @@ define @vector_interleave_nxv14bf16_nxv2bf16( @vector_interleave_nxv14bf16_nxv2bf16( @vector_interleave_nxv14bf16_nxv2bf16( @llvm.vector.interleave7.nxv14bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res + %res = call @llvm.vector.interleave7.nxv7f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv28bf16_nxv4bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; CHECK-LABEL: vector_interleave_nxv28bf16_nxv4bf16: +define @vector_interleave_nxv14f32_nxv2f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; CHECK-LABEL: vector_interleave_nxv14f32_nxv2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -7407,19 +12975,19 @@ define @vector_interleave_nxv28bf16_nxv4bf16( @vector_interleave_nxv28bf16_nxv4bf16( @vector_interleave_nxv28bf16_nxv4bf16( @llvm.vector.interleave7.nxv28bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res + %res = call @llvm.vector.interleave7.nxv14f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv56bf16_nxv8bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16: +define @vector_interleave_nxv28f32_nxv4f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; RV32-LABEL: vector_interleave_nxv28f32_nxv4f32: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill @@ -7472,7 +13040,7 @@ define @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @vector_interleave_nxv56bf16_nxv8bf16( @llvm.vector.interleave7.nxv56bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res -} - -define @vector_interleave_nxv7f32_nxv1f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; CHECK-LABEL: vector_interleave_nxv7f32_nxv1f32: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: srli a2, a1, 1 -; CHECK-NEXT: srli a1, a1, 3 -; CHECK-NEXT: add a3, a0, a2 -; CHECK-NEXT: add a4, a3, a2 -; CHECK-NEXT: add a5, a4, a2 -; CHECK-NEXT: add a6, a5, a2 -; CHECK-NEXT: vsetvli a7, zero, e32, mf2, ta, ma -; CHECK-NEXT: vsseg7e32.v v8, (a0) -; CHECK-NEXT: add a7, a6, a2 -; CHECK-NEXT: vle32.v v8, (a7) -; CHECK-NEXT: vle32.v v10, (a6) -; CHECK-NEXT: add a6, a1, a1 -; CHECK-NEXT: add a2, a7, a2 -; CHECK-NEXT: vle32.v v12, (a5) -; CHECK-NEXT: vsetvli zero, a6, 
e32, m1, ta, ma -; CHECK-NEXT: vslideup.vx v10, v8, a1 -; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v11, (a2) -; CHECK-NEXT: vle32.v v9, (a4) -; CHECK-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vx v9, v12, a1 -; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v12, (a3) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v12, a1 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret -; -; ZVBB-LABEL: vector_interleave_nxv7f32_nxv1f32: -; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 2 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: srli a2, a1, 1 -; ZVBB-NEXT: srli a1, a1, 3 -; ZVBB-NEXT: add a3, a0, a2 -; ZVBB-NEXT: add a4, a3, a2 -; ZVBB-NEXT: add a5, a4, a2 -; ZVBB-NEXT: add a6, a5, a2 -; ZVBB-NEXT: vsetvli a7, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vsseg7e32.v v8, (a0) -; ZVBB-NEXT: add a7, a6, a2 -; ZVBB-NEXT: vle32.v v8, (a7) -; ZVBB-NEXT: vle32.v v10, (a6) -; ZVBB-NEXT: add a6, a1, a1 -; ZVBB-NEXT: add a2, a7, a2 -; ZVBB-NEXT: vle32.v v12, (a5) -; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v10, v8, a1 -; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vle32.v v11, (a2) -; ZVBB-NEXT: vle32.v v9, (a4) -; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v9, v12, a1 -; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; ZVBB-NEXT: vle32.v v12, (a3) -; ZVBB-NEXT: vle32.v v8, (a0) -; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v8, v12, a1 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 2 -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave7.nxv7f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res + %res = call @llvm.vector.interleave7.nxv28f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv14f32_nxv2f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; CHECK-LABEL: vector_interleave_nxv14f32_nxv2f32: +define @vector_interleave_nxv7f64_nxv1f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; CHECK-LABEL: vector_interleave_nxv7f64_nxv1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb @@ -7971,19 +13455,19 @@ define @vector_interleave_nxv14f32_nxv2f32( @vector_interleave_nxv14f32_nxv2f32( @vector_interleave_nxv14f32_nxv2f32( @llvm.vector.interleave7.nxv14f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res + %res = call @llvm.vector.interleave7.nxv7f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res } -define @vector_interleave_nxv28f32_nxv4f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; RV32-LABEL: vector_interleave_nxv28f32_nxv4f32: +define @vector_interleave_nxv14f64_nxv2f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { +; RV32-LABEL: vector_interleave_nxv14f64_nxv2f64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -80 ; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill @@ -8036,7 +13520,7 @@ define @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( @vector_interleave_nxv28f32_nxv4f32( 
@vector_interleave_nxv28f32_nxv4f32( @llvm.vector.interleave7.nxv28f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res + %res = call @llvm.vector.interleave7.nxv14f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + ret %res +} + +define @vector_interleave_nxv16f16_nxv2f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { +; CHECK-LABEL: vector_interleave_nxv16f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: add a6, a5, a2 +; CHECK-NEXT: add a7, a6, a2 +; CHECK-NEXT: vsetvli t0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg8e16.v v8, (a0) +; CHECK-NEXT: add t0, a7, a2 +; CHECK-NEXT: add a2, t0, a2 +; CHECK-NEXT: vle16.v v11, (t0) +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vle16.v v9, (a7) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a7, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v10, (a6) +; CHECK-NEXT: vle16.v v8, (a5) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v10, v9, a1 +; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v12, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16f16_nxv2f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a5, a4, a2 +; ZVBB-NEXT: add a6, a5, a2 +; ZVBB-NEXT: add a7, a6, a2 +; ZVBB-NEXT: vsetvli t0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg8e16.v v8, (a0) +; ZVBB-NEXT: add t0, a7, a2 +; ZVBB-NEXT: add a2, t0, a2 +; ZVBB-NEXT: vle16.v v11, (t0) +; ZVBB-NEXT: vle16.v v8, (a2) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a1, a1 +; ZVBB-NEXT: vle16.v v9, (a7) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v11, v8, a1 +; ZVBB-NEXT: vsetvli a7, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v10, (a6) +; ZVBB-NEXT: vle16.v v8, (a5) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v10, v9, a1 +; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v12, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v12, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv16f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res +} + +define @vector_interleave_nxv32f16_nxv4f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { 
+; CHECK-LABEL: vector_interleave_nxv32f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: add a5, a4, a1 +; CHECK-NEXT: add a6, a5, a1 +; CHECK-NEXT: add a7, a6, a1 +; CHECK-NEXT: vsetvli t0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsseg8e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v14, (a7) +; CHECK-NEXT: add a1, a7, a1 +; CHECK-NEXT: vl1re16.v v15, (a1) +; CHECK-NEXT: vl1re16.v v12, (a5) +; CHECK-NEXT: vl1re16.v v13, (a6) +; CHECK-NEXT: vl1re16.v v10, (a3) +; CHECK-NEXT: vl1re16.v v11, (a4) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv32f16_nxv4f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: add a5, a4, a1 +; ZVBB-NEXT: add a6, a5, a1 +; ZVBB-NEXT: add a7, a6, a1 +; ZVBB-NEXT: vsetvli t0, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg8e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v14, (a7) +; ZVBB-NEXT: add a1, a7, a1 +; ZVBB-NEXT: vl1re16.v v15, (a1) +; ZVBB-NEXT: vl1re16.v v12, (a5) +; ZVBB-NEXT: vl1re16.v v13, (a6) +; ZVBB-NEXT: vl1re16.v v10, (a3) +; ZVBB-NEXT: vl1re16.v v11, (a4) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv32f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res } -define @vector_interleave_nxv7f64_nxv1f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; CHECK-LABEL: vector_interleave_nxv7f64_nxv1f64: +define @vector_interleave_nxv64f16_nxv8f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { +; CHECK-LABEL: vector_interleave_nxv64f16_nxv8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v26, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a0, a3 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: add a5, a2, a3 +; CHECK-NEXT: vmv1r.v v1, v8 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v2, v22 +; CHECK-NEXT: add a7, a5, a3 +; CHECK-NEXT: vmv1r.v v3, v12 +; CHECK-NEXT: add t0, a6, a3 +; CHECK-NEXT: vmv1r.v v4, v24 +; CHECK-NEXT: add t1, a7, a3 +; CHECK-NEXT: vmv1r.v v5, v16 +; CHECK-NEXT: add t2, t0, a3 +; CHECK-NEXT: vmv1r.v v6, v26 +; CHECK-NEXT: add t3, t1, a3 +; CHECK-NEXT: vmv1r.v v7, v20 +; CHECK-NEXT: add t4, t2, a3 +; CHECK-NEXT: vmv1r.v v8, v28 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: add t5, t3, a3 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: add t6, t4, a3 +; CHECK-NEXT: vmv1r.v v26, 
v17 +; CHECK-NEXT: vsseg8e16.v v1, (a0) +; CHECK-NEXT: vmv1r.v v28, v21 +; CHECK-NEXT: vsseg8e16.v v22, (a1) +; CHECK-NEXT: vl1re16.v v14, (t5) +; CHECK-NEXT: add t5, t5, a3 +; CHECK-NEXT: add a3, t6, a3 +; CHECK-NEXT: vl1re16.v v22, (t6) +; CHECK-NEXT: vl1re16.v v15, (t5) +; CHECK-NEXT: vl1re16.v v23, (a3) +; CHECK-NEXT: vl1re16.v v12, (t1) +; CHECK-NEXT: vl1re16.v v20, (t2) +; CHECK-NEXT: vl1re16.v v13, (t3) +; CHECK-NEXT: vl1re16.v v21, (t4) +; CHECK-NEXT: vl1re16.v v10, (a5) +; CHECK-NEXT: vl1re16.v v18, (a6) +; CHECK-NEXT: vl1re16.v v11, (a7) +; CHECK-NEXT: vl1re16.v v19, (t0) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v16, (a1) +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: vl1re16.v v17, (a4) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv64f16_nxv8f16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-NEXT: vmv2r.v v28, v22 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: vmv2r.v v26, v18 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a3, vlenb +; ZVBB-NEXT: add a2, a0, a3 +; ZVBB-NEXT: add a4, a1, a3 +; ZVBB-NEXT: add a5, a2, a3 +; ZVBB-NEXT: vmv1r.v v1, v8 +; ZVBB-NEXT: vmv2r.v v24, v14 +; ZVBB-NEXT: add a6, a4, a3 +; ZVBB-NEXT: vmv2r.v v22, v10 +; ZVBB-NEXT: vmv1r.v v2, v22 +; ZVBB-NEXT: add a7, a5, a3 +; ZVBB-NEXT: vmv1r.v v3, v12 +; ZVBB-NEXT: add t0, a6, a3 +; ZVBB-NEXT: vmv1r.v v4, v24 +; ZVBB-NEXT: add t1, a7, a3 +; ZVBB-NEXT: vmv1r.v v5, v16 +; ZVBB-NEXT: add t2, t0, a3 +; ZVBB-NEXT: vmv1r.v v6, v26 +; ZVBB-NEXT: add t3, t1, a3 +; ZVBB-NEXT: vmv1r.v v7, v20 +; ZVBB-NEXT: add t4, t2, a3 +; ZVBB-NEXT: vmv1r.v v8, v28 +; ZVBB-NEXT: vmv1r.v v22, v9 +; ZVBB-NEXT: add t5, t3, a3 +; ZVBB-NEXT: vmv1r.v v24, v13 +; ZVBB-NEXT: add t6, t4, a3 +; ZVBB-NEXT: vmv1r.v v26, v17 +; ZVBB-NEXT: vsseg8e16.v v1, (a0) +; ZVBB-NEXT: vmv1r.v v28, v21 +; ZVBB-NEXT: vsseg8e16.v v22, (a1) +; ZVBB-NEXT: vl1re16.v v14, (t5) +; ZVBB-NEXT: add t5, t5, a3 +; ZVBB-NEXT: add a3, t6, a3 +; ZVBB-NEXT: vl1re16.v v22, (t6) +; ZVBB-NEXT: vl1re16.v v15, (t5) +; ZVBB-NEXT: vl1re16.v v23, (a3) +; ZVBB-NEXT: vl1re16.v v12, (t1) +; ZVBB-NEXT: vl1re16.v v20, (t2) +; ZVBB-NEXT: vl1re16.v v13, (t3) +; ZVBB-NEXT: vl1re16.v v21, (t4) +; ZVBB-NEXT: vl1re16.v v10, (a5) +; ZVBB-NEXT: vl1re16.v v18, (a6) +; ZVBB-NEXT: vl1re16.v v11, (a7) +; ZVBB-NEXT: vl1re16.v v19, (t0) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v16, (a1) +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: vl1re16.v v17, (a4) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv64f16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res +} + +define @vector_interleave_nxv16bf16_nxv2bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { +; CHECK-LABEL: vector_interleave_nxv16bf16_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: add a6, a5, a2 +; CHECK-NEXT: add a7, 
a6, a2 +; CHECK-NEXT: vsetvli t0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsseg8e16.v v8, (a0) +; CHECK-NEXT: add t0, a7, a2 +; CHECK-NEXT: add a2, t0, a2 +; CHECK-NEXT: vle16.v v11, (t0) +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vle16.v v9, (a7) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a7, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v10, (a6) +; CHECK-NEXT: vle16.v v8, (a5) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v10, v9, a1 +; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v12, (a3) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv2bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a5, a4, a2 +; ZVBB-NEXT: add a6, a5, a2 +; ZVBB-NEXT: add a7, a6, a2 +; ZVBB-NEXT: vsetvli t0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsseg8e16.v v8, (a0) +; ZVBB-NEXT: add t0, a7, a2 +; ZVBB-NEXT: add a2, t0, a2 +; ZVBB-NEXT: vle16.v v11, (t0) +; ZVBB-NEXT: vle16.v v8, (a2) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: add a2, a1, a1 +; ZVBB-NEXT: vle16.v v9, (a7) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v11, v8, a1 +; ZVBB-NEXT: vsetvli a7, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v10, (a6) +; ZVBB-NEXT: vle16.v v8, (a5) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v10, v9, a1 +; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v12, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v12, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv16bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res +} + +define @vector_interleave_nxv32bf16_nxv4bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { +; CHECK-LABEL: vector_interleave_nxv32bf16_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma -; CHECK-NEXT: vsseg7e64.v v8, (a0) -; CHECK-NEXT: vl1re64.v v10, (a3) -; CHECK-NEXT: add a3, a3, a1 -; CHECK-NEXT: vl1re64.v v11, (a3) -; CHECK-NEXT: add a3, a3, a1 -; CHECK-NEXT: vl1re64.v v8, (a0) -; CHECK-NEXT: add a0, a3, a1 -; CHECK-NEXT: vl1re64.v v9, (a2) -; CHECK-NEXT: vl1re64.v v12, (a3) -; CHECK-NEXT: vl1re64.v v13, (a0) -; CHECK-NEXT: add a0, a0, a1 -; 
CHECK-NEXT: vl1re64.v v14, (a0) +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: add a5, a4, a1 +; CHECK-NEXT: add a6, a5, a1 +; CHECK-NEXT: add a7, a6, a1 +; CHECK-NEXT: vsetvli t0, zero, e16, m1, ta, ma +; CHECK-NEXT: vsseg8e16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v14, (a7) +; CHECK-NEXT: add a1, a7, a1 +; CHECK-NEXT: vl1re16.v v15, (a1) +; CHECK-NEXT: vl1re16.v v12, (a5) +; CHECK-NEXT: vl1re16.v v13, (a6) +; CHECK-NEXT: vl1re16.v v10, (a3) +; CHECK-NEXT: vl1re16.v v11, (a4) +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: vl1re16.v v9, (a2) ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret ; -; ZVBB-LABEL: vector_interleave_nxv7f64_nxv1f64: +; ZVBB-LABEL: vector_interleave_nxv32bf16_nxv4bf16: ; ZVBB: # %bb.0: ; ZVBB-NEXT: addi sp, sp, -16 ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 3 -; ZVBB-NEXT: sub a0, a1, a0 +; ZVBB-NEXT: slli a0, a0, 3 ; ZVBB-NEXT: sub sp, sp, a0 ; ZVBB-NEXT: addi a0, sp, 16 ; ZVBB-NEXT: csrr a1, vlenb ; ZVBB-NEXT: add a2, a0, a1 ; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma -; ZVBB-NEXT: vsseg7e64.v v8, (a0) -; ZVBB-NEXT: vl1re64.v v10, (a3) -; ZVBB-NEXT: add a3, a3, a1 -; ZVBB-NEXT: vl1re64.v v11, (a3) -; ZVBB-NEXT: add a3, a3, a1 -; ZVBB-NEXT: vl1re64.v v8, (a0) -; ZVBB-NEXT: add a0, a3, a1 -; ZVBB-NEXT: vl1re64.v v9, (a2) -; ZVBB-NEXT: vl1re64.v v12, (a3) -; ZVBB-NEXT: vl1re64.v v13, (a0) -; ZVBB-NEXT: add a0, a0, a1 -; ZVBB-NEXT: vl1re64.v v14, (a0) +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: add a5, a4, a1 +; ZVBB-NEXT: add a6, a5, a1 +; ZVBB-NEXT: add a7, a6, a1 +; ZVBB-NEXT: vsetvli t0, zero, e16, m1, ta, ma +; ZVBB-NEXT: vsseg8e16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v14, (a7) +; ZVBB-NEXT: add a1, a7, a1 +; ZVBB-NEXT: vl1re16.v v15, (a1) +; ZVBB-NEXT: vl1re16.v v12, (a5) +; ZVBB-NEXT: vl1re16.v v13, (a6) +; ZVBB-NEXT: vl1re16.v v10, (a3) +; ZVBB-NEXT: vl1re16.v v11, (a4) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v9, (a2) ; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a1, a0, 3 -; ZVBB-NEXT: sub a0, a1, a0 +; ZVBB-NEXT: slli a0, a0, 3 ; ZVBB-NEXT: add sp, sp, a0 ; ZVBB-NEXT: addi sp, sp, 16 ; ZVBB-NEXT: ret - %res = call @llvm.vector.interleave7.nxv7f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res + %res = call @llvm.vector.interleave8.nxv32bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res } -define @vector_interleave_nxv14f64_nxv2f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6) nounwind { -; RV32-LABEL: vector_interleave_nxv14f64_nxv2f64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -80 -; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; RV32-NEXT: addi s0, sp, 80 -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vmv2r.v v26, v20 -; RV32-NEXT: addi a0, sp, 64 -; RV32-NEXT: vmv2r.v v24, v16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 64 -; RV32-NEXT: vmv2r.v v22, v12 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vmv2r.v v20, v8 -; RV32-NEXT: vmv1r.v v1, v20 -; RV32-NEXT: vmv1r.v v3, v22 -; RV32-NEXT: vmv1r.v v5, v24 -; RV32-NEXT: vmv1r.v v7, v26 -; RV32-NEXT: add a3, a0, a2 -; RV32-NEXT: vmv1r.v v2, v10 -; RV32-NEXT: add a4, a1, a2 -; RV32-NEXT: slli a5, a2, 2 -; RV32-NEXT: vmv1r.v v4, v14 -; 
RV32-NEXT: slli a6, a2, 4 -; RV32-NEXT: add a7, a4, a2 -; RV32-NEXT: vmv1r.v v6, v18 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: vmv1r.v v22, v11 -; RV32-NEXT: add a6, a7, a2 -; RV32-NEXT: vmv1r.v v24, v15 -; RV32-NEXT: vsseg7e64.v v1, (a0) -; RV32-NEXT: vmv1r.v v26, v19 -; RV32-NEXT: vsseg7e64.v v21, (a1) -; RV32-NEXT: vl1re64.v v18, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v19, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v20, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v21, (a6) -; RV32-NEXT: add a6, a3, a2 -; RV32-NEXT: vl1re64.v v10, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v11, (a6) -; RV32-NEXT: vl1re64.v v8, (a0) -; RV32-NEXT: vl1re64.v v16, (a4) -; RV32-NEXT: vl1re64.v v9, (a3) -; RV32-NEXT: vl1re64.v v17, (a7) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 14 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 64 -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v12, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: vl1re64.v v13, (a6) -; RV32-NEXT: add a6, a6, a2 -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: add a2, a0, a2 -; RV32-NEXT: vl1re64.v v14, (a6) -; RV32-NEXT: vl1re64.v v15, (a1) -; RV32-NEXT: add a5, a0, a5 -; RV32-NEXT: vs2r.v v20, (a5) -; RV32-NEXT: vs4r.v v16, (a2) -; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: vl8re64.v v16, (a2) -; RV32-NEXT: vl8re64.v v8, (a0) -; RV32-NEXT: addi sp, s0, -80 -; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 80 -; RV32-NEXT: ret +define @vector_interleave_nxv64bf16_nxv8bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { +; CHECK-LABEL: vector_interleave_nxv64bf16_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v26, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a0, a3 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: add a5, a2, a3 +; CHECK-NEXT: vmv1r.v v1, v8 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v2, v22 +; CHECK-NEXT: add a7, a5, a3 +; CHECK-NEXT: vmv1r.v v3, v12 +; CHECK-NEXT: add t0, a6, a3 +; CHECK-NEXT: vmv1r.v v4, v24 +; CHECK-NEXT: add t1, a7, a3 +; CHECK-NEXT: vmv1r.v v5, v16 +; CHECK-NEXT: add t2, t0, a3 +; CHECK-NEXT: vmv1r.v v6, v26 +; CHECK-NEXT: add t3, t1, a3 +; CHECK-NEXT: vmv1r.v v7, v20 +; CHECK-NEXT: add t4, t2, a3 +; CHECK-NEXT: vmv1r.v v8, v28 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: add t5, t3, a3 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: add t6, t4, a3 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: vsseg8e16.v v1, (a0) +; CHECK-NEXT: vmv1r.v v28, v21 +; CHECK-NEXT: vsseg8e16.v v22, (a1) +; CHECK-NEXT: vl1re16.v v14, (t5) +; CHECK-NEXT: add t5, t5, a3 +; CHECK-NEXT: add a3, t6, a3 +; CHECK-NEXT: vl1re16.v v22, (t6) +; CHECK-NEXT: vl1re16.v v15, (t5) +; CHECK-NEXT: vl1re16.v v23, (a3) +; CHECK-NEXT: vl1re16.v v12, (t1) +; CHECK-NEXT: vl1re16.v v20, (t2) +; CHECK-NEXT: vl1re16.v v13, (t3) +; CHECK-NEXT: vl1re16.v v21, (t4) +; CHECK-NEXT: vl1re16.v v10, (a5) +; CHECK-NEXT: vl1re16.v v18, (a6) +; CHECK-NEXT: vl1re16.v v11, (a7) +; CHECK-NEXT: vl1re16.v v19, (t0) +; CHECK-NEXT: vl1re16.v v8, (a0) 
+; CHECK-NEXT: vl1re16.v v16, (a1) +; CHECK-NEXT: vl1re16.v v9, (a2) +; CHECK-NEXT: vl1re16.v v17, (a4) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv8bf16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-NEXT: vmv2r.v v28, v22 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: vmv2r.v v26, v18 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a3, vlenb +; ZVBB-NEXT: add a2, a0, a3 +; ZVBB-NEXT: add a4, a1, a3 +; ZVBB-NEXT: add a5, a2, a3 +; ZVBB-NEXT: vmv1r.v v1, v8 +; ZVBB-NEXT: vmv2r.v v24, v14 +; ZVBB-NEXT: add a6, a4, a3 +; ZVBB-NEXT: vmv2r.v v22, v10 +; ZVBB-NEXT: vmv1r.v v2, v22 +; ZVBB-NEXT: add a7, a5, a3 +; ZVBB-NEXT: vmv1r.v v3, v12 +; ZVBB-NEXT: add t0, a6, a3 +; ZVBB-NEXT: vmv1r.v v4, v24 +; ZVBB-NEXT: add t1, a7, a3 +; ZVBB-NEXT: vmv1r.v v5, v16 +; ZVBB-NEXT: add t2, t0, a3 +; ZVBB-NEXT: vmv1r.v v6, v26 +; ZVBB-NEXT: add t3, t1, a3 +; ZVBB-NEXT: vmv1r.v v7, v20 +; ZVBB-NEXT: add t4, t2, a3 +; ZVBB-NEXT: vmv1r.v v8, v28 +; ZVBB-NEXT: vmv1r.v v22, v9 +; ZVBB-NEXT: add t5, t3, a3 +; ZVBB-NEXT: vmv1r.v v24, v13 +; ZVBB-NEXT: add t6, t4, a3 +; ZVBB-NEXT: vmv1r.v v26, v17 +; ZVBB-NEXT: vsseg8e16.v v1, (a0) +; ZVBB-NEXT: vmv1r.v v28, v21 +; ZVBB-NEXT: vsseg8e16.v v22, (a1) +; ZVBB-NEXT: vl1re16.v v14, (t5) +; ZVBB-NEXT: add t5, t5, a3 +; ZVBB-NEXT: add a3, t6, a3 +; ZVBB-NEXT: vl1re16.v v22, (t6) +; ZVBB-NEXT: vl1re16.v v15, (t5) +; ZVBB-NEXT: vl1re16.v v23, (a3) +; ZVBB-NEXT: vl1re16.v v12, (t1) +; ZVBB-NEXT: vl1re16.v v20, (t2) +; ZVBB-NEXT: vl1re16.v v13, (t3) +; ZVBB-NEXT: vl1re16.v v21, (t4) +; ZVBB-NEXT: vl1re16.v v10, (a5) +; ZVBB-NEXT: vl1re16.v v18, (a6) +; ZVBB-NEXT: vl1re16.v v11, (a7) +; ZVBB-NEXT: vl1re16.v v19, (t0) +; ZVBB-NEXT: vl1re16.v v8, (a0) +; ZVBB-NEXT: vl1re16.v v16, (a1) +; ZVBB-NEXT: vl1re16.v v9, (a2) +; ZVBB-NEXT: vl1re16.v v17, (a4) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv64bf16( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res +} + +define @vector_interleave_nxv8f32_nxv1f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v8) nounwind { +; CHECK-LABEL: vector_interleave_nxv8f32_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: add a6, a5, a2 +; CHECK-NEXT: add a7, a6, a2 +; CHECK-NEXT: vsetvli t0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) +; CHECK-NEXT: add t0, a7, a2 +; CHECK-NEXT: add a2, t0, a2 +; CHECK-NEXT: vle32.v v11, (t0) +; CHECK-NEXT: vle32.v v8, (a2) +; CHECK-NEXT: srli a1, a1, 3 +; CHECK-NEXT: add a2, a1, a1 +; CHECK-NEXT: vle32.v v9, (a7) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a7, zero, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v10, (a6) +; CHECK-NEXT: vle32.v v8, (a5) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v10, v9, a1 +; CHECK-NEXT: vsetvli a5, 
zero, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a4) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a4, zero, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v12, (a3) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; RV64-LABEL: vector_interleave_nxv14f64_nxv2f64: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -80 -; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: addi s0, sp, 80 -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 5 -; RV64-NEXT: sub sp, sp, a0 -; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vmv2r.v v26, v20 -; RV64-NEXT: addi a0, sp, 64 -; RV64-NEXT: vmv2r.v v24, v16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: sub a1, a2, a1 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 64 -; RV64-NEXT: vmv2r.v v22, v12 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: vmv2r.v v20, v8 -; RV64-NEXT: vmv1r.v v1, v20 -; RV64-NEXT: vmv1r.v v3, v22 -; RV64-NEXT: vmv1r.v v5, v24 -; RV64-NEXT: vmv1r.v v7, v26 -; RV64-NEXT: add a3, a0, a2 -; RV64-NEXT: vmv1r.v v2, v10 -; RV64-NEXT: add a4, a1, a2 -; RV64-NEXT: slli a5, a2, 2 -; RV64-NEXT: vmv1r.v v4, v14 -; RV64-NEXT: slli a6, a2, 4 -; RV64-NEXT: add a7, a4, a2 -; RV64-NEXT: vmv1r.v v6, v18 -; RV64-NEXT: sub a5, a6, a5 -; RV64-NEXT: vmv1r.v v22, v11 -; RV64-NEXT: add a6, a7, a2 -; RV64-NEXT: vmv1r.v v24, v15 -; RV64-NEXT: vsseg7e64.v v1, (a0) -; RV64-NEXT: vmv1r.v v26, v19 -; RV64-NEXT: vsseg7e64.v v21, (a1) -; RV64-NEXT: vl1re64.v v18, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v19, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v20, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v21, (a6) -; RV64-NEXT: add a6, a3, a2 -; RV64-NEXT: vl1re64.v v10, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v11, (a6) -; RV64-NEXT: vl1re64.v v8, (a0) -; RV64-NEXT: vl1re64.v v16, (a4) -; RV64-NEXT: vl1re64.v v9, (a3) -; RV64-NEXT: vl1re64.v v17, (a7) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a3, 14 -; RV64-NEXT: mul a0, a0, a3 -; RV64-NEXT: add a0, sp, a0 -; RV64-NEXT: addi a0, a0, 64 -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v12, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: vl1re64.v v13, (a6) -; RV64-NEXT: add a6, a6, a2 -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: add a2, a0, a2 -; RV64-NEXT: vl1re64.v v14, (a6) -; RV64-NEXT: vl1re64.v v15, (a1) -; RV64-NEXT: add a5, a0, a5 -; RV64-NEXT: vs2r.v v20, (a5) -; RV64-NEXT: vs4r.v v16, (a2) -; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: vl8re64.v v16, (a2) -; RV64-NEXT: vl8re64.v v8, (a0) -; RV64-NEXT: addi sp, s0, -80 -; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 80 -; RV64-NEXT: ret +; ZVBB-LABEL: vector_interleave_nxv8f32_nxv1f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a5, a4, a2 +; ZVBB-NEXT: add a6, a5, a2 +; ZVBB-NEXT: add a7, a6, a2 +; ZVBB-NEXT: vsetvli t0, zero, e32, mf2, ta, ma +; ZVBB-NEXT: 
vsseg8e32.v v8, (a0) +; ZVBB-NEXT: add t0, a7, a2 +; ZVBB-NEXT: add a2, t0, a2 +; ZVBB-NEXT: vle32.v v11, (t0) +; ZVBB-NEXT: vle32.v v8, (a2) +; ZVBB-NEXT: srli a1, a1, 3 +; ZVBB-NEXT: add a2, a1, a1 +; ZVBB-NEXT: vle32.v v9, (a7) +; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v11, v8, a1 +; ZVBB-NEXT: vsetvli a7, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vle32.v v10, (a6) +; ZVBB-NEXT: vle32.v v8, (a5) +; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v10, v9, a1 +; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vle32.v v9, (a4) +; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a4, zero, e32, mf2, ta, ma +; ZVBB-NEXT: vle32.v v12, (a3) +; ZVBB-NEXT: vle32.v v8, (a0) +; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v12, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 2 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv8f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v8) + ret %res +} + +define @vector_interleave_nxv16f32_nxv2f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { +; CHECK-LABEL: vector_interleave_nxv16f32_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: add a5, a4, a1 +; CHECK-NEXT: add a6, a5, a1 +; CHECK-NEXT: add a7, a6, a1 +; CHECK-NEXT: vsetvli t0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg8e32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v14, (a7) +; CHECK-NEXT: add a1, a7, a1 +; CHECK-NEXT: vl1re32.v v15, (a1) +; CHECK-NEXT: vl1re32.v v12, (a5) +; CHECK-NEXT: vl1re32.v v13, (a6) +; CHECK-NEXT: vl1re32.v v10, (a3) +; CHECK-NEXT: vl1re32.v v11, (a4) +; CHECK-NEXT: vl1re32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-RV32-LABEL: vector_interleave_nxv14f64_nxv2f64: -; ZVBB-RV32: # %bb.0: -; ZVBB-RV32-NEXT: addi sp, sp, -80 -; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill -; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill -; ZVBB-RV32-NEXT: addi s0, sp, 80 -; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: slli a0, a0, 5 -; ZVBB-RV32-NEXT: sub sp, sp, a0 -; ZVBB-RV32-NEXT: andi sp, sp, -64 -; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; ZVBB-RV32-NEXT: vmv2r.v v26, v20 -; ZVBB-RV32-NEXT: addi a0, sp, 64 -; ZVBB-RV32-NEXT: vmv2r.v v24, v16 -; ZVBB-RV32-NEXT: csrr a1, vlenb -; ZVBB-RV32-NEXT: slli a2, a1, 3 -; ZVBB-RV32-NEXT: sub a1, a2, a1 -; ZVBB-RV32-NEXT: add a1, sp, a1 -; ZVBB-RV32-NEXT: addi a1, a1, 64 -; ZVBB-RV32-NEXT: vmv2r.v v22, v12 -; ZVBB-RV32-NEXT: csrr a2, vlenb -; ZVBB-RV32-NEXT: vmv2r.v v20, v8 -; ZVBB-RV32-NEXT: vmv1r.v v1, v20 -; ZVBB-RV32-NEXT: vmv1r.v v3, v22 -; ZVBB-RV32-NEXT: vmv1r.v v5, v24 -; ZVBB-RV32-NEXT: vmv1r.v v7, v26 -; ZVBB-RV32-NEXT: add a3, a0, a2 -; ZVBB-RV32-NEXT: vmv1r.v v2, v10 -; ZVBB-RV32-NEXT: add a4, a1, a2 -; ZVBB-RV32-NEXT: slli a5, a2, 2 -; ZVBB-RV32-NEXT: vmv1r.v v4, v14 -; ZVBB-RV32-NEXT: slli a6, a2, 4 -; ZVBB-RV32-NEXT: add a7, a4, a2 -; ZVBB-RV32-NEXT: vmv1r.v v6, v18 -; ZVBB-RV32-NEXT: sub a5, a6, a5 -; ZVBB-RV32-NEXT: vmv1r.v v22, v11 -; ZVBB-RV32-NEXT: add a6, a7, a2 -; ZVBB-RV32-NEXT: 
vmv1r.v v24, v15 -; ZVBB-RV32-NEXT: vsseg7e64.v v1, (a0) -; ZVBB-RV32-NEXT: vmv1r.v v26, v19 -; ZVBB-RV32-NEXT: vsseg7e64.v v21, (a1) -; ZVBB-RV32-NEXT: vl1re64.v v18, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v19, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v20, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v21, (a6) -; ZVBB-RV32-NEXT: add a6, a3, a2 -; ZVBB-RV32-NEXT: vl1re64.v v10, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v11, (a6) -; ZVBB-RV32-NEXT: vl1re64.v v8, (a0) -; ZVBB-RV32-NEXT: vl1re64.v v16, (a4) -; ZVBB-RV32-NEXT: vl1re64.v v9, (a3) -; ZVBB-RV32-NEXT: vl1re64.v v17, (a7) -; ZVBB-RV32-NEXT: csrr a0, vlenb -; ZVBB-RV32-NEXT: li a3, 14 -; ZVBB-RV32-NEXT: mul a0, a0, a3 -; ZVBB-RV32-NEXT: add a0, sp, a0 -; ZVBB-RV32-NEXT: addi a0, a0, 64 -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v12, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: vl1re64.v v13, (a6) -; ZVBB-RV32-NEXT: add a6, a6, a2 -; ZVBB-RV32-NEXT: slli a2, a2, 3 -; ZVBB-RV32-NEXT: add a2, a0, a2 -; ZVBB-RV32-NEXT: vl1re64.v v14, (a6) -; ZVBB-RV32-NEXT: vl1re64.v v15, (a1) -; ZVBB-RV32-NEXT: add a5, a0, a5 -; ZVBB-RV32-NEXT: vs2r.v v20, (a5) -; ZVBB-RV32-NEXT: vs4r.v v16, (a2) -; ZVBB-RV32-NEXT: vs8r.v v8, (a0) -; ZVBB-RV32-NEXT: vl8re64.v v16, (a2) -; ZVBB-RV32-NEXT: vl8re64.v v8, (a0) -; ZVBB-RV32-NEXT: addi sp, s0, -80 -; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload -; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload -; ZVBB-RV32-NEXT: addi sp, sp, 80 -; ZVBB-RV32-NEXT: ret +; ZVBB-LABEL: vector_interleave_nxv16f32_nxv2f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: add a5, a4, a1 +; ZVBB-NEXT: add a6, a5, a1 +; ZVBB-NEXT: add a7, a6, a1 +; ZVBB-NEXT: vsetvli t0, zero, e32, m1, ta, ma +; ZVBB-NEXT: vsseg8e32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v14, (a7) +; ZVBB-NEXT: add a1, a7, a1 +; ZVBB-NEXT: vl1re32.v v15, (a1) +; ZVBB-NEXT: vl1re32.v v12, (a5) +; ZVBB-NEXT: vl1re32.v v13, (a6) +; ZVBB-NEXT: vl1re32.v v10, (a3) +; ZVBB-NEXT: vl1re32.v v11, (a4) +; ZVBB-NEXT: vl1re32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv16f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res +} + +define @vector_interleave_nxv32f32_nxv4f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { +; CHECK-LABEL: vector_interleave_nxv32f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v26, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a0, a3 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: add a5, a2, a3 +; CHECK-NEXT: vmv1r.v v1, v8 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v2, v22 +; CHECK-NEXT: add a7, a5, a3 +; CHECK-NEXT: vmv1r.v v3, v12 +; CHECK-NEXT: 
add t0, a6, a3 +; CHECK-NEXT: vmv1r.v v4, v24 +; CHECK-NEXT: add t1, a7, a3 +; CHECK-NEXT: vmv1r.v v5, v16 +; CHECK-NEXT: add t2, t0, a3 +; CHECK-NEXT: vmv1r.v v6, v26 +; CHECK-NEXT: add t3, t1, a3 +; CHECK-NEXT: vmv1r.v v7, v20 +; CHECK-NEXT: add t4, t2, a3 +; CHECK-NEXT: vmv1r.v v8, v28 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: add t5, t3, a3 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: add t6, t4, a3 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: vsseg8e32.v v1, (a0) +; CHECK-NEXT: vmv1r.v v28, v21 +; CHECK-NEXT: vsseg8e32.v v22, (a1) +; CHECK-NEXT: vl1re32.v v14, (t5) +; CHECK-NEXT: add t5, t5, a3 +; CHECK-NEXT: add a3, t6, a3 +; CHECK-NEXT: vl1re32.v v22, (t6) +; CHECK-NEXT: vl1re32.v v15, (t5) +; CHECK-NEXT: vl1re32.v v23, (a3) +; CHECK-NEXT: vl1re32.v v12, (t1) +; CHECK-NEXT: vl1re32.v v20, (t2) +; CHECK-NEXT: vl1re32.v v13, (t3) +; CHECK-NEXT: vl1re32.v v21, (t4) +; CHECK-NEXT: vl1re32.v v10, (a5) +; CHECK-NEXT: vl1re32.v v18, (a6) +; CHECK-NEXT: vl1re32.v v11, (a7) +; CHECK-NEXT: vl1re32.v v19, (t0) +; CHECK-NEXT: vl1re32.v v8, (a0) +; CHECK-NEXT: vl1re32.v v16, (a1) +; CHECK-NEXT: vl1re32.v v9, (a2) +; CHECK-NEXT: vl1re32.v v17, (a4) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZVBB-RV64-LABEL: vector_interleave_nxv14f64_nxv2f64: -; ZVBB-RV64: # %bb.0: -; ZVBB-RV64-NEXT: addi sp, sp, -80 -; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZVBB-RV64-NEXT: addi s0, sp, 80 -; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: slli a0, a0, 5 -; ZVBB-RV64-NEXT: sub sp, sp, a0 -; ZVBB-RV64-NEXT: andi sp, sp, -64 -; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; ZVBB-RV64-NEXT: vmv2r.v v26, v20 -; ZVBB-RV64-NEXT: addi a0, sp, 64 -; ZVBB-RV64-NEXT: vmv2r.v v24, v16 -; ZVBB-RV64-NEXT: csrr a1, vlenb -; ZVBB-RV64-NEXT: slli a2, a1, 3 -; ZVBB-RV64-NEXT: sub a1, a2, a1 -; ZVBB-RV64-NEXT: add a1, sp, a1 -; ZVBB-RV64-NEXT: addi a1, a1, 64 -; ZVBB-RV64-NEXT: vmv2r.v v22, v12 -; ZVBB-RV64-NEXT: csrr a2, vlenb -; ZVBB-RV64-NEXT: vmv2r.v v20, v8 -; ZVBB-RV64-NEXT: vmv1r.v v1, v20 -; ZVBB-RV64-NEXT: vmv1r.v v3, v22 -; ZVBB-RV64-NEXT: vmv1r.v v5, v24 -; ZVBB-RV64-NEXT: vmv1r.v v7, v26 -; ZVBB-RV64-NEXT: add a3, a0, a2 -; ZVBB-RV64-NEXT: vmv1r.v v2, v10 -; ZVBB-RV64-NEXT: add a4, a1, a2 -; ZVBB-RV64-NEXT: slli a5, a2, 2 -; ZVBB-RV64-NEXT: vmv1r.v v4, v14 -; ZVBB-RV64-NEXT: slli a6, a2, 4 -; ZVBB-RV64-NEXT: add a7, a4, a2 -; ZVBB-RV64-NEXT: vmv1r.v v6, v18 -; ZVBB-RV64-NEXT: sub a5, a6, a5 -; ZVBB-RV64-NEXT: vmv1r.v v22, v11 -; ZVBB-RV64-NEXT: add a6, a7, a2 -; ZVBB-RV64-NEXT: vmv1r.v v24, v15 -; ZVBB-RV64-NEXT: vsseg7e64.v v1, (a0) -; ZVBB-RV64-NEXT: vmv1r.v v26, v19 -; ZVBB-RV64-NEXT: vsseg7e64.v v21, (a1) -; ZVBB-RV64-NEXT: vl1re64.v v18, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v19, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v20, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v21, (a6) -; ZVBB-RV64-NEXT: add a6, a3, a2 -; ZVBB-RV64-NEXT: vl1re64.v v10, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v11, (a6) -; ZVBB-RV64-NEXT: vl1re64.v v8, (a0) -; ZVBB-RV64-NEXT: vl1re64.v v16, (a4) -; ZVBB-RV64-NEXT: vl1re64.v v9, (a3) -; ZVBB-RV64-NEXT: vl1re64.v v17, (a7) -; ZVBB-RV64-NEXT: csrr a0, vlenb -; ZVBB-RV64-NEXT: li a3, 14 -; ZVBB-RV64-NEXT: mul a0, a0, a3 -; ZVBB-RV64-NEXT: add a0, sp, a0 -; ZVBB-RV64-NEXT: addi a0, a0, 64 -; 
ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v12, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: vl1re64.v v13, (a6) -; ZVBB-RV64-NEXT: add a6, a6, a2 -; ZVBB-RV64-NEXT: slli a2, a2, 3 -; ZVBB-RV64-NEXT: add a2, a0, a2 -; ZVBB-RV64-NEXT: vl1re64.v v14, (a6) -; ZVBB-RV64-NEXT: vl1re64.v v15, (a1) -; ZVBB-RV64-NEXT: add a5, a0, a5 -; ZVBB-RV64-NEXT: vs2r.v v20, (a5) -; ZVBB-RV64-NEXT: vs4r.v v16, (a2) -; ZVBB-RV64-NEXT: vs8r.v v8, (a0) -; ZVBB-RV64-NEXT: vl8re64.v v16, (a2) -; ZVBB-RV64-NEXT: vl8re64.v v8, (a0) -; ZVBB-RV64-NEXT: addi sp, s0, -80 -; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZVBB-RV64-NEXT: addi sp, sp, 80 -; ZVBB-RV64-NEXT: ret +; ZVBB-LABEL: vector_interleave_nxv32f32_nxv4f32: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; ZVBB-NEXT: vmv2r.v v28, v22 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: vmv2r.v v26, v18 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a3, vlenb +; ZVBB-NEXT: add a2, a0, a3 +; ZVBB-NEXT: add a4, a1, a3 +; ZVBB-NEXT: add a5, a2, a3 +; ZVBB-NEXT: vmv1r.v v1, v8 +; ZVBB-NEXT: vmv2r.v v24, v14 +; ZVBB-NEXT: add a6, a4, a3 +; ZVBB-NEXT: vmv2r.v v22, v10 +; ZVBB-NEXT: vmv1r.v v2, v22 +; ZVBB-NEXT: add a7, a5, a3 +; ZVBB-NEXT: vmv1r.v v3, v12 +; ZVBB-NEXT: add t0, a6, a3 +; ZVBB-NEXT: vmv1r.v v4, v24 +; ZVBB-NEXT: add t1, a7, a3 +; ZVBB-NEXT: vmv1r.v v5, v16 +; ZVBB-NEXT: add t2, t0, a3 +; ZVBB-NEXT: vmv1r.v v6, v26 +; ZVBB-NEXT: add t3, t1, a3 +; ZVBB-NEXT: vmv1r.v v7, v20 +; ZVBB-NEXT: add t4, t2, a3 +; ZVBB-NEXT: vmv1r.v v8, v28 +; ZVBB-NEXT: vmv1r.v v22, v9 +; ZVBB-NEXT: add t5, t3, a3 +; ZVBB-NEXT: vmv1r.v v24, v13 +; ZVBB-NEXT: add t6, t4, a3 +; ZVBB-NEXT: vmv1r.v v26, v17 +; ZVBB-NEXT: vsseg8e32.v v1, (a0) +; ZVBB-NEXT: vmv1r.v v28, v21 +; ZVBB-NEXT: vsseg8e32.v v22, (a1) +; ZVBB-NEXT: vl1re32.v v14, (t5) +; ZVBB-NEXT: add t5, t5, a3 +; ZVBB-NEXT: add a3, t6, a3 +; ZVBB-NEXT: vl1re32.v v22, (t6) +; ZVBB-NEXT: vl1re32.v v15, (t5) +; ZVBB-NEXT: vl1re32.v v23, (a3) +; ZVBB-NEXT: vl1re32.v v12, (t1) +; ZVBB-NEXT: vl1re32.v v20, (t2) +; ZVBB-NEXT: vl1re32.v v13, (t3) +; ZVBB-NEXT: vl1re32.v v21, (t4) +; ZVBB-NEXT: vl1re32.v v10, (a5) +; ZVBB-NEXT: vl1re32.v v18, (a6) +; ZVBB-NEXT: vl1re32.v v11, (a7) +; ZVBB-NEXT: vl1re32.v v19, (t0) +; ZVBB-NEXT: vl1re32.v v8, (a0) +; ZVBB-NEXT: vl1re32.v v16, (a1) +; ZVBB-NEXT: vl1re32.v v9, (a2) +; ZVBB-NEXT: vl1re32.v v17, (a4) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv32f32( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res +} + +define @vector_interleave_nxv8f64_nxv1f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v8) nounwind { +; CHECK-LABEL: vector_interleave_nxv8f64_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a4, a3, a1 +; CHECK-NEXT: add a5, a4, a1 +; CHECK-NEXT: add a6, a5, a1 +; CHECK-NEXT: add a7, a6, a1 +; CHECK-NEXT: vsetvli t0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsseg8e64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v14, (a7) +; 
CHECK-NEXT: add a1, a7, a1 +; CHECK-NEXT: vl1re64.v v15, (a1) +; CHECK-NEXT: vl1re64.v v12, (a5) +; CHECK-NEXT: vl1re64.v v13, (a6) +; CHECK-NEXT: vl1re64.v v10, (a3) +; CHECK-NEXT: vl1re64.v v11, (a4) +; CHECK-NEXT: vl1re64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v9, (a2) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret ; -; ZIP-LABEL: vector_interleave_nxv14f64_nxv2f64: -; ZIP: # %bb.0: -; ZIP-NEXT: addi sp, sp, -80 -; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill -; ZIP-NEXT: addi s0, sp, 80 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: slli a0, a0, 5 -; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: andi sp, sp, -64 -; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; ZIP-NEXT: vmv2r.v v26, v20 -; ZIP-NEXT: addi a0, sp, 64 -; ZIP-NEXT: vmv2r.v v24, v16 -; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: slli a2, a1, 3 -; ZIP-NEXT: sub a1, a2, a1 -; ZIP-NEXT: add a1, sp, a1 -; ZIP-NEXT: addi a1, a1, 64 -; ZIP-NEXT: vmv2r.v v22, v12 -; ZIP-NEXT: csrr a2, vlenb -; ZIP-NEXT: vmv2r.v v20, v8 -; ZIP-NEXT: vmv1r.v v1, v20 -; ZIP-NEXT: vmv1r.v v3, v22 -; ZIP-NEXT: vmv1r.v v5, v24 -; ZIP-NEXT: vmv1r.v v7, v26 -; ZIP-NEXT: add a3, a0, a2 -; ZIP-NEXT: vmv1r.v v2, v10 -; ZIP-NEXT: add a4, a1, a2 -; ZIP-NEXT: slli a5, a2, 2 -; ZIP-NEXT: vmv1r.v v4, v14 -; ZIP-NEXT: slli a6, a2, 4 -; ZIP-NEXT: add a7, a4, a2 -; ZIP-NEXT: vmv1r.v v6, v18 -; ZIP-NEXT: sub a5, a6, a5 -; ZIP-NEXT: vmv1r.v v22, v11 -; ZIP-NEXT: add a6, a7, a2 -; ZIP-NEXT: vmv1r.v v24, v15 -; ZIP-NEXT: vsseg7e64.v v1, (a0) -; ZIP-NEXT: vmv1r.v v26, v19 -; ZIP-NEXT: vsseg7e64.v v21, (a1) -; ZIP-NEXT: vl1re64.v v18, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v19, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v20, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v21, (a6) -; ZIP-NEXT: add a6, a3, a2 -; ZIP-NEXT: vl1re64.v v10, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v11, (a6) -; ZIP-NEXT: vl1re64.v v8, (a0) -; ZIP-NEXT: vl1re64.v v16, (a4) -; ZIP-NEXT: vl1re64.v v9, (a3) -; ZIP-NEXT: vl1re64.v v17, (a7) -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: li a3, 14 -; ZIP-NEXT: mul a0, a0, a3 -; ZIP-NEXT: add a0, sp, a0 -; ZIP-NEXT: addi a0, a0, 64 -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v12, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: vl1re64.v v13, (a6) -; ZIP-NEXT: add a6, a6, a2 -; ZIP-NEXT: slli a2, a2, 3 -; ZIP-NEXT: add a2, a0, a2 -; ZIP-NEXT: vl1re64.v v14, (a6) -; ZIP-NEXT: vl1re64.v v15, (a1) -; ZIP-NEXT: add a5, a0, a5 -; ZIP-NEXT: vs2r.v v20, (a5) -; ZIP-NEXT: vs4r.v v16, (a2) -; ZIP-NEXT: vs8r.v v8, (a0) -; ZIP-NEXT: vl8re64.v v16, (a2) -; ZIP-NEXT: vl8re64.v v8, (a0) -; ZIP-NEXT: addi sp, s0, -80 -; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload -; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload -; ZIP-NEXT: addi sp, sp, 80 -; ZIP-NEXT: ret - %res = call @llvm.vector.interleave7.nxv14f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6) - ret %res +; ZVBB-LABEL: vector_interleave_nxv8f64_nxv1f64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a4, a3, a1 +; ZVBB-NEXT: add a5, a4, a1 +; ZVBB-NEXT: add a6, a5, a1 +; ZVBB-NEXT: add a7, a6, a1 +; ZVBB-NEXT: vsetvli t0, zero, e64, m1, ta, ma +; ZVBB-NEXT: vsseg8e64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v14, (a7) +; ZVBB-NEXT: add 
a1, a7, a1 +; ZVBB-NEXT: vl1re64.v v15, (a1) +; ZVBB-NEXT: vl1re64.v v12, (a5) +; ZVBB-NEXT: vl1re64.v v13, (a6) +; ZVBB-NEXT: vl1re64.v v10, (a3) +; ZVBB-NEXT: vl1re64.v v11, (a4) +; ZVBB-NEXT: vl1re64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v9, (a2) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 3 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv8f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v8) + ret %res +} + +define @vector_interleave_nxv16f64_nxv2f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) nounwind { +; CHECK-LABEL: vector_interleave_nxv16f64_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vmv2r.v v28, v22 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v26, v18 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a0, a3 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: add a5, a2, a3 +; CHECK-NEXT: vmv1r.v v1, v8 +; CHECK-NEXT: vmv2r.v v24, v14 +; CHECK-NEXT: add a6, a4, a3 +; CHECK-NEXT: vmv2r.v v22, v10 +; CHECK-NEXT: vmv1r.v v2, v22 +; CHECK-NEXT: add a7, a5, a3 +; CHECK-NEXT: vmv1r.v v3, v12 +; CHECK-NEXT: add t0, a6, a3 +; CHECK-NEXT: vmv1r.v v4, v24 +; CHECK-NEXT: add t1, a7, a3 +; CHECK-NEXT: vmv1r.v v5, v16 +; CHECK-NEXT: add t2, t0, a3 +; CHECK-NEXT: vmv1r.v v6, v26 +; CHECK-NEXT: add t3, t1, a3 +; CHECK-NEXT: vmv1r.v v7, v20 +; CHECK-NEXT: add t4, t2, a3 +; CHECK-NEXT: vmv1r.v v8, v28 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: add t5, t3, a3 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: add t6, t4, a3 +; CHECK-NEXT: vmv1r.v v26, v17 +; CHECK-NEXT: vsseg8e64.v v1, (a0) +; CHECK-NEXT: vmv1r.v v28, v21 +; CHECK-NEXT: vsseg8e64.v v22, (a1) +; CHECK-NEXT: vl1re64.v v14, (t5) +; CHECK-NEXT: add t5, t5, a3 +; CHECK-NEXT: add a3, t6, a3 +; CHECK-NEXT: vl1re64.v v22, (t6) +; CHECK-NEXT: vl1re64.v v15, (t5) +; CHECK-NEXT: vl1re64.v v23, (a3) +; CHECK-NEXT: vl1re64.v v12, (t1) +; CHECK-NEXT: vl1re64.v v20, (t2) +; CHECK-NEXT: vl1re64.v v13, (t3) +; CHECK-NEXT: vl1re64.v v21, (t4) +; CHECK-NEXT: vl1re64.v v10, (a5) +; CHECK-NEXT: vl1re64.v v18, (a6) +; CHECK-NEXT: vl1re64.v v11, (a7) +; CHECK-NEXT: vl1re64.v v19, (t0) +; CHECK-NEXT: vl1re64.v v8, (a0) +; CHECK-NEXT: vl1re64.v v16, (a1) +; CHECK-NEXT: vl1re64.v v9, (a2) +; CHECK-NEXT: vl1re64.v v17, (a4) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: vector_interleave_nxv16f64_nxv2f64: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; ZVBB-NEXT: vmv2r.v v28, v22 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: vmv2r.v v26, v18 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: slli a1, a1, 3 +; ZVBB-NEXT: add a1, sp, a1 +; ZVBB-NEXT: addi a1, a1, 16 +; ZVBB-NEXT: csrr a3, vlenb +; ZVBB-NEXT: add a2, a0, a3 +; ZVBB-NEXT: add a4, a1, a3 +; ZVBB-NEXT: add a5, a2, a3 +; ZVBB-NEXT: vmv1r.v v1, v8 +; ZVBB-NEXT: vmv2r.v v24, v14 +; ZVBB-NEXT: add a6, a4, a3 +; ZVBB-NEXT: vmv2r.v v22, v10 +; ZVBB-NEXT: vmv1r.v v2, v22 +; ZVBB-NEXT: add a7, a5, a3 +; ZVBB-NEXT: vmv1r.v v3, v12 +; ZVBB-NEXT: add t0, a6, a3 +; ZVBB-NEXT: vmv1r.v v4, v24 +; ZVBB-NEXT: add t1, 
a7, a3 +; ZVBB-NEXT: vmv1r.v v5, v16 +; ZVBB-NEXT: add t2, t0, a3 +; ZVBB-NEXT: vmv1r.v v6, v26 +; ZVBB-NEXT: add t3, t1, a3 +; ZVBB-NEXT: vmv1r.v v7, v20 +; ZVBB-NEXT: add t4, t2, a3 +; ZVBB-NEXT: vmv1r.v v8, v28 +; ZVBB-NEXT: vmv1r.v v22, v9 +; ZVBB-NEXT: add t5, t3, a3 +; ZVBB-NEXT: vmv1r.v v24, v13 +; ZVBB-NEXT: add t6, t4, a3 +; ZVBB-NEXT: vmv1r.v v26, v17 +; ZVBB-NEXT: vsseg8e64.v v1, (a0) +; ZVBB-NEXT: vmv1r.v v28, v21 +; ZVBB-NEXT: vsseg8e64.v v22, (a1) +; ZVBB-NEXT: vl1re64.v v14, (t5) +; ZVBB-NEXT: add t5, t5, a3 +; ZVBB-NEXT: add a3, t6, a3 +; ZVBB-NEXT: vl1re64.v v22, (t6) +; ZVBB-NEXT: vl1re64.v v15, (t5) +; ZVBB-NEXT: vl1re64.v v23, (a3) +; ZVBB-NEXT: vl1re64.v v12, (t1) +; ZVBB-NEXT: vl1re64.v v20, (t2) +; ZVBB-NEXT: vl1re64.v v13, (t3) +; ZVBB-NEXT: vl1re64.v v21, (t4) +; ZVBB-NEXT: vl1re64.v v10, (a5) +; ZVBB-NEXT: vl1re64.v v18, (a6) +; ZVBB-NEXT: vl1re64.v v11, (a7) +; ZVBB-NEXT: vl1re64.v v19, (t0) +; ZVBB-NEXT: vl1re64.v v8, (a0) +; ZVBB-NEXT: vl1re64.v v16, (a1) +; ZVBB-NEXT: vl1re64.v v9, (a2) +; ZVBB-NEXT: vl1re64.v v17, (a4) +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 4 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: ret + %res = call @llvm.vector.interleave8.nxv16f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) + ret %res }
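; ---------------------------------------------------------------------------
; Illustrative sketch only, not part of the patch above: the interleave8 call
; used by these tests, written out with its scalable vector types spelled in
; full. The shapes mirror the last test (an nxv16f64 result built from eight
; nxv2f64 operands); the function name @interleave8_example is hypothetical.

declare <vscale x 16 x double> @llvm.vector.interleave8.nxv16f64(
    <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
    <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
    <vscale x 2 x double>, <vscale x 2 x double>)

define <vscale x 16 x double> @interleave8_example(
    <vscale x 2 x double> %v0, <vscale x 2 x double> %v1,
    <vscale x 2 x double> %v2, <vscale x 2 x double> %v3,
    <vscale x 2 x double> %v4, <vscale x 2 x double> %v5,
    <vscale x 2 x double> %v6, <vscale x 2 x double> %v7) {
  ; Lane i of the result comes from operand (i mod 8), element (i div 8),
  ; i.e. v0[0], v1[0], ..., v7[0], v0[1], v1[1], ...
  %res = call <vscale x 16 x double> @llvm.vector.interleave8.nxv16f64(
      <vscale x 2 x double> %v0, <vscale x 2 x double> %v1,
      <vscale x 2 x double> %v2, <vscale x 2 x double> %v3,
      <vscale x 2 x double> %v4, <vscale x 2 x double> %v5,
      <vscale x 2 x double> %v6, <vscale x 2 x double> %v7)
  ret <vscale x 16 x double> %res
}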