diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6e8e4ac1c6a95..74686d9fc4188 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5110,9 +5110,23 @@ static SDValue lowerVZIP(unsigned Opc, SDValue Op0, SDValue Op1,
     Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget);
   }
 
-  auto [Mask, VL] = getDefaultVLOps(IntVT, ContainerVT, DL, DAG, Subtarget);
-  SDValue Passthru = DAG.getUNDEF(ContainerVT);
-  SDValue Res = DAG.getNode(Opc, DL, ContainerVT, Op0, Op1, Passthru, Mask, VL);
+  MVT InnerVT = ContainerVT;
+  auto [Mask, VL] = getDefaultVLOps(IntVT, InnerVT, DL, DAG, Subtarget);
+  if (Op1.isUndef() && ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
+      (RISCVISD::RI_VUNZIP2A_VL == Opc || RISCVISD::RI_VUNZIP2B_VL == Opc)) {
+    InnerVT = ContainerVT.getHalfNumVectorElementsVT();
+    VL = DAG.getConstant(VT.getVectorNumElements() / 2, DL,
+                         Subtarget.getXLenVT());
+    Mask = getAllOnesMask(InnerVT, VL, DL, DAG);
+    unsigned HighIdx = InnerVT.getVectorElementCount().getKnownMinValue();
+    Op1 = DAG.getExtractSubvector(DL, InnerVT, Op0, HighIdx);
+    Op0 = DAG.getExtractSubvector(DL, InnerVT, Op0, 0);
+  }
+
+  SDValue Passthru = DAG.getUNDEF(InnerVT);
+  SDValue Res = DAG.getNode(Opc, DL, InnerVT, Op0, Op1, Passthru, Mask, VL);
+  if (InnerVT.bitsLT(ContainerVT))
+    Res = DAG.getInsertSubvector(DL, DAG.getUNDEF(ContainerVT), Res, 0);
   if (IntVT.isFixedLengthVector())
     Res = convertFromScalableVector(IntVT, Res, DAG, Subtarget);
   Res = DAG.getBitcast(VT, Res);
@@ -5808,6 +5822,25 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     }
   }
 
+  // If this is a deinterleave(2), try using vunzip{a,b}. This mostly catches
+  // e64 which can't match above.
+  unsigned Index = 0;
+  if (Subtarget.hasVendorXRivosVizip() &&
+      ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 2, Index) &&
+      1 < count_if(Mask, [](int Idx) { return Idx != -1; })) {
+    unsigned Opc =
+        Index == 0 ? RISCVISD::RI_VUNZIP2A_VL : RISCVISD::RI_VUNZIP2B_VL;
+    if (V2.isUndef())
+      return lowerVZIP(Opc, V1, V2, DL, DAG, Subtarget);
+    if (SDValue Src = foldConcatVector(V1, V2)) {
+      EVT NewVT = VT.getDoubleNumVectorElementsVT();
+      Src = DAG.getExtractSubvector(DL, NewVT, Src, 0);
+      SDValue Res =
+          lowerVZIP(Opc, Src, DAG.getUNDEF(NewVT), DL, DAG, Subtarget);
+      return DAG.getExtractSubvector(DL, VT, Res, 0);
+    }
+  }
+
   if (SDValue V = lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask,
                                                 Subtarget, DAG))
     return V;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
index c65d7c36a2198..b692a80159288 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll
@@ -347,9 +347,8 @@ define void @vnsrl_0_i64(ptr %in, ptr %out) {
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; ZIP-NEXT:    vle64.v v8, (a0)
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    vslideup.vi v8, v9, 1
-; ZIP-NEXT:    vse64.v v8, (a1)
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT:    vse64.v v10, (a1)
 ; ZIP-NEXT:    ret
 entry:
   %0 = load <4 x i64>, ptr %in, align 8
@@ -383,8 +382,7 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) {
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; ZIP-NEXT:    vle64.v v8, (a0)
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    ri.vzipodd.vv v10, v8, v9
+; ZIP-NEXT:    ri.vunzip2b.vv v10, v8, v9
 ; ZIP-NEXT:    vse64.v v10, (a1)
 ; ZIP-NEXT:    ret
 entry:
@@ -417,10 +415,9 @@ define void @vnsrl_0_double(ptr %in, ptr %out) {
 ; ZIP:       # %bb.0: # %entry
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; ZIP-NEXT:    vle64.v v8, (a0)
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    vslideup.vi v8, v9, 1
-; ZIP-NEXT:    vse64.v v8, (a1)
+; ZIP-NEXT:    vse64.v v10, (a1)
 ; ZIP-NEXT:    ret
 entry:
   %0 = load <4 x double>, ptr %in, align 8
@@ -453,9 +450,8 @@ define void @vnsrl_64_double(ptr %in, ptr %out) {
 ; ZIP:       # %bb.0: # %entry
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
 ; ZIP-NEXT:    vle64.v v8, (a0)
+; ZIP-NEXT:    ri.vunzip2b.vv v10, v8, v9
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    ri.vzipodd.vv v10, v8, v9
 ; ZIP-NEXT:    vse64.v v10, (a1)
 ; ZIP-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index aab2f08277831..ca7f2563e4fc9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -85,11 +85,11 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
 ;
 ; ZIP-LABEL: vector_deinterleave_v2i64_v4i64:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v10, v8, 2
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    ri.vzipodd.vv v9, v8, v10
-; ZIP-NEXT:    vslideup.vi v8, v10, 1
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT:    ri.vunzip2b.vv v11, v8, v9
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    vmv.v.v v9, v11
 ; ZIP-NEXT:    ret
   %retval = call {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64> %vec)
   ret {<2 x i64>, <2 x i64>} %retval
@@ -129,62 +129,51 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) {
 ; ZIP-LABEL: vector_deinterleave_v4i64_v8i64:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v12, v8, 1
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v0, 2
-; ZIP-NEXT:    vmv.v.i v14, 12
-; ZIP-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; ZIP-NEXT:    vslidedown.vi v16, v8, 4
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v10, v8, 2
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    vslidedown.vi v12, v8, 2, v0.t
-; ZIP-NEXT:    ri.vzip2a.vv v18, v8, v10
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v8, v16, 2
-; ZIP-NEXT:    vmv1r.v v0, v14
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    ri.vzip2a.vv v12, v16, v8, v0.t
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v0, 8
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    vslideup.vi v8, v16, 2
-; ZIP-NEXT:    vslideup.vi v8, v16, 1, v0.t
-; ZIP-NEXT:    vmv1r.v v0, v14
-; ZIP-NEXT:    vmerge.vvm v8, v18, v8, v0
-; ZIP-NEXT:    vmv2r.v v10, v12
+; ZIP-NEXT:    ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT:    ri.vunzip2b.vv v14, v8, v10
+; ZIP-NEXT:    vmv.v.v v8, v12
+; ZIP-NEXT:    vmv.v.v v10, v14
 ; ZIP-NEXT:    ret
   %retval = call {<4 x i64>, <4 x i64>} @llvm.vector.deinterleave2.v8i64(<8 x i64> %vec)
   ret {<4 x i64>, <4 x i64>} %retval
 }
 
 define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_v8i64_v16i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, -16
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v8, 8
-; CHECK-NEXT:    vmv.s.x v12, a0
-; CHECK-NEXT:    li a0, 170
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v20, v16, v16
-; CHECK-NEXT:    vmv.s.x v21, a0
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vcompress.vm v16, v8, v12
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v22, v20, -8
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vcompress.vm v12, v8, v21
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v8, v20, -7
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v16, v24, v22, v0.t
-; CHECK-NEXT:    vrgatherei16.vv v12, v24, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v16
-; CHECK-NEXT:    ret
+; V-LABEL: vector_deinterleave_v8i64_v16i64:
+; V:       # %bb.0:
+; V-NEXT:    li a0, 85
+; V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; V-NEXT:    vmv.v.i v0, -16
+; V-NEXT:    vid.v v16
+; V-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
+; V-NEXT:    vslidedown.vi v24, v8, 8
+; V-NEXT:    vmv.s.x v12, a0
+; V-NEXT:    li a0, 170
+; V-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; V-NEXT:    vadd.vv v20, v16, v16
+; V-NEXT:    vmv.s.x v21, a0
+; V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT:    vcompress.vm v16, v8, v12
+; V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; V-NEXT:    vadd.vi v22, v20, -8
+; V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT:    vcompress.vm v12, v8, v21
+; V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; V-NEXT:    vadd.vi v8, v20, -7
+; V-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; V-NEXT:    vrgatherei16.vv v16, v24, v22, v0.t
+; V-NEXT:    vrgatherei16.vv v12, v24, v8, v0.t
+; V-NEXT:    vmv.v.v v8, v16
+; V-NEXT:    ret
+;
+; ZIP-LABEL: vector_deinterleave_v8i64_v16i64:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; ZIP-NEXT:    ri.vunzip2a.vv v16, v8, v12
+; ZIP-NEXT:    ri.vunzip2b.vv v20, v8, v12
+; ZIP-NEXT:    vmv.v.v v8, v16
+; ZIP-NEXT:    vmv.v.v v12, v20
+; ZIP-NEXT:    ret
   %retval = call {<8 x i64>, <8 x i64>} @llvm.vector.deinterleave2.v16i64(<16 x i64> %vec)
   ret {<8 x i64>, <8 x i64>} %retval
 }
@@ -498,11 +487,11 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double
 ;
 ; ZIP-LABEL: vector_deinterleave_v2f64_v4f64:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v10, v8, 2
 ; ZIP-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; ZIP-NEXT:    ri.vzipodd.vv v9, v8, v10
-; ZIP-NEXT:    vslideup.vi v8, v10, 1
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT:    ri.vunzip2b.vv v12, v8, v9
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    vmv.v.v v9, v12
 ; ZIP-NEXT:    ret
   %retval = call {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double> %vec)
   ret {<2 x double>, <2 x double>} %retval
@@ -541,31 +530,11 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double
 ;
 ; ZIP-LABEL: vector_deinterleave_v4f64_v8f64:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v0, 8
-; ZIP-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; ZIP-NEXT:    vslidedown.vi v16, v8, 4
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v12, v8, 2
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v10, 12
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    vslideup.vi v14, v16, 2
-; ZIP-NEXT:    vslideup.vi v14, v16, 1, v0.t
-; ZIP-NEXT:    ri.vzip2a.vv v18, v8, v12
-; ZIP-NEXT:    vmv1r.v v0, v10
-; ZIP-NEXT:    vmerge.vvm v12, v18, v14, v0
-; ZIP-NEXT:    vsetivli zero, 2, e64, m2, ta, ma
-; ZIP-NEXT:    vslidedown.vi v14, v16, 2
-; ZIP-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; ZIP-NEXT:    vmv.v.i v0, 2
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; ZIP-NEXT:    ri.vzip2a.vv v18, v16, v14
-; ZIP-NEXT:    vslidedown.vi v14, v8, 1
-; ZIP-NEXT:    vslidedown.vi v14, v8, 2, v0.t
-; ZIP-NEXT:    vmv1r.v v0, v10
-; ZIP-NEXT:    vmerge.vvm v10, v14, v18, v0
-; ZIP-NEXT:    vmv2r.v v8, v12
+; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; ZIP-NEXT:    ri.vunzip2a.vv v12, v8, v10
+; ZIP-NEXT:    ri.vunzip2b.vv v16, v8, v10
+; ZIP-NEXT:    vmv.v.v v8, v12
+; ZIP-NEXT:    vmv.v.v v10, v16
 ; ZIP-NEXT:    ret
   %retval = call {<4 x double>, <4 x double>} @llvm.vector.deinterleave2.v8f64(<8 x double> %vec)
   ret {<4 x double>, <4 x double>} %retval