From b04b30324516cdebd922e38a7a5d58225a3d917a Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Thu, 13 Feb 2025 09:35:28 -0800
Subject: [PATCH 1/2] [RISCV] Lower shuffle which splats a single span (without
 exact VLEN)

If we have a shuffle which repeats the same pattern of elements, all of
which come from the first register in the source register group, we can
lower this to a single vrgather at m1 to perform the element
rearrangement, and reuse that for each register in the result vector
register group.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 48 +++++++++++++++++++
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   | 37 ++++++++------
 2 files changed, 71 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 829eef2e4d9d9..1156fd2e67fed 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5360,6 +5360,23 @@ static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
                 [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
 }
 
+/// Return true for a mask which performs an arbitrary shuffle within the first
+/// span, and then repeats that same result across all remaining spans. Note
+/// that this doesn't check if all the inputs come from a single span!
+static bool isSpanSplatShuffle(ArrayRef<int> Mask, int Span) {
+  SmallVector<int> LowSpan(Span, -1);
+  for (auto [I, M] : enumerate(Mask)) {
+    if (M == -1)
+      continue;
+    int SpanIdx = I % Span;
+    if (LowSpan[SpanIdx] == -1)
+      LowSpan[SpanIdx] = M;
+    if (LowSpan[SpanIdx] != M)
+      return false;
+  }
+  return true;
+}
+
 /// Try to widen element type to get a new mask value for a better permutation
 /// sequence. This doesn't try to inspect the widened mask for profitability;
 /// we speculate the widened form is equal or better. This has the effect of
@@ -5775,6 +5792,37 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
         Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
                              SubVec, SubIdx);
       }
+    } else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX) &&
+               isSpanSplatShuffle(Mask, MinVLMAX)) {
+      // If we have a shuffle which only uses the first register in our source
+      // register group, and repeats the same index across all spans, we can
+      // use a single vrgather (and possibly some register moves).
+      // TODO: This can be generalized for m2 or m4, or for any shuffle for
+      // which we can do a linear number of shuffles to form an m1 which
+      // contains all the output elements.
+      const MVT M1VT = getLMUL1VT(ContainerVT);
+      EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType());
+      auto [InnerTrueMask, InnerVL] =
+          getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget);
+      int N = ContainerVT.getVectorMinNumElements() /
+              M1VT.getVectorMinNumElements();
+      assert(isPowerOf2_32(N) && N <= 8);
+      SDValue SubV1 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1,
+                      DAG.getVectorIdxConstant(0, DL));
+      SDValue SubIndex =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
+                      DAG.getVectorIdxConstant(0, DL));
+      SDValue SubVec =
+          DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
+                      DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
+      Gather = DAG.getUNDEF(ContainerVT);
+      for (int i = 0; i < N; i++) {
+        SDValue SubIdx =
+            DAG.getVectorIdxConstant(M1VT.getVectorMinNumElements() * i, DL);
+        Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
+                             SubVec, SubIdx);
+      }
     } else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX)) {
       // If we have a shuffle which only uses the first register in our
       // source register group, we can do a linear number of m1 vrgathers
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index d7120b4a16938..3e31c9de61657 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1311,22 +1311,14 @@ define void @shuffle_i128_splat(ptr %p) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    lui a2, 16
-; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a2
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v10, v9, a1
-; CHECK-NEXT:    vslidedown.vx v11, v10, a1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
-; CHECK-NEXT:    vrgatherei16.vv v14, v8, v11
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v9, v11, a1
+; CHECK-NEXT:    vmv.v.x v9, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v15, v8, v9
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT:    vmv.v.v v13, v12
+; CHECK-NEXT:    vmv.v.v v14, v12
+; CHECK-NEXT:    vmv.v.v v15, v12
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vse64.v v12, (a0)
 ; CHECK-NEXT:    ret
@@ -1435,3 +1427,20 @@ define <4 x i16> @vmerge_3(<4 x i16> %x) {
   %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> 
   ret <4 x i16> %s
 }
+
+
+define <8 x i64> @shuffle_v8i164_span_splat(<8 x i64> %a) nounwind {
+; CHECK-LABEL: shuffle_v8i164_span_splat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, 1
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT:    vmv.v.v v13, v12
+; CHECK-NEXT:    vmv.v.v v14, v12
+; CHECK-NEXT:    vmv.v.v v15, v12
+; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> <i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>
+  ret <8 x i64> %res
+}

From 7ec5b854035c6573770181b5883671ca4e92316c Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Thu, 13 Feb 2025 11:00:16 -0800
Subject: [PATCH 2/2] clang-format

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 1156fd2e67fed..13ea184c817fb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5356,8 +5356,7 @@ static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
 
 /// Is this mask only using elements from the first span of the input?
 static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
-  return all_of(Mask,
-                [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
+  return all_of(Mask, [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
 }
 
 /// Return true for a mask which performs an arbitrary shuffle within the first
@@ -5807,15 +5806,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
       int N = ContainerVT.getVectorMinNumElements() /
               M1VT.getVectorMinNumElements();
       assert(isPowerOf2_32(N) && N <= 8);
-      SDValue SubV1 =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1,
-                      DAG.getVectorIdxConstant(0, DL));
+      SDValue SubV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1,
+                                  DAG.getVectorIdxConstant(0, DL));
       SDValue SubIndex =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
-                      DAG.getVectorIdxConstant(0, DL));
-      SDValue SubVec =
-          DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
-                      DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
+                      DAG.getVectorIdxConstant(0, DL));
+      SDValue SubVec = DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
+                                   DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
       Gather = DAG.getUNDEF(ContainerVT);
       for (int i = 0; i < N; i++) {
         SDValue SubIdx =
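
For reviewers who want to poke at the new mask predicate outside of the LLVM tree, here is a minimal standalone C++ sketch of the same check the patch adds as isSpanSplatShuffle. std::vector stands in for ArrayRef/SmallVector, and the function name and example masks are illustrative only, not part of the patch.

#include <cassert>
#include <vector>

// True if every Span-sized block of Mask requests the same per-lane source
// index as the first block (undef lanes, encoded as -1, are ignored).
static bool isSpanSplatShuffleModel(const std::vector<int> &Mask, int Span) {
  std::vector<int> LowSpan(Span, -1);
  for (int I = 0, E = static_cast<int>(Mask.size()); I != E; ++I) {
    int M = Mask[I];
    if (M == -1)
      continue;
    int SpanIdx = I % Span;
    if (LowSpan[SpanIdx] == -1)
      LowSpan[SpanIdx] = M;
    if (LowSpan[SpanIdx] != M)
      return false;
  }
  return true;
}

int main() {
  // The added test's shape: an <8 x i64> shuffle with MinVLMAX == 2 whose
  // mask <1,0,1,0,1,0,1,0> repeats the same two-lane pattern in every span.
  assert(isSpanSplatShuffleModel({1, 0, 1, 0, 1, 0, 1, 0}, 2));
  // Undef lanes do not break the match.
  assert(isSpanSplatShuffleModel({1, -1, 1, 0, -1, 0, 1, 0}, 2));
  // A span that disagrees with the first one is rejected.
  assert(!isSpanSplatShuffleModel({1, 0, 0, 1, 1, 0, 1, 0}, 2));
  return 0;
}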
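And a rough scalar model of why the lowering in the commit message is sound: when the mask is both low-source (every index < Span) and span-splat, the whole result is determined by its first Span lanes, so one m1-sized gather can be reused for every register in the group. This assumes the first span's lanes are defined; the element values, Span, and names below are made up for illustration, while the real code operates on scalable vector types and emits vrgatherei16.vv plus vmv.v.v.

#include <cstdio>
#include <vector>

static std::vector<long> lowerSpanSplatModel(const std::vector<long> &Src,
                                             const std::vector<int> &Mask,
                                             int Span) {
  // The single "vrgather at m1": build just the first span of the result.
  std::vector<long> FirstSpan(Span, 0);
  for (int I = 0; I != Span; ++I)
    FirstSpan[I] = Src[Mask[I]]; // low source: Mask[I] < Span
  // The whole-register moves (vmv.v.v): reuse that span for every register.
  std::vector<long> Result(Mask.size());
  for (int I = 0, E = static_cast<int>(Mask.size()); I != E; ++I)
    Result[I] = FirstSpan[I % Span];
  return Result;
}

int main() {
  std::vector<long> Src = {10, 11, 12, 13, 14, 15, 16, 17};
  std::vector<int> Mask = {1, 0, 1, 0, 1, 0, 1, 0}; // Span == 2
  for (long V : lowerSpanSplatModel(Src, Mask, 2))
    std::printf("%ld ", V); // prints "11 10 11 10 11 10 11 10 "
  std::printf("\n");
  return 0;
}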