From c848ac69709247462c31dcc73a84ba15850eada9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 13 Mar 2024 18:10:20 -0700 Subject: [PATCH] [RISCV] Optimize lowering of VECREDUCE_FMINIMUM/VECREDUCE_FMAXIMUM. Use a normal min/max reduction that doesn't propagate NaNs, and force the result to NaN at the end if any elements were NaN. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 38 +- .../RISCV/rvv/fixed-vectors-reduction-fp.ll | 1868 +++-------------- 2 files changed, 352 insertions(+), 1554 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 803774fd16dbf..105587f0810b8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -717,7 +717,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, static const unsigned FloatingPointVecReduceOps[] = { ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN, - ISD::VECREDUCE_FMAX}; + ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMINIMUM, ISD::VECREDUCE_FMAXIMUM}; if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector @@ -6541,6 +6541,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VECREDUCE_SEQ_FADD: case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMAXIMUM: + case ISD::VECREDUCE_FMINIMUM: return lowerFPVECREDUCE(Op, DAG); case ISD::VP_REDUCE_ADD: case ISD::VP_REDUCE_UMAX: @@ -9541,14 +9543,17 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT, case ISD::VECREDUCE_SEQ_FADD: return std::make_tuple(RISCVISD::VECREDUCE_SEQ_FADD_VL, Op.getOperand(1), Op.getOperand(0)); + case ISD::VECREDUCE_FMINIMUM: + case ISD::VECREDUCE_FMAXIMUM: case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAX: { SDValue Front = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op.getOperand(0), DAG.getVectorIdxConstant(0, DL)); - unsigned RVVOpc = (Opcode == ISD::VECREDUCE_FMIN) - ? 
RISCVISD::VECREDUCE_FMIN_VL - : RISCVISD::VECREDUCE_FMAX_VL; + unsigned RVVOpc = + (Opcode == ISD::VECREDUCE_FMIN || Opcode == ISD::VECREDUCE_FMINIMUM) + ? RISCVISD::VECREDUCE_FMIN_VL + : RISCVISD::VECREDUCE_FMAX_VL; return std::make_tuple(RVVOpc, Op.getOperand(0), Front); } } @@ -9571,9 +9576,30 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op, VectorVal = convertToScalableVector(ContainerVT, VectorVal, DAG, Subtarget); } + MVT ResVT = Op.getSimpleValueType(); auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - return lowerReductionSeq(RVVOpcode, Op.getSimpleValueType(), ScalarVal, - VectorVal, Mask, VL, DL, DAG, Subtarget); + SDValue Res = lowerReductionSeq(RVVOpcode, ResVT, ScalarVal, VectorVal, Mask, + VL, DL, DAG, Subtarget); + if (Op.getOpcode() != ISD::VECREDUCE_FMINIMUM && + Op.getOpcode() != ISD::VECREDUCE_FMAXIMUM) + return Res; + + if (Op->getFlags().hasNoNaNs()) + return Res; + + // Force output to NaN if any element is NaN. + SDValue IsNan = + DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(), + {VectorVal, VectorVal, DAG.getCondCode(ISD::SETNE), + DAG.getUNDEF(Mask.getValueType()), Mask, VL}); + MVT XLenVT = Subtarget.getXLenVT(); + SDValue CPop = DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, IsNan, Mask, VL); + SDValue NoNaNs = DAG.getSetCC(DL, XLenVT, CPop, + DAG.getConstant(0, DL, XLenVT), ISD::SETEQ); + return DAG.getSelect( + DL, ResVT, NoNaNs, Res, + DAG.getConstantFP(APFloat::getNaN(DAG.EVTToAPFloatSemantics(ResVT)), DL, + ResVT)); } SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index 68740eec56e4c..7dcfb247d37cb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1599,15 +1599,16 @@ define float @vreduce_fminimum_v2f32(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v2f32: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v9, v8, v8 +; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: beqz a0, .LBB99_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB99_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x float>, ptr %x @@ -1619,15 +1620,8 @@ define float @vreduce_fminimum_v2f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v2f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x float>, ptr %x @@ -1641,24 +1635,16 @@ define float @vreduce_fminimum_v4f32(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; 
CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v9, v8, v8 +; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: beqz a0, .LBB101_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB101_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x float>, ptr %x @@ -1670,24 +1656,8 @@ define float @vreduce_fminimum_v4f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v4f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x float>, ptr %x @@ -1701,33 +1671,16 @@ define float @vreduce_fminimum_v8f32(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: 
vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v10, v8, v8 +; CHECK-NEXT: vcpop.m a0, v10 +; CHECK-NEXT: beqz a0, .LBB103_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB103_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x float>, ptr %x @@ -1739,33 +1692,8 @@ define float @vreduce_fminimum_v8f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v8f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; 
CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x float>, ptr %x @@ -1779,42 +1707,16 @@ define float @vreduce_fminimum_v16f32(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: 
vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v12, v8, v8 +; CHECK-NEXT: vcpop.m a0, v12 +; CHECK-NEXT: beqz a0, .LBB105_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB105_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, ptr %x @@ -1826,42 +1728,8 @@ define float @vreduce_fminimum_v16f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v16f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; 
CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, ptr %x @@ -1876,51 +1744,16 @@ define float @vreduce_fminimum_v32f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; 
CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB107_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB107_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x float>, ptr %x @@ -1933,51 +1766,8 @@ define float @vreduce_fminimum_v32f32_nonans(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; 
CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x float>, ptr %x @@ -2009,52 +1799,18 @@ define float @vreduce_fminimum_v64f32(ptr %x) { ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; 
CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB109_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: j .LBB109_3 +; CHECK-NEXT: .LBB109_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: .LBB109_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2073,51 +1829,8 @@ define float @vreduce_fminimum_v64f32_nonans(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 
-; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x float>, ptr %x @@ -2208,52 +1921,18 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; 
CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB111_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: j .LBB111_3 +; CHECK-NEXT: .LBB111_2: 
+; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: .LBB111_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2281,51 +1960,8 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) { ; CHECK-NEXT: vle32.v v0, (a1) ; CHECK-NEXT: vfmin.vv v16, v24, v16 ; CHECK-NEXT: vfmin.vv v8, v8, v0 -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: 
vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <128 x float>, ptr %x @@ -2339,15 +1975,16 @@ define double @vreduce_fminimum_v2f64(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmfne.vv v9, v8, v8 +; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: beqz a0, .LBB113_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI113_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI113_0)(a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB113_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x double>, ptr %x @@ -2359,15 +1996,8 @@ define double @vreduce_fminimum_v2f64_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v2f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x double>, ptr %x @@ -2381,24 +2011,16 @@ define double @vreduce_fminimum_v4f64(ptr 
%x) { ; CHECK-LABEL: vreduce_fminimum_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmfne.vv v10, v8, v8 +; CHECK-NEXT: vcpop.m a0, v10 +; CHECK-NEXT: beqz a0, .LBB115_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI115_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI115_0)(a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB115_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x double>, ptr %x @@ -2410,24 +2032,8 @@ define double @vreduce_fminimum_v4f64_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v4f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm 
v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x double>, ptr %x @@ -2441,33 +2047,16 @@ define double @vreduce_fminimum_v8f64(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmfne.vv v12, v8, v8 +; CHECK-NEXT: vcpop.m a0, v12 +; CHECK-NEXT: beqz a0, .LBB117_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI117_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI117_0)(a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB117_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x double>, ptr %x @@ -2479,33 +2068,8 @@ define double 
@vreduce_fminimum_v8f64_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v8f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x double>, ptr %x @@ -2519,42 +2083,16 @@ define double @vreduce_fminimum_v16f64(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v16f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, 
m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI119_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI119_0)(a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x double>, ptr %x @@ -2566,42 +2104,8 @@ define double @vreduce_fminimum_v16f64_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fminimum_v16f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, 
v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x double>, ptr %x @@ -2632,43 +2136,18 @@ define double @vreduce_fminimum_v32f64(ptr %x) { ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: 
vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB121_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI121_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI121_0)(a0) +; CHECK-NEXT: j .LBB121_3 +; CHECK-NEXT: .LBB121_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: .LBB121_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -2686,42 +2165,8 @@ define double @vreduce_fminimum_v32f64_nonans(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 
-; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x double>, ptr %x @@ -2811,43 +2256,18 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; 
CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB123_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI123_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI123_0)(a0) +; CHECK-NEXT: j .LBB123_3 +; CHECK-NEXT: .LBB123_2: +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: .LBB123_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -2874,42 +2294,8 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) { ; CHECK-NEXT: vle64.v v0, (a1) ; CHECK-NEXT: vfmin.vv v16, v24, v16 ; CHECK-NEXT: vfmin.vv v8, v8, v0 -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmin.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; 
CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmin.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmin.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmin.vv v8, v11, v8 +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfredmin.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x double>, ptr %x @@ -2923,15 +2309,16 @@ define float @vreduce_fmaximum_v2f32(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v9, v8, v8 +; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: beqz a0, .LBB125_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB125_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x float>, ptr %x @@ -2943,15 +2330,8 @@ define float @vreduce_fmaximum_v2f32_nonans(ptr %x) { ; 
CHECK-LABEL: vreduce_fmaximum_v2f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x float>, ptr %x @@ -2965,24 +2345,16 @@ define float @vreduce_fmaximum_v4f32(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v9, v8, v8 +; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: beqz a0, .LBB127_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB127_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x float>, ptr %x @@ -2994,24 +2366,8 @@ define float @vreduce_fmaximum_v4f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v4f32_nonans: ; CHECK: # 
%bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x float>, ptr %x @@ -3025,33 +2381,16 @@ define float @vreduce_fmaximum_v8f32(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv 
v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v10, v8, v8 +; CHECK-NEXT: vcpop.m a0, v10 +; CHECK-NEXT: beqz a0, .LBB129_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB129_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x float>, ptr %x @@ -3063,33 +2402,8 @@ define float @vreduce_fmaximum_v8f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v8f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x float>, ptr %x @@ -3103,42 +2417,16 @@ define float 
@vreduce_fmaximum_v16f32(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v12, v8, v8 +; CHECK-NEXT: vcpop.m a0, v12 +; CHECK-NEXT: beqz a0, .LBB131_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB131_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, ptr %x @@ -3150,42 +2438,8 @@ define float 
@vreduce_fmaximum_v16f32_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v16f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, ptr %x @@ -3200,51 +2454,16 @@ define float @vreduce_fmaximum_v32f32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; 
CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB133_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB133_2: +; 
CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x float>, ptr %x @@ -3257,51 +2476,8 @@ define float @vreduce_fmaximum_v32f32_nonans(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: 
vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x float>, ptr %x @@ -3333,52 +2509,18 @@ define float @vreduce_fmaximum_v64f32(ptr %x) { ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; 
CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB135_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: j .LBB135_3 +; CHECK-NEXT: .LBB135_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: .LBB135_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -3397,51 +2539,8 @@ define float @vreduce_fmaximum_v64f32_nonans(ptr %x) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, 
v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x float>, ptr %x @@ -3532,52 +2631,18 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, 
v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB137_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, 523264 +; CHECK-NEXT: fmv.w.x fa0, a0 +; CHECK-NEXT: j .LBB137_3 +; CHECK-NEXT: .LBB137_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: .LBB137_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -3605,51 +2670,8 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) { ; CHECK-NEXT: vle32.v v0, (a1) ; CHECK-NEXT: vfmax.vv v16, v24, v16 ; CHECK-NEXT: vfmax.vv v8, v8, v0 -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 
8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 2 -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v9, v11, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <128 x float>, ptr %x @@ -3663,15 +2685,16 @@ define double @vreduce_fmaximum_v2f64(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; 
CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmfne.vv v9, v8, v8 +; CHECK-NEXT: vcpop.m a0, v9 +; CHECK-NEXT: beqz a0, .LBB139_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI139_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI139_0)(a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB139_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x double>, ptr %x @@ -3683,15 +2706,8 @@ define double @vreduce_fmaximum_v2f64_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v2f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x double>, ptr %x @@ -3705,24 +2721,16 @@ define double @vreduce_fmaximum_v4f64(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) 
+; CHECK-NEXT: vmfne.vv v10, v8, v8 +; CHECK-NEXT: vcpop.m a0, v10 +; CHECK-NEXT: beqz a0, .LBB141_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI141_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI141_0)(a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB141_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x double>, ptr %x @@ -3734,24 +2742,8 @@ define double @vreduce_fmaximum_v4f64_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v4f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x double>, ptr %x @@ -3765,33 +2757,16 @@ define double @vreduce_fmaximum_v8f64(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; 
CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmfne.vv v12, v8, v8 +; CHECK-NEXT: vcpop.m a0, v12 +; CHECK-NEXT: beqz a0, .LBB143_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI143_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI143_0)(a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB143_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x double>, ptr %x @@ -3803,33 +2778,8 @@ define double @vreduce_fmaximum_v8f64_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v8f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; 
CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x double>, ptr %x @@ -3843,42 +2793,16 @@ define double @vreduce_fmaximum_v16f64(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v16f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, 
v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB145_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI145_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI145_0)(a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB145_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x double>, ptr %x @@ -3890,42 +2814,8 @@ define double @vreduce_fmaximum_v16f64_nonans(ptr %x) { ; CHECK-LABEL: vreduce_fmaximum_v16f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv 
v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x double>, ptr %x @@ -3956,43 +2846,18 @@ define double @vreduce_fmaximum_v32f64(ptr %x) { ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v16 
+; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB147_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI147_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI147_0)(a0) +; CHECK-NEXT: j .LBB147_3 +; CHECK-NEXT: .LBB147_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: .LBB147_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -4010,42 +2875,8 @@ define double @vreduce_fmaximum_v32f64_nonans(ptr %x) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: 
vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x double>, ptr %x @@ -4135,43 +2966,18 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vcpop.m a0, v16 +; CHECK-NEXT: beqz a0, .LBB149_2 +; 
CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lui a0, %hi(.LCPI149_0) +; CHECK-NEXT: fld fa0, %lo(.LCPI149_0)(a0) +; CHECK-NEXT: j .LBB149_3 +; CHECK-NEXT: .LBB149_2: +; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: .LBB149_3: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -4198,42 +3004,8 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) { ; CHECK-NEXT: vle64.v v0, (a1) ; CHECK-NEXT: vfmax.vv v16, v24, v16 ; CHECK-NEXT: vfmax.vv v8, v8, v0 -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vsetivli zero, 8, e64, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 8 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v8, v16, v16 -; CHECK-NEXT: vmerge.vvm v12, v24, v16, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vfmax.vv v12, v12, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m4, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v12, 4 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v8, v12, v12 -; CHECK-NEXT: vmerge.vvm v10, v16, v12, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 -; CHECK-NEXT: vfmax.vv v10, v10, v8 -; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v12, v12 -; CHECK-NEXT: vmfeq.vv v8, v10, v10 -; CHECK-NEXT: vmerge.vvm v9, v12, v10, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 -; CHECK-NEXT: vfmax.vv v9, v9, v8 -; CHECK-NEXT: vslidedown.vi v10, v9, 1 -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v10, v10 -; CHECK-NEXT: vmfeq.vv v8, v9, v9 -; CHECK-NEXT: vmerge.vvm v11, v10, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v8 -; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 -; CHECK-NEXT: vfmax.vv v8, v11, v8 +; CHECK-NEXT: vfmax.vv v8, v8, v16 
+; CHECK-NEXT: vfredmax.vs v8, v8, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x double>, ptr %x