diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 04ee24c0916e5..6447752c451d8 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5108,6 +5108,10 @@ class TargetLowering : public TargetLoweringBase { SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl<SDNode *> &Created) const; + SDValue BuildVPSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, + SmallVectorImpl<SDNode *> &Created) const; + SDValue BuildVPUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization, + SmallVectorImpl<SDNode *> &Created) const; /// Targets may override this function to provide custom SDIV lowering for /// power-of-2 denominators. If the target returns an empty SDValue, LLVM diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 55f4719da7c8b..e71ca44779adb 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -180,6 +180,11 @@ HELPER_REGISTER_BINARY_INT_VP(vp_xor, VP_XOR, Xor, XOR) #undef HELPER_REGISTER_BINARY_INT_VP +BEGIN_REGISTER_VP_SDNODE(VP_MULHU, -1, vp_mulhu, 2, 3) +END_REGISTER_VP_SDNODE(VP_MULHU) +BEGIN_REGISTER_VP_SDNODE(VP_MULHS, -1, vp_mulhs, 2, 3) +END_REGISTER_VP_SDNODE(VP_MULHS) + // llvm.vp.smin(x,y,mask,vlen) BEGIN_REGISTER_VP(vp_smin, 2, 3, VP_SMIN, -1) VP_PROPERTY_BINARYOP diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8858c2012c706..ecb428bff27e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -564,6 +564,12 @@ namespace { SDValue visitFSUBForFMACombine(SDNode *N); SDValue visitFMULForFMADistributiveCombine(SDNode *N); + SDValue visitVPUDIV(SDNode *N); + SDValue visitVPUDIVLike(SDValue N0, SDValue N1, SDNode *N); + SDValue visitVPSDIV(SDNode *N); + SDValue visitVPSDIVLike(SDValue N0, SDValue N1, SDNode *N); + SDValue visitVPREM(SDNode *N); + SDValue XformToShuffleWithZero(SDNode *N); bool reassociationCanBreakAddressingModePattern(unsigned Opc, const SDLoc &DL, @@ -5161,6 +5167,59 @@ SDValue DAGCombiner::visitREM(SDNode *N) { return SDValue(); } +// handles ISD::VP_SREM and ISD::VP_UREM +SDValue DAGCombiner::visitVPREM(SDNode *N) { + unsigned Opcode = N->getOpcode(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + EVT VT = N->getValueType(0); + EVT CCVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorElementCount()); + + bool IsSigned = (Opcode == ISD::VP_SREM); + SDLoc DL(N); + + // fold (vp.urem X, -1) -> select(FX == -1, 0, FX) + // Freeze the numerator to avoid a miscompile with an undefined value. + if (!IsSigned && llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false)) { + SDValue F0 = DAG.getFreeze(N0); + SDValue EqualsNeg1 = DAG.getSetCCVP(DL, CCVT, F0, N1, ISD::SETEQ, Mask, VL); + return DAG.getNode(ISD::VP_SELECT, DL, VT, EqualsNeg1, + DAG.getConstant(0, DL, VT), F0, VL); + } + + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + + // If X/C can be simplified by the division-by-constant logic, lower + // X%C to the equivalent of X-X/C*C. + // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the + // speculative DIV must not cause a DIVREM conversion. We guard against this + // by skipping the simplification if isIntDivCheap(). 
When div is not cheap, + // combine will not return a DIVREM. Regardless, checking cheapness here + // makes sense since the simplification results in fatter code. + if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) { + SDValue OptimizedDiv = + IsSigned ? visitVPSDIVLike(N0, N1, N) : visitVPUDIVLike(N0, N1, N); + if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) { + // If the equivalent Div node also exists, update its users. + unsigned DivOpcode = IsSigned ? ISD::VP_SDIV : ISD::VP_UDIV; + if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(), + {N0, N1, Mask, VL})) + CombineTo(DivNode, OptimizedDiv); + SDValue Mul = + DAG.getNode(ISD::VP_MUL, DL, VT, OptimizedDiv, N1, Mask, VL); + SDValue Sub = DAG.getNode(ISD::VP_SUB, DL, VT, N0, Mul, Mask, VL); + AddToWorklist(OptimizedDiv.getNode()); + AddToWorklist(Mul.getNode()); + return Sub; + } + } + + return SDValue(); +} + SDValue DAGCombiner::visitMULHS(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -27219,6 +27278,232 @@ SDValue DAGCombiner::visitVP_FSUB(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitVPUDIV(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + ConstantSDNode *N1C = isConstOrConstSplat(N1); + // fold (vp.udiv X, -1) -> vp.select(X == -1, 1, 0) + if (N1C && N1C->isAllOnes()) { + EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorElementCount()); + return DAG.getNode(ISD::VP_SELECT, DL, VT, + DAG.getSetCCVP(DL, CCVT, N0, N1, ISD::SETEQ, Mask, VL), + DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT), + VL); + } + + if (SDValue V = visitVPUDIVLike(N0, N1, N)) { + // If the corresponding remainder node exists, update its users with + // (Dividend - (Quotient * Divisor). 
+ if (SDNode *RemNode = DAG.getNodeIfExists(ISD::VP_UREM, N->getVTList(), + {N0, N1, Mask, VL})) { + SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, V, N1, Mask, VL); + SDValue Sub = DAG.getNode(ISD::VP_SUB, DL, VT, N0, Mul, Mask, VL); + AddToWorklist(Mul.getNode()); + AddToWorklist(Sub.getNode()); + CombineTo(RemNode, Sub); + } + return V; + } + + return SDValue(); +} + +SDValue DAGCombiner::visitVPUDIVLike(SDValue N0, SDValue N1, SDNode *N) { + SDLoc DL(N); + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + EVT VT = N->getValueType(0); + + // fold (vp.udiv x, (1 << c)) -> vp.lshr(x, c) + if (isConstantOrConstantVector(N1, /*NoOpaques=*/true) && + DAG.isKnownToBeAPowerOfTwo(N1)) { + SDValue LogBase2 = BuildLogBase2(N1, DL); + AddToWorklist(LogBase2.getNode()); + + EVT ShiftVT = getShiftAmountTy(N0.getValueType()); + SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); + AddToWorklist(Trunc.getNode()); + return DAG.getNode(ISD::VP_SRL, DL, VT, N0, Trunc, Mask, VL); + } + + // fold (vp.udiv x, (vp.shl c, y)) -> vp.lshr(x, vp.add(log2(c)+y)) iff c is + // power of 2 + if (N1.getOpcode() == ISD::VP_SHL && N1->getOperand(2) == Mask && + N1->getOperand(3) == VL) { + SDValue N10 = N1.getOperand(0); + if (isConstantOrConstantVector(N10, /*NoOpaques=*/true) && + DAG.isKnownToBeAPowerOfTwo(N10)) { + SDValue LogBase2 = BuildLogBase2(N10, DL); + AddToWorklist(LogBase2.getNode()); + + EVT ADDVT = N1.getOperand(1).getValueType(); + SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT); + AddToWorklist(Trunc.getNode()); + SDValue Add = DAG.getNode(ISD::VP_ADD, DL, ADDVT, N1.getOperand(1), Trunc, + Mask, VL); + AddToWorklist(Add.getNode()); + return DAG.getNode(ISD::VP_SRL, DL, VT, N0, Add, Mask, VL); + } + } + + // fold (vp.udiv x, Splat(shl c, y)) -> vp.lshr(x, add(log2(c)+y)) iff c is + // power of 2 + if (N1.getOpcode() == ISD::SPLAT_VECTOR) { + SDValue N10 = N1.getOperand(0); + if (N10.getOpcode() == ISD::SHL) { + SDValue N0SHL = N10.getOperand(0); + if (isa<ConstantSDNode>(N0SHL) && DAG.isKnownToBeAPowerOfTwo(N0SHL)) { + SDValue LogBase2 = BuildLogBase2(N0SHL, DL); + AddToWorklist(LogBase2.getNode()); + + EVT ADDVT = N10.getOperand(1).getValueType(); + SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT); + AddToWorklist(Trunc.getNode()); + SDValue Add = + DAG.getNode(ISD::ADD, DL, ADDVT, N10.getOperand(1), Trunc); + AddToWorklist(Add.getNode()); + SDValue Splat = DAG.getSplatVector(VT, DL, Add); + AddToWorklist(Splat.getNode()); + return DAG.getNode(ISD::VP_SRL, DL, VT, N0, Splat, Mask, VL); + } + } + } + + // fold (vp.udiv x, c) -> alternate + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isConstantOrConstantVector(N1) && + !TLI.isIntDivCheap(N->getValueType(0), Attr)) { + if (SDValue Op = BuildUDIV(N)) + return Op; + } + return SDValue(); +} + +SDValue DAGCombiner::visitVPSDIV(SDNode *N) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // fold (vp.sdiv X, -1) -> 0-X + ConstantSDNode *N1C = isConstOrConstSplat(N1); + if (N1C && N1C->isAllOnes()) + return DAG.getNode(ISD::VP_SUB, DL, VT, DAG.getConstant(0, DL, VT), N0, + Mask, VL); + + // fold (vp.sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0) + if (N1C && N1C->getAPIntValue().isMinSignedValue()) { + EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorElementCount()); + return DAG.getNode(ISD::VP_SELECT, DL, VT, + DAG.getSetCCVP(DL, CCVT, N0, 
N1, ISD::SETEQ, Mask, VL), + DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT), + VL); + } + + // If we know the sign bits of both operands are zero, strength reduce to a + // vp.udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 + if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) + return DAG.getNode(ISD::VP_UDIV, DL, N1.getValueType(), N0, N1, Mask, VL); + + if (SDValue V = visitVPSDIVLike(N0, N1, N)) { + // If the corresponding remainder node exists, update its users with + // (Dividend - (Quotient * Divisor). + if (SDNode *RemNode = DAG.getNodeIfExists(ISD::VP_SREM, N->getVTList(), + {N0, N1, Mask, VL})) { + SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, V, N1, Mask, VL); + SDValue Sub = DAG.getNode(ISD::VP_SUB, DL, VT, N0, Mul, Mask, VL); + AddToWorklist(Mul.getNode()); + AddToWorklist(Sub.getNode()); + CombineTo(RemNode, Sub); + } + return V; + } + return SDValue(); +} + +SDValue DAGCombiner::visitVPSDIVLike(SDValue N0, SDValue N1, SDNode *N) { + SDLoc DL(N); + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + EVT VT = N->getValueType(0); + unsigned BitWidth = VT.getScalarSizeInBits(); + + // fold (vp.sdiv X, V of pow 2) + if (N1.getOpcode() == ISD::SPLAT_VECTOR && + isDivisorPowerOfTwo(N1.getOperand(0))) { + // Create constants that are functions of the shift amount value. + SDValue N = N1.getOperand(0); + EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorElementCount()); + EVT ScalarShiftAmtTy = + getShiftAmountTy(N0.getValueType().getVectorElementType()); + SDValue Bits = DAG.getConstant(BitWidth, DL, ScalarShiftAmtTy); + SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT.getVectorElementType(), N); + C1 = DAG.getZExtOrTrunc(C1, DL, ScalarShiftAmtTy); + SDValue Inexact = DAG.getNode(ISD::SUB, DL, ScalarShiftAmtTy, Bits, C1); + if (!isa<ConstantSDNode>(Inexact)) + return SDValue(); + + // Splat the sign bit into the register + EVT VecShiftAmtTy = EVT::getVectorVT(*DAG.getContext(), ScalarShiftAmtTy, + VT.getVectorElementCount()); + SDValue Sign = + DAG.getNode(ISD::VP_SRA, DL, VT, N0, + DAG.getConstant(BitWidth - 1, DL, VecShiftAmtTy), Mask, VL); + AddToWorklist(Sign.getNode()); + + // Add N0, ((N0 < 0) ? abs(N1) - 1 : 0); + Inexact = DAG.getSplat(VT, DL, Inexact); + C1 = DAG.getSplat(VT, DL, C1); + SDValue Srl = DAG.getNode(ISD::VP_SRL, DL, VT, Sign, Inexact, Mask, VL); + AddToWorklist(Srl.getNode()); + SDValue Add = DAG.getNode(ISD::VP_ADD, DL, VT, N0, Srl, Mask, VL); + AddToWorklist(Add.getNode()); + SDValue Sra = DAG.getNode(ISD::VP_SRA, DL, VT, Add, C1, Mask, VL); + AddToWorklist(Sra.getNode()); + + // Special case: (sdiv X, 1) -> X + // Special Case: (sdiv X, -1) -> 0-X + SDValue One = DAG.getConstant(1, DL, VT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); + SDValue IsOne = DAG.getSetCCVP(DL, CCVT, N1, One, ISD::SETEQ, Mask, VL); + SDValue IsAllOnes = + DAG.getSetCCVP(DL, CCVT, N1, AllOnes, ISD::SETEQ, Mask, VL); + SDValue IsOneOrAllOnes = + DAG.getNode(ISD::VP_OR, DL, CCVT, IsOne, IsAllOnes, Mask, VL); + Sra = DAG.getNode(ISD::VP_SELECT, DL, VT, IsOneOrAllOnes, N0, Sra, VL); + + // If dividing by a positive value, we're done. Otherwise, the result must + // be negated. 
+ SDValue Zero = DAG.getConstant(0, DL, VT); + SDValue Sub = DAG.getNode(ISD::VP_SUB, DL, VT, Zero, Sra, Mask, VL); + + SDValue IsNeg = DAG.getSetCCVP(DL, CCVT, N1, Zero, ISD::SETLT, Mask, VL); + SDValue Res = DAG.getNode(ISD::VP_SELECT, DL, VT, IsNeg, Sub, Sra, VL); + return Res; + } + + // If integer divide is expensive and we satisfy the requirements, emit an + // alternate sequence. Targets may check function attributes for size/speed + // trade-offs. + AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); + if (isConstantOrConstantVector(N1) && + !TLI.isIntDivCheap(N->getValueType(0), Attr)) + if (SDValue Op = BuildSDIV(N)) + return Op; + + return SDValue(); +} + SDValue DAGCombiner::visitVPOp(SDNode *N) { if (N->getOpcode() == ISD::VP_GATHER) @@ -27262,6 +27547,13 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) { return visitMUL(N); case ISD::VP_SUB: return foldSubCtlzNot(N, DAG); + case ISD::VP_UDIV: + return visitVPUDIV(N); + case ISD::VP_SDIV: + return visitVPSDIV(N); + case ISD::VP_UREM: + case ISD::VP_SREM: + return visitVPREM(N); default: break; } @@ -28309,7 +28601,13 @@ SDValue DAGCombiner::BuildSDIV(SDNode *N) { return SDValue(); SmallVector Built; - if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, LegalTypes, Built)) { + SDValue S; + if (N->isVPOpcode()) + S = TLI.BuildVPSDIV(N, DAG, LegalOperations, Built); + else + S = TLI.BuildSDIV(N, DAG, LegalOperations, LegalTypes, Built); + + if (S) { for (SDNode *N : Built) AddToWorklist(N); return S; @@ -28350,7 +28648,13 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) { return SDValue(); SmallVector Built; - if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, LegalTypes, Built)) { + SDValue S; + if (N->isVPOpcode()) + S = TLI.BuildVPUDIV(N, DAG, LegalOperations, Built); + else + S = TLI.BuildUDIV(N, DAG, LegalOperations, LegalTypes, Built); + + if (S) { for (SDNode *N : Built) AddToWorklist(N); return S; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1000235ab4061..6e2f37d7c3dd4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1277,8 +1277,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::ADD: case ISD::VP_ADD: case ISD::SUB: case ISD::VP_SUB: case ISD::MUL: case ISD::VP_MUL: - case ISD::MULHS: - case ISD::MULHU: + case ISD::MULHS: case ISD::VP_MULHS: + case ISD::MULHU: case ISD::VP_MULHU: case ISD::ABDS: case ISD::ABDU: case ISD::AVGCEILS: @@ -4552,8 +4552,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::ADD: case ISD::VP_ADD: case ISD::AND: case ISD::VP_AND: case ISD::MUL: case ISD::VP_MUL: - case ISD::MULHS: - case ISD::MULHU: + case ISD::MULHS: case ISD::VP_MULHS: + case ISD::MULHU: case ISD::VP_MULHU: case ISD::ABDS: case ISD::ABDU: case ISD::OR: case ISD::VP_OR: diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index adfb96041c5c0..df6c2abef04f9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6492,6 +6492,132 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, dl, VT, Q, T); } +/// Given an ISD::VP_SDIV node expressing a divide by constant, +/// return a DAG expression to select that will generate the same value by +/// multiplying by a magic number. 
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". +SDValue TargetLowering::BuildVPSDIV(SDNode *N, SelectionDAG &DAG, + bool IsAfterLegalization, + SmallVectorImpl &Created) const { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShSVT = ShVT.getScalarType(); + unsigned EltBits = VT.getScalarSizeInBits(); + + // Check to see if we can do this. + if (!isTypeLegal(VT) || + !isOperationLegalOrCustom(ISD::VP_MULHS, VT, IsAfterLegalization)) + return SDValue(); + + bool AnyFactorOne = false; + bool AnyFactorNegOne = false; + + SmallVector MagicFactors, Factors, Shifts, ShiftMasks; + + auto BuildSDIVPattern = [&](ConstantSDNode *C) { + if (C->isZero()) + return false; + + const APInt &Divisor = C->getAPIntValue(); + SignedDivisionByConstantInfo magics = + SignedDivisionByConstantInfo::get(Divisor); + int NumeratorFactor = 0; + int ShiftMask = -1; + + if (Divisor.isOne() || Divisor.isAllOnes()) { + // If d is +1/-1, we just multiply the numerator by +1/-1. + NumeratorFactor = Divisor.getSExtValue(); + magics.Magic = 0; + magics.ShiftAmount = 0; + ShiftMask = 0; + AnyFactorOne |= Divisor.isOne(); + AnyFactorNegOne |= Divisor.isAllOnes(); + } else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) { + // If d > 0 and m < 0, add the numerator. + NumeratorFactor = 1; + AnyFactorOne = true; + } else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) { + // If d < 0 and m > 0, subtract the numerator. + NumeratorFactor = -1; + AnyFactorNegOne = true; + } + + MagicFactors.push_back(DAG.getConstant(magics.Magic, DL, SVT)); + Factors.push_back(DAG.getSignedConstant(NumeratorFactor, DL, SVT)); + Shifts.push_back(DAG.getConstant(magics.ShiftAmount, DL, ShSVT)); + ShiftMasks.push_back(DAG.getSignedConstant(ShiftMask, DL, SVT)); + return true; + }; + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + + // Collect the shifts / magic values from each element. + if (!ISD::matchUnaryPredicate(N1, BuildSDIVPattern)) + return SDValue(); + + SDValue MagicFactor, Factor, Shift, ShiftMask; + if (N1.getOpcode() == ISD::BUILD_VECTOR) { + MagicFactor = DAG.getBuildVector(VT, DL, MagicFactors); + Factor = DAG.getBuildVector(VT, DL, Factors); + Shift = DAG.getBuildVector(ShVT, DL, Shifts); + ShiftMask = DAG.getBuildVector(VT, DL, ShiftMasks); + } else { + assert(N1.getOpcode() == ISD::SPLAT_VECTOR && "Expected a splat_vector"); + assert(MagicFactors.size() == 1 && Factors.size() == 1 && + Shifts.size() == 1 && ShiftMasks.size() == 1 && + "Expected matchUnaryPredicate to return one element for scalable " + "vectors"); + MagicFactor = DAG.getSplatVector(VT, DL, MagicFactors[0]); + Factor = DAG.getSplatVector(VT, DL, Factors[0]); + Shift = DAG.getSplatVector(ShVT, DL, Shifts[0]); + ShiftMask = DAG.getSplatVector(VT, DL, ShiftMasks[0]); + } + + // Multiply the numerator (operand 0) by the magic value. + auto GetMULHS = [&](SDValue X, SDValue Y) { + return DAG.getNode(ISD::VP_MULHS, DL, VT, X, Y, Mask, VL); + }; + + SDValue Q = GetMULHS(N0, MagicFactor); + if (!Q) + return SDValue(); + + Created.push_back(Q.getNode()); + + // (Optionally) Add/subtract the numerator using Factor. + // FIXME: The AnyFactorOne/NegOne flags are a hack around lack of constant + // folding for VP_MUL/ADD. 
+ if (AnyFactorOne && AnyFactorNegOne) { + Factor = DAG.getNode(ISD::VP_MUL, DL, VT, N0, Factor, Mask, VL); + Created.push_back(Factor.getNode()); + Q = DAG.getNode(ISD::VP_ADD, DL, VT, Q, Factor, Mask, VL); + Created.push_back(Q.getNode()); + } else if (AnyFactorOne) { + Q = DAG.getNode(ISD::VP_ADD, DL, VT, Q, N0, Mask, VL); + Created.push_back(Q.getNode()); + } else if (AnyFactorNegOne) { + Q = DAG.getNode(ISD::VP_SUB, DL, VT, Q, N0, Mask, VL); + Created.push_back(Q.getNode()); + } + + // Shift right algebraic by shift value. + Q = DAG.getNode(ISD::VP_SRA, DL, VT, Q, Shift, Mask, VL); + Created.push_back(Q.getNode()); + + // Extract the sign bit, mask it and add it to the quotient. + SDValue SignShift = DAG.getConstant(EltBits - 1, DL, ShVT); + SDValue T = DAG.getNode(ISD::VP_SRL, DL, VT, Q, SignShift, Mask, VL); + Created.push_back(T.getNode()); + T = DAG.getNode(ISD::VP_AND, DL, VT, T, ShiftMask, Mask, VL); + Created.push_back(T.getNode()); + return DAG.getNode(ISD::VP_ADD, DL, VT, Q, T, Mask, VL); +} + /// Given an ISD::UDIV node expressing a divide by constant, /// return a DAG expression to select that will generate the same value by /// multiplying by a magic number. @@ -6692,6 +6818,139 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, return DAG.getSelect(dl, VT, IsOne, N0, Q); } +/// Given an ISD::VP_UDIV node expressing a divide by constant, +/// return a DAG expression to select that will generate the same value by +/// multiplying by a magic number. +/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". +SDValue TargetLowering::BuildVPUDIV(SDNode *N, SelectionDAG &DAG, + bool IsAfterLegalization, + SmallVectorImpl<SDNode *> &Created) const { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()); + EVT ShSVT = ShVT.getScalarType(); + unsigned EltBits = VT.getScalarSizeInBits(); + + // Check to see if we can do this. + if (!isTypeLegal(VT) || + !isOperationLegalOrCustom(ISD::VP_MULHU, VT, IsAfterLegalization)) + return SDValue(); + + bool UseNPQ = false, UsePreShift = false, UsePostShift = false; + + SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors; + + auto BuildUDIVPattern = [&](ConstantSDNode *C) { + if (C->isZero()) + return false; + // FIXME: We should use a narrower constant when the upper + // bits are known to be zero. + const APInt &Divisor = C->getAPIntValue(); + SDValue PreShift, MagicFactor, NPQFactor, PostShift; + + // Magic algorithm doesn't work for division by 1. We need to emit a select + // at the end. + if (Divisor.isOne()) { + PreShift = PostShift = DAG.getUNDEF(ShSVT); + MagicFactor = NPQFactor = DAG.getUNDEF(SVT); + } else { + UnsignedDivisionByConstantInfo magics = + UnsignedDivisionByConstantInfo::get(Divisor); + + MagicFactor = DAG.getConstant(magics.Magic, DL, SVT); + + assert(magics.PreShift < Divisor.getBitWidth() && + "We shouldn't generate an undefined shift!"); + assert(magics.PostShift < Divisor.getBitWidth() && + "We shouldn't generate an undefined shift!"); + assert((!magics.IsAdd || magics.PreShift == 0) && "Unexpected pre-shift"); + PreShift = DAG.getConstant(magics.PreShift, DL, ShSVT); + PostShift = DAG.getConstant(magics.PostShift, DL, ShSVT); + NPQFactor = DAG.getConstant( + magics.IsAdd ? 
APInt::getOneBitSet(EltBits, EltBits - 1) + : APInt::getZero(EltBits), + DL, SVT); + UseNPQ |= magics.IsAdd; + UsePreShift |= magics.PreShift != 0; + UsePostShift |= magics.PostShift != 0; + } + + PreShifts.push_back(PreShift); + MagicFactors.push_back(MagicFactor); + NPQFactors.push_back(NPQFactor); + PostShifts.push_back(PostShift); + return true; + }; + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mask = N->getOperand(2); + SDValue VL = N->getOperand(3); + + // Collect the shifts/magic values from each element. + if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern)) + return SDValue(); + + SDValue PreShift, PostShift, MagicFactor, NPQFactor; + if (N1.getOpcode() == ISD::BUILD_VECTOR) { + PreShift = DAG.getBuildVector(ShVT, DL, PreShifts); + MagicFactor = DAG.getBuildVector(VT, DL, MagicFactors); + NPQFactor = DAG.getBuildVector(VT, DL, NPQFactors); + PostShift = DAG.getBuildVector(ShVT, DL, PostShifts); + } else { + assert(N1.getOpcode() == ISD::SPLAT_VECTOR && "Expected a splat_vector"); + assert(PreShifts.size() == 1 && MagicFactors.size() == 1 && + NPQFactors.size() == 1 && PostShifts.size() == 1 && + "Expected matchUnaryPredicate to return one for scalable vectors"); + PreShift = DAG.getSplatVector(ShVT, DL, PreShifts[0]); + MagicFactor = DAG.getSplatVector(VT, DL, MagicFactors[0]); + NPQFactor = DAG.getSplatVector(VT, DL, NPQFactors[0]); + PostShift = DAG.getSplatVector(ShVT, DL, PostShifts[0]); + } + + SDValue Q = N0; + if (UsePreShift) { + Q = DAG.getNode(ISD::VP_SRL, DL, VT, Q, PreShift, Mask, VL); + Created.push_back(Q.getNode()); + } + + auto GetMULHU = [&](SDValue X, SDValue Y) { + return DAG.getNode(ISD::VP_MULHU, DL, VT, X, Y, Mask, VL); + }; + + // Multiply the numerator (operand 0) by the magic value. + Q = GetMULHU(Q, MagicFactor); + if (!Q) + return SDValue(); + + Created.push_back(Q.getNode()); + + if (UseNPQ) { + SDValue NPQ = DAG.getNode(ISD::VP_SUB, DL, VT, N0, Q, Mask, VL); + Created.push_back(NPQ.getNode()); + + // For vectors we might have a mix of non-NPQ/NPQ paths, so use + // MULHU to act as a SRL-by-1 for NPQ, else multiply by zero. + NPQ = GetMULHU(NPQ, NPQFactor); + Created.push_back(NPQ.getNode()); + + Q = DAG.getNode(ISD::VP_ADD, DL, VT, NPQ, Q, Mask, VL); + Created.push_back(Q.getNode()); + } + + if (UsePostShift) { + Q = DAG.getNode(ISD::VP_SRL, DL, VT, Q, PostShift, Mask, VL); + Created.push_back(Q.getNode()); + } + + EVT SetCCVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i1, VT.getVectorElementCount()); + SDValue One = DAG.getConstant(1, DL, VT); + SDValue IsOne = DAG.getSetCCVP(DL, SetCCVT, N1, One, ISD::SETEQ, Mask, VL); + return DAG.getNode(ISD::VP_SELECT, DL, VT, IsOne, N0, Q, VL); +} + /// If all values in Values that *don't* match the predicate are same 'splat' /// value, then replace all values with that splat value. 
/// Else, if AlternativeReplacement was provided, then replace all values that diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8e3caf51d876b..ba4aaf36a0650 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -696,6 +696,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND, ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN, ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX, + ISD::VP_MULHU, ISD::VP_MULHS, ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT, ISD::VP_USUBSAT, ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF, @@ -870,6 +871,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(IntegerVPOps, VT, Custom); + // Zve64* does not support VP_MULHU/S with nxvXi64. + if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) { + setOperationAction({ISD::VP_MULHU, ISD::VP_MULHS}, VT, Expand); + } + setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom); setOperationAction({ISD::MLOAD, ISD::MSTORE, ISD::MGATHER, ISD::MSCATTER}, @@ -1277,8 +1283,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Custom); // vXi64 MULHS/MULHU requires the V extension instead of Zve64*. - if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV()) + if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV()) { setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Custom); + } else { + setOperationAction({ISD::VP_MULHS, ISD::VP_MULHU}, VT, Expand); + } setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU, ISD::AVGCEILS, ISD::AVGCEILU, ISD::SADDSAT, ISD::UADDSAT, @@ -6410,6 +6419,8 @@ static unsigned getRISCVVLOp(SDValue Op) { VP_CASE(ADD) // VP_ADD VP_CASE(SUB) // VP_SUB VP_CASE(MUL) // VP_MUL + VP_CASE(MULHS) // VP_MULHS + VP_CASE(MULHU) // VP_MULHU VP_CASE(SDIV) // VP_SDIV VP_CASE(SREM) // VP_SREM VP_CASE(UDIV) // VP_UDIV @@ -7605,6 +7616,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_ADD: case ISD::VP_SUB: case ISD::VP_MUL: + case ISD::VP_MULHS: + case ISD::VP_MULHU: case ISD::VP_SDIV: case ISD::VP_UDIV: case ISD::VP_SREM: diff --git a/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const-zve64.ll b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const-zve64.ll new file mode 100644 index 0000000000000..2972df3e1cf7a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const-zve64.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv64 -mattr=+zve64x -verify-machineinstrs | FileCheck %s + +declare @llvm.vp.udiv.nxv1i64(, , , i32) +declare @llvm.vp.sdiv.nxv1i64(, , , i32) +declare @llvm.vp.urem.nxv1i64(, , , i32) +declare @llvm.vp.srem.nxv1i64(, , , i32) +declare @llvm.vp.shl.nxv1i64(, , , i32) + + +define @vpudiv_by_const_no_add_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_no_add_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv1i64( %va, splat (i64 5), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_with_add_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_with_add_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 7 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, 
ta, ma +; CHECK-NEXT: vdivu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv1i64( %va, splat (i64 7), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_no_ashr_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_no_ashr_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 3 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 3), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_ashr_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_ashr_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 5), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_add_ashr_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_add_ashr_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 15 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 15), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_sub_ashr_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -3 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vdiv.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 -3), %m, i32 %evl) + ret %v +} + +define @vpurem_by_const_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_const_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vremu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv1i64( %va, splat (i64 5), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_const_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_const_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vrem.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv1i64( %va, splat (i64 5), %m, i32 %evl) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll new file mode 100644 index 0000000000000..c5159f7789d80 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vpdiv-by-const.ll @@ -0,0 +1,1595 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s + +declare @llvm.vp.udiv.nxv8i8(, , , i32) +declare @llvm.vp.udiv.nxv4i16(, , , i32) +declare @llvm.vp.udiv.nxv2i32(, , , i32) +declare @llvm.vp.udiv.nxv1i64(, , , i32) +declare @llvm.vp.sdiv.nxv8i8(, , , i32) +declare @llvm.vp.sdiv.nxv4i16(, , , i32) +declare @llvm.vp.sdiv.nxv2i32(, , , i32) +declare @llvm.vp.sdiv.nxv1i64(, , , i32) +declare @llvm.vp.urem.nxv8i8(, , , i32) +declare @llvm.vp.urem.nxv4i16(, , , i32) +declare @llvm.vp.urem.nxv2i32(, , , i32) +declare @llvm.vp.urem.nxv1i64(, , , i32) +declare @llvm.vp.srem.nxv8i8(, , , i32) +declare @llvm.vp.srem.nxv4i16(, , , i32) +declare @llvm.vp.srem.nxv2i32(, , , i32) +declare @llvm.vp.srem.nxv1i64(, , , i32) +declare @llvm.vp.shl.nxv8i8(, , , i32) +declare @llvm.vp.shl.nxv4i16(, , , i32) +declare @llvm.vp.shl.nxv2i32(, , , i32) +declare @llvm.vp.shl.nxv1i64(, , , i32) + + +define @vpudiv_by_max_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vpudiv_by_max_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv8i8( %va, splat (i8 255), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_max_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_max_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv4i16( %va, splat (i16 65535), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_max_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_max_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv2i32( %va, splat (i32 4294967295), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_max_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_max_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv1i64( %va, splat (i64 18446744073709551615), %m, i32 %evl) + ret %v +} + +define @fold_vpudiv_vpurem_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: fold_vpudiv_vpurem_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t +; CHECK-NEXT: vsll.vi v10, v9, 7, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv8i8( %va, splat (i8 128), %m, i32 %evl) + %u = call @llvm.vp.urem.nxv8i8( %va, splat (i8 128), %m, i32 %evl) + %x = add %v, %u + ret %x +} + +define @fold_vpudiv_vpurem_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: fold_vpudiv_vpurem_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t +; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv4i16( %va, splat (i16 16384), %m, i32 %evl) + %u = call @llvm.vp.urem.nxv4i16( %va, splat (i16 16384), %m, i32 %evl) + %x = add %v, %u + ret %x +} + +define @fold_vpudiv_vpurem_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: fold_vpudiv_vpurem_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t +; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv2i32( %va, splat (i32 16384), %m, i32 %evl) + %u = call @llvm.vp.urem.nxv2i32( %va, splat (i32 16384), %m, i32 %evl) + %x = add %v, %u + ret %x +} + +define @fold_vpudiv_vpurem_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: fold_vpudiv_vpurem_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t +; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; 
CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv1i64( %va, splat (i64 16384), %m, i32 %evl) + %u = call @llvm.vp.urem.nxv1i64( %va, splat (i64 16384), %m, i32 %evl) + %x = add %v, %u + ret %x +} + +define @vpudiv_by_shl2_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_shl2_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: addi a0, a0, 1 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %sh = shl i8 2, %b + %vec = insertelement poison, i8 %sh, i32 0 + %splat = shufflevector %vec, poison, zeroinitializer + %v = call @llvm.vp.udiv.nxv8i8( %va, %splat, %m, i32 %evl) + ret %v +} + +define @vpudiv_by_shl2_nxv4i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_shl2_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 48 +; CHECK-NEXT: srli a0, a0, 48 +; CHECK-NEXT: addi a0, a0, 1 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %sh = shl i16 2, %b + %vec = insertelement poison, i16 %sh, i32 0 + %splat = shufflevector %vec, poison, zeroinitializer + %v = call @llvm.vp.udiv.nxv4i16( %va, %splat, %m, i32 %evl) + ret %v +} + +define @vpudiv_by_shl2_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_shl2_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, 1 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %sh = shl i32 2, %b + %vec = insertelement poison, i32 %sh, i32 0 + %splat = shufflevector %vec, poison, zeroinitializer + %v = call @llvm.vp.udiv.nxv2i32( %va, %splat, %m, i32 %evl) + ret %v +} + +define @vpudiv_by_shl2_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_shl2_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, 1 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %sh = shl i64 2, %b + %vec = insertelement poison, i64 %sh, i32 0 + %splat = shufflevector %vec, poison, zeroinitializer + %v = call @llvm.vp.udiv.nxv1i64( %va, %splat, %m, i32 %evl) + ret %v +} + +define @vpudiv_by_vpshl2_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_vpshl2_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %vec2 = insertelement poison, i8 %b, i32 0 + %splat2 = shufflevector %vec2, poison, zeroinitializer + %sh = call @llvm.vp.shl.nxv8i8( splat (i8 4), %splat2, %m, i32 %evl) + %v = call @llvm.vp.udiv.nxv8i8( %va, %sh, %m, i32 %evl) + ret %v +} + +define @vpudiv_by_vpshl2_nxv4i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_vpshl2_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %vec2 = insertelement poison, i16 %b, i32 0 + %splat2 = shufflevector %vec2, poison, zeroinitializer + %sh = call @llvm.vp.shl.nxv4i16( splat (i16 4), %splat2, %m, i32 %evl) + %v = call @llvm.vp.udiv.nxv4i16( %va, %sh, %m, i32 %evl) + ret %v +} + +define @vpudiv_by_vpshl2_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_vpshl2_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t +; CHECK-NEXT: 
vsrl.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %vec2 = insertelement poison, i32 %b, i32 0 + %splat2 = shufflevector %vec2, poison, zeroinitializer + %sh = call @llvm.vp.shl.nxv2i32( splat (i32 4), %splat2, %m, i32 %evl) + %v = call @llvm.vp.udiv.nxv2i32( %va, %sh, %m, i32 %evl) + ret %v +} + +define @vpudiv_by_vpshl2_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_vpshl2_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %vec2 = insertelement poison, i64 %b, i32 0 + %splat2 = shufflevector %vec2, poison, zeroinitializer + %sh = call @llvm.vp.shl.nxv1i64( splat (i64 4), %splat2, %m, i32 %evl) + %v = call @llvm.vp.udiv.nxv1i64( %va, %sh, %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_no_add_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_no_add_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -51 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t +; CHECK-NEXT: vmv.v.i v10, 5 +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv8i8( %va, splat (i8 5), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_no_add_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_no_add_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 1048573 +; CHECK-NEXT: addi a1, a1, -819 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t +; CHECK-NEXT: vmv.v.i v10, 5 +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv4i16( %va, splat (i16 5), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_no_add_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_no_add_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 838861 +; CHECK-NEXT: addi a1, a1, -819 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t +; CHECK-NEXT: vmv.v.i v10, 5 +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv2i32( %va, splat (i32 5), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_no_add_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_no_add_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 5 +; CHECK-NEXT: lui a0, 838861 +; CHECK-NEXT: vmseq.vi v9, v9, 1, v0.t +; CHECK-NEXT: addiw a0, a0, -819 +; CHECK-NEXT: slli a1, a0, 32 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t +; CHECK-NEXT: vmv.v.v v0, v9 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv1i64( %va, splat (i64 5), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_with_add_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_with_add_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 37 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, -128 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t +; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t +; CHECK-NEXT: vmv.v.i v10, 7 +; CHECK-NEXT: vsrl.vi v9, v9, 2, 
v0.t +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv8i8( %va, splat (i8 7), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_with_add_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_with_add_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 2 +; CHECK-NEXT: addi a1, a1, 1171 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t +; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t +; CHECK-NEXT: vmv.v.i v10, 7 +; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv4i16( %va, splat (i16 7), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_with_add_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_with_add_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 149797 +; CHECK-NEXT: addi a1, a1, -1755 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t +; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t +; CHECK-NEXT: vmv.v.i v10, 7 +; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv2i32( %va, splat (i32 7), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_const_with_add_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_const_with_add_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI23_0) +; CHECK-NEXT: ld a1, %lo(.LCPI23_0)(a1) +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: slli a0, a0, 63 +; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t +; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t +; CHECK-NEXT: vmv.v.i v10, 7 +; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.udiv.nxv1i64( %va, splat (i64 7), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_neg1_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_neg1_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 -1), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_neg1_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_neg1_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 -1), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_neg1_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_neg1_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 -1), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_neg1_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_neg1_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 
-1), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_min_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_min_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -128 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 -128), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_min_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_min_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: slli a1, a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 -9223372036854775808), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_min_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_min_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 -32768), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_min_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_min_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmseq.vx v0, v8, a1, v0.t +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 -2147483648), %m, i32 %evl) + ret %v +} + +define @vpsdiv_pow2_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_pow2_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 4 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 15, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 14, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 4, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 4), %m, i32 %evl) + ret %v +} + +define @vpsdiv_pow2_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_pow2_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 4 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 7, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 6, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 4, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 4), %m, i32 %evl) + ret %v +} + +define @vpsdiv_pow2_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vpsdiv_pow2_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 4 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 31, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 30, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 4, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 4), %m, i32 %evl) + ret %v +} + +define @vpsdiv_pow2_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_pow2_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 4 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t +; CHECK-NEXT: li a0, 62 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 4, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 4), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_no_ashr_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_no_ashr_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 86 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 3), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_no_ashr_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_no_ashr_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 5 +; CHECK-NEXT: addi a1, a1, 1366 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 3), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_no_ashr_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_no_ashr_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 349525 +; CHECK-NEXT: addi a1, a1, 1366 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 3), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_no_ashr_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_no_ashr_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI39_0) +; CHECK-NEXT: ld a1, %lo(.LCPI39_0)(a1) +; CHECK-NEXT: vsetvli zero, a0, 
e64, m1, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 0, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 3), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_ashr_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_ashr_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 103 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 5), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_ashr_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_ashr_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 6 +; CHECK-NEXT: addi a1, a1, 1639 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 5), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_ashr_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_ashr_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 419430 +; CHECK-NEXT: addi a1, a1, 1639 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 5), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_ashr_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_ashr_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI43_0) +; CHECK-NEXT: ld a1, %lo(.LCPI43_0)(a1) +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 5), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_add_ashr_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_add_ashr_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -109 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 7), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_add_ashr_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_add_ashr_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 1048569 +; CHECK-NEXT: addi a1, a1, -1911 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret 
+ %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 15), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_add_ashr_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_add_ashr_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 599186 +; CHECK-NEXT: addi a1, a1, 1171 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 7), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_add_ashr_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_add_ashr_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 559241 +; CHECK-NEXT: addiw a1, a1, -1911 +; CHECK-NEXT: slli a2, a1, 32 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsrl.vx v9, v8, a0, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 15), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_sub_ashr_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 109 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 -7), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_sub_ashr_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 7 +; CHECK-NEXT: addi a1, a1, 1911 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 3, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 15, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 -15), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_sub_ashr_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 449390 +; CHECK-NEXT: addi a1, a1, -1171 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 2, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 31, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 -7), %m, i32 %evl) + ret %v +} + +define @vpsdiv_const_sub_ashr_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_const_sub_ashr_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 349525 +; CHECK-NEXT: addiw a1, a1, 1365 +; CHECK-NEXT: slli a2, a1, 32 +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsub.vv v8, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v8, v8, 1, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsrl.vx 
v9, v8, a0, v0.t +; CHECK-NEXT: vand.vi v9, v9, -1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 -3), %m, i32 %evl) + ret %v +} + +define @vpurem_by_max_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_max_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t +; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv1i64( %va, splat (i64 18446744073709551615), %m, i32 %evl) + ret %v +} + +define @vpurem_by_max_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_max_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t +; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv4i16( %va, splat (i16 65535), %m, i32 %evl) + ret %v +} + +define @vpurem_by_max_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_max_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t +; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv8i8( %va, splat (i8 255), %m, i32 %evl) + ret %v +} + +define @vpurem_by_max_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_max_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t +; CHECK-NEXT: vmerge.vim v8, v8, 0, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv2i32( %va, splat (i32 4294967295), %m, i32 %evl) + ret %v +} + +define @vpurem_by_const_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_const_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 5 +; CHECK-NEXT: lui a0, 838861 +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: addiw a0, a0, -819 +; CHECK-NEXT: slli a1, a0, 32 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0 +; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv1i64( %va, splat (i64 5), %m, i32 %evl) + ret %v +} + +define @vpurem_by_const_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_const_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: lui a0, 1048573 +; CHECK-NEXT: vmv.v.i v10, 5 +; CHECK-NEXT: addi a0, a0, -819 +; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv4i16( %va, splat (i16 5), %m, i32 %evl) + ret %v +} + +define @vpurem_by_const_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_const_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: li a0, -51 +; CHECK-NEXT: vmv.v.i v10, 5 +; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: 
vmerge.vvm v10, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv8i8( %va, splat (i8 5), %m, i32 %evl) + ret %v +} + +define @vpurem_by_const_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_const_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: lui a0, 838861 +; CHECK-NEXT: vmv.v.i v10, 5 +; CHECK-NEXT: addi a0, a0, -819 +; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmul.vx v10, v10, a0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv2i32( %va, splat (i32 5), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_const_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_const_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI60_0) +; CHECK-NEXT: ld a1, %lo(.LCPI60_0)(a1) +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vx v10, v9, a0, v0.t +; CHECK-NEXT: vand.vi v10, v10, -1, v0.t +; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t +; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: vmul.vx v9, v9, a0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv1i64( %va, splat (i64 5), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_const_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_const_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 6 +; CHECK-NEXT: addi a1, a1, 1639 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v10, v9, 15, v0.t +; CHECK-NEXT: vand.vi v10, v10, -1, v0.t +; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t +; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: vmul.vx v9, v9, a0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv4i16( %va, splat (i16 5), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_const_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_const_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 103 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v10, v9, 7, v0.t +; CHECK-NEXT: vand.vi v10, v10, -1, v0.t +; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t +; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: vmul.vx v9, v9, a0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv8i8( %va, splat (i8 5), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_const_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_const_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, 419430 +; CHECK-NEXT: addi a1, a1, 1639 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v10, v9, 31, v0.t +; CHECK-NEXT: vand.vi v10, v10, -1, v0.t +; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t +; CHECK-NEXT: li a0, 5 +; CHECK-NEXT: vmul.vx v9, v9, a0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv2i32( %va, splat (i32 5), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_1_nxv8i8( %va, %m, 
i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_1_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 7, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 8, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 1), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_1_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_1_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 15, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 16, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 1), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_1_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_1_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 31, v0.t +; CHECK-NEXT: vsrl.vx v11, v11, a0, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 1), %m, i32 %evl) + ret %v +} + +define @vpudiv_by_1_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudiv_by_1_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 1), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_1_nxv8i8( %va, %m, 
i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_1_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 7, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 8, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 1), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_1_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_1_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 15, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 16, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv4i16( %va, splat (i16 1), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_1_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_1_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsra.vi v11, v8, 31, v0.t +; CHECK-NEXT: vsrl.vx v11, v11, a0, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv2i32( %va, splat (i32 1), %m, i32 %evl) + ret %v +} + +define @vpsdiv_by_1_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdiv_by_1_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v12, 1, v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %v = call @llvm.vp.sdiv.nxv1i64( %va, splat (i64 1), %m, i32 %evl) + ret %v +} + +define @vpurem_by_1_nxv8i8( %va, %m, 
i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_1_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv8i8( %va, splat (i8 1), %m, i32 %evl) + ret %v +} + +define @vpurem_by_1_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_1_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv4i16( %va, splat (i16 1), %m, i32 %evl) + ret %v +} + +define @vpurem_by_1_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_1_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv2i32( %va, splat (i32 1), %m, i32 %evl) + ret %v +} + +define @vpurem_by_1_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpurem_by_1_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.urem.nxv1i64( %va, splat (i64 1), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_1_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_1_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v12 +; CHECK-NEXT: vsra.vi v12, v8, 7, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 8, v0.t +; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v11, 1, v0.t +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv8i8( %va, splat (i8 1), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_1_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_1_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v12 +; CHECK-NEXT: vsra.vi v12, v8, 15, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 16, v0.t +; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v11, 1, v0.t +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv4i16( %va, splat (i16 1), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_1_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_1_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t +; 
CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v12 +; CHECK-NEXT: vsra.vi v12, v8, 31, v0.t +; CHECK-NEXT: vsrl.vx v12, v12, a0, v0.t +; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v11, 1, v0.t +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv2i32( %va, splat (i32 1), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_1_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_1_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v11, v10, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v13, 1, v0.t +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv1i64( %va, splat (i64 1), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_neg1_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_neg1_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, -1 +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v12 +; CHECK-NEXT: vsra.vi v12, v8, 7, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 8, v0.t +; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v11, -1, v0.t +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv8i8( %va, splat (i8 -1), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_neg1_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_neg1_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, -1 +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v12 +; CHECK-NEXT: vsra.vi v12, v8, 15, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 16, v0.t +; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v11, -1, v0.t +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = 
call @llvm.vp.srem.nxv4i16( %va, splat (i16 -1), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_neg1_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_neg1_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, -1 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmseq.vi v12, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vmor.mm v10, v10, v12 +; CHECK-NEXT: vsra.vi v12, v8, 31, v0.t +; CHECK-NEXT: vsrl.vx v12, v12, a0, v0.t +; CHECK-NEXT: vadd.vv v12, v8, v12, v0.t +; CHECK-NEXT: vsra.vi v12, v12, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v12, v10, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v11, -1, v0.t +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv2i32( %va, splat (i32 -1), %m, i32 %evl) + ret %v +} + +define @vpsrem_by_neg1_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsrem_by_neg1_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, -1 +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmseq.vi v11, v10, -1, v0.t +; CHECK-NEXT: vmseq.vi v10, v10, 1, v0.t +; CHECK-NEXT: vsra.vx v12, v8, a0, v0.t +; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vmv.v.i v13, 0 +; CHECK-NEXT: vmor.mm v10, v10, v11 +; CHECK-NEXT: vsrl.vx v11, v12, a0, v0.t +; CHECK-NEXT: vadd.vv v11, v8, v11, v0.t +; CHECK-NEXT: vsra.vi v11, v11, 0, v0.t +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v11, v10, 0, v0.t +; CHECK-NEXT: vmsgt.vi v0, v13, -1, v0.t +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv1i64( %va, splat (i64 -1), %m, i32 %evl) + ret %v +} + +define @vpsdivrem_nxv8i8( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpsdivrem_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 109 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmulh.vx v9, v8, a1, v0.t +; CHECK-NEXT: li a0, -7 +; CHECK-NEXT: vsub.vv v9, v9, v8, v0.t +; CHECK-NEXT: vsra.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v10, v9, 7, v0.t +; CHECK-NEXT: vand.vi v10, v10, -1, v0.t +; CHECK-NEXT: vadd.vv v9, v9, v10, v0.t +; CHECK-NEXT: vmul.vx v10, v9, a0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.srem.nxv8i8( %va, splat (i8 -7), %m, i32 %evl) + %w = call @llvm.vp.sdiv.nxv8i8( %va, splat (i8 -7), %m, i32 %evl) + %x = call @llvm.vp.add.nxv8i8( %v, %w, %m, i32 %evl) + ret %x +} + +define @vpudivrem_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpudivrem_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vmv.v.i v10, 7 +; CHECK-NEXT: li a0, 37 +; CHECK-NEXT: vmulhu.vx v11, v8, a0, v0.t +; CHECK-NEXT: li a0, -128 +; CHECK-NEXT: vsub.vv v12, v8, v11, v0.t +; CHECK-NEXT: vmulhu.vx v12, v12, a0, v0.t +; CHECK-NEXT: vadd.vv v11, v12, v11, v0.t +; CHECK-NEXT: vsrl.vi v11, v11, 2, v0.t +; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t +; CHECK-NEXT: li a0, 7 +; CHECK-NEXT: vmerge.vvm v10, v11, v8, v0 +; CHECK-NEXT: 
vmv1r.v v0, v9 +; CHECK-NEXT: vmul.vx v11, v10, a0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsub.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 7), <vscale x 8 x i1> %m, i32 %evl) + %w = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 7), <vscale x 8 x i1> %m, i32 %evl) + %x = call <vscale x 8 x i8> @llvm.vp.add.nxv8i8(<vscale x 8 x i8> %v, <vscale x 8 x i8> %w, <vscale x 8 x i1> %m, i32 %evl) + ret <vscale x 8 x i8> %x +}
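For reference, the multiply-high expansions these CHECK lines test all follow the usual round-toward-zero division-by-constant recipe: take the high half of a widened multiply by a magic constant, shift arithmetically, then add back the sign bit; the srem/urem cases recompose the remainder as x - (x / c) * c with a trailing vmul.vx / vsub.vv. Below is a minimal scalar sketch of the i8 divide-by-5 case, using the constant 103 and shift amount 1 that appear in the vmulh/vsra sequences above. It is only an illustration of the arithmetic, not code from the patch, and it assumes >> on a negative signed value behaves as an arithmetic shift (which is what vsra provides).

#include <stdint.h>

/* Illustrative scalar model of the masked vector sequence checked by the
 * nxv8i8 sdiv-by-5 tests (vmulh.vx 103, vsra.vi 1, vsrl.vi 7, vadd.vv).
 * Not part of the patch; assumes arithmetic right shift of signed values. */
static int8_t sdiv5_via_mulh(int8_t x) {
  int16_t prod = (int16_t)x * 103;  /* widen and multiply by the magic constant */
  int8_t hi = (int8_t)(prod >> 8);  /* vmulh.vx: keep the high byte of the product */
  int8_t q = hi >> 1;               /* vsra.vi v8, v8, 1 */
  q += (uint8_t)q >> 7;             /* vsrl.vi + vadd.vv: add the sign bit to round toward zero */
  return q;                         /* equals x / 5 for every int8_t x */
}

/* The srem tests recompose the remainder from the quotient:
 * x % 5 == x - (x / 5) * 5, matching the trailing vmul.vx / vsub.vv. */
static int8_t srem5_via_mulh(int8_t x) {
  return (int8_t)(x - sdiv5_via_mulh(x) * 5);
}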