From 3769e1f169796f9ec591a29145ea9eb7608fe576 Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Sun, 27 Apr 2025 23:32:25 -0700
Subject: [PATCH 1/9] [VPlan] Implement transformation for
 widen-cast/widen-mul + reduction to abstract recipe.

This patch introduces two new recipes:

* VPExtendedReductionRecipe - cast + reduction.
* VPMulAccumulateReductionRecipe - (cast) + mul + reduction.

This patch also implements the transformation that matches the following
patterns via VPlan and converts them to abstract recipes for better cost
estimation:

* VPExtendedReductionRecipe
  - reduce(cast(...))
* VPMulAccumulateReductionRecipe
  - reduce.add(mul(...))
  - reduce.add(mul(ext(...), ext(...)))
  - reduce.add(ext(mul(ext(...), ext(...))))

The converted abstract recipes will be lowered to the concrete recipes
(widen-cast + widen-mul + reduction) just before recipe execution.

Split from #113903.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  19 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         | 259 +++++++++++++++++-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   2 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 101 ++++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 247 +++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |   7 +
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   2 +
 .../LoopVectorize/ARM/mve-reduction-types.ll  |   4 +-
 .../LoopVectorize/ARM/mve-reductions.ll       | 120 ++++----
 .../LoopVectorize/reduction-inloop-pred.ll    |   2 +-
 .../LoopVectorize/reduction-inloop.ll         |   8 +-
 .../vplan-printing-reductions.ll              | 145 ++++++++++
 12 files changed, 838 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1611c6d3a4437..9af481d9a172f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9568,10 +9568,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
          "entry block must be set to a VPRegionBlock having a non-empty entry "
          "VPBasicBlock");
 
-  for (ElementCount VF : Range)
-    Plan->addVF(VF);
-  Plan->setName("Initial VPlan");
-
   // Update wide induction increments to use the same step as the corresponding
   // wide induction. This enables detecting induction increments directly in
   // VPlan and removes redundant splats.
@@ -9601,6 +9597,21 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range,
   // Adjust the recipes for any inloop reductions.
   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
 
+  // Transform recipes to abstract recipes if it is legal and beneficial and
+  // clamp the range for better cost estimation.
+  // TODO: Enable following transform when the EVL-version of extended-reduction
+  // and mulacc-reduction are implemented.
+  if (!CM.foldTailWithEVL()) {
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+                          CM.CostKind);
+    VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
+                             CostCtx, Range);
+  }
+
+  for (ElementCount VF : Range)
+    Plan->addVF(VF);
+  Plan->setName("Initial VPlan");
+
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a
   // single VPInterleaveRecipe at its insertion point.
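As a condensed illustration of the transform wired in above (adapted from the
print_mulacc_extended test in the new vplan-printing-reductions.ll at the end
of this patch; names and types are taken from that test and are illustrative,
not normative), a scalar reduce.add(ext(mul(ext, ext))) chain such as

    %conv0    = sext i16 %load0 to i32
    %conv1    = sext i16 %load1 to i32
    %mul      = mul nsw i32 %conv0, %conv1
    %conv     = sext i32 %mul to i64
    %rdx.next = add nsw i64 %rdx, %conv

is folded into a single abstract recipe for cost modeling,

    MULACC-REDUCE vp<%rdx.next> = ir<%rdx> + reduce.add (mul nsw
        (ir<%load0> extended to i64), (ir<%load1> extended to i64))

and lowered back to widen-cast + widen-mul + reduction recipes just before
recipe execution.
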
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 2c4cac7655ec9..598413d7ddb74 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -517,6 +517,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPInstructionSC: case VPRecipeBase::VPReductionEVLSC: case VPRecipeBase::VPReductionSC: + case VPRecipeBase::VPMulAccumulateReductionSC: + case VPRecipeBase::VPExtendedReductionSC: case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPVectorPointerSC: @@ -601,13 +603,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {} }; + struct NonNegFlagsTy { + char NonNeg : 1; + NonNegFlagsTy(bool IsNonNeg) : NonNeg(IsNonNeg) {} + }; + private: struct ExactFlagsTy { char IsExact : 1; }; - struct NonNegFlagsTy { - char NonNeg : 1; - }; struct FastMathFlagsTy { char AllowReassoc : 1; char NoNaNs : 1; @@ -697,6 +701,12 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::DisjointOp), DisjointFlags(DisjointFlags) {} + template + VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, + NonNegFlagsTy NonNegFlags, DebugLoc DL = {}) + : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::NonNegOp), + NonNegFlags(NonNegFlags) {} + protected: VPRecipeWithIRFlags(const unsigned char SC, ArrayRef Operands, GEPNoWrapFlags GEPFlags, DebugLoc DL = {}) @@ -715,7 +725,9 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || R->getVPDefID() == VPRecipeBase::VPVectorEndPointerSC || - R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; + R->getVPDefID() == VPRecipeBase::VPVectorPointerSC || + R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC || + R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC; } static inline bool classof(const VPUser *U) { @@ -812,6 +824,15 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { FastMathFlags getFastMathFlags() const; + /// Returns true if the recipe has non-negative flag. 
+ bool hasNonNegFlag() const { return OpType == OperationType::NonNegOp; } + + bool isNonNeg() const { + assert(OpType == OperationType::NonNegOp && + "recipe doesn't have a NNEG flag"); + return NonNegFlags.NonNeg; + } + bool hasNoUnsignedWrap() const { assert(OpType == OperationType::OverflowingBinOp && "recipe doesn't have a NUW flag"); @@ -1289,10 +1310,21 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I), Opcode(I.getOpcode()) {} + template + VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, + iterator_range Operands, bool NUW, bool NSW, DebugLoc DL) + : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL), + Opcode(Opcode) {} + public: VPWidenRecipe(Instruction &I, ArrayRef Operands) : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} + template + VPWidenRecipe(unsigned Opcode, iterator_range Operands, bool NUW, + bool NSW, DebugLoc DL) + : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {} + ~VPWidenRecipe() override = default; VPWidenRecipe *clone() override { @@ -1337,8 +1369,15 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { "opcode of underlying cast doesn't match"); } - VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPIRMetadata(), + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, DL), VPIRMetadata(), + Opcode(Opcode), ResultTy(ResultTy) {} + + VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + bool IsNonNeg, DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, NonNegFlagsTy(IsNonNeg), + DL), Opcode(Opcode), ResultTy(ResultTy) {} ~VPWidenCastRecipe() override = default; @@ -2381,6 +2420,28 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { setUnderlyingValue(I); } + /// For VPExtendedReductionRecipe. + /// Note that the debug location is from the extend. + VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind, + ArrayRef Operands, VPValue *CondOp, + bool IsOrdered, DebugLoc DL) + : VPRecipeWithIRFlags(SC, Operands, DL), RdxKind(RdxKind), + IsOrdered(IsOrdered), IsConditional(CondOp) { + if (CondOp) + addOperand(CondOp); + } + + /// For VPMulAccumulateReductionRecipe. + /// Note that the NUW/NSW flags and the debug location are from the Mul. 
+ VPReductionRecipe(const unsigned char SC, const RecurKind RdxKind, + ArrayRef Operands, VPValue *CondOp, + bool IsOrdered, WrapFlagsTy WrapFlags, DebugLoc DL) + : VPRecipeWithIRFlags(SC, Operands, WrapFlags, DL), RdxKind(RdxKind), + IsOrdered(IsOrdered), IsConditional(CondOp) { + if (CondOp) + addOperand(CondOp); + } + public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, @@ -2389,6 +2450,13 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { ArrayRef({ChainOp, VecOp}), CondOp, IsOrdered, DL) {} + VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, + bool IsOrdered, DebugLoc DL = {}) + : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, + ArrayRef({ChainOp, VecOp}), CondOp, + IsOrdered, DL) {} + ~VPReductionRecipe() override = default; VPReductionRecipe *clone() override { @@ -2399,7 +2467,9 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || + R->getVPDefID() == VPRecipeBase::VPExtendedReductionSC || + R->getVPDefID() == VPRecipeBase::VPMulAccumulateReductionSC; } static inline bool classof(const VPUser *U) { @@ -2538,6 +2608,181 @@ class VPReductionEVLRecipe : public VPReductionRecipe { } }; +/// A recipe to represent inloop extended reduction operations, performing a +/// reduction on a extended vector operand into a scalar value, and adding the +/// result to a chain. This recipe is abstract and needs to be lowered to +/// concrete recipes before codegen. The operands are {ChainOp, VecOp, +/// [Condition]}. +class VPExtendedReductionRecipe : public VPReductionRecipe { + /// Opcode of the extend recipe will be lowered to. + Instruction::CastOps ExtOp; + + Type *ResultTy; + + /// For cloning VPExtendedReductionRecipe. + VPExtendedReductionRecipe(VPExtendedReductionRecipe *ExtRed) + : VPReductionRecipe( + VPDef::VPExtendedReductionSC, ExtRed->getRecurrenceKind(), + {ExtRed->getChainOp(), ExtRed->getVecOp()}, ExtRed->getCondOp(), + ExtRed->isOrdered(), ExtRed->getDebugLoc()), + ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) { + transferFlags(*ExtRed); + } + +public: + VPExtendedReductionRecipe(VPReductionRecipe *R, VPWidenCastRecipe *Ext) + : VPReductionRecipe(VPDef::VPExtendedReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(), + R->isOrdered(), Ext->getDebugLoc()), + ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) { + // Not all WidenCastRecipes contain nneg flag. Need to transfer flags from + // the original recipe to prevent setting wrong flags. + transferFlags(*Ext); + } + + ~VPExtendedReductionRecipe() override = default; + + VPExtendedReductionRecipe *clone() override { + auto *Copy = new VPExtendedReductionRecipe(this); + Copy->transferFlags(*this); + return Copy; + } + + VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC); + + void execute(VPTransformState &State) override { + llvm_unreachable("VPExtendedReductionRecipe should be transform to " + "VPExtendedRecipe + VPReductionRecipe before execution."); + }; + + /// Return the cost of VPExtendedReductionRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// The scalar type after extending. + Type *getResultType() const { return ResultTy; } + + /// Is the extend ZExt? + bool isZExt() const { return getExtOpcode() == Instruction::ZExt; } + + /// The opcode of extend recipe. + Instruction::CastOps getExtOpcode() const { return ExtOp; } +}; + +/// A recipe to represent inloop MulAccumulateReduction operations, performing a +/// reduction.add on the result of vector operands (might be extended) +/// multiplication into a scalar value, and adding the result to a chain. This +/// recipe is abstract and needs to be lowered to concrete recipes before +/// codegen. The operands are {ChainOp, VecOp1, VecOp2, [Condition]}. +class VPMulAccumulateReductionRecipe : public VPReductionRecipe { + /// Opcode of the extend recipe. + Instruction::CastOps ExtOp; + + /// Non-neg flag of the extend recipe. + bool IsNonNeg = false; + + Type *ResultTy; + + /// For cloning VPMulAccumulateReductionRecipe. + VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, MulAcc->getRecurrenceKind(), + {MulAcc->getChainOp(), MulAcc->getVecOp0(), MulAcc->getVecOp1()}, + MulAcc->getCondOp(), MulAcc->isOrdered(), + WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), + MulAcc->getDebugLoc()), + ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), + ResultTy(MulAcc->getResultType()) {} + +public: + VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, + VPWidenCastRecipe *Ext0, + VPWidenCastRecipe *Ext1, Type *ResultTy) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Ext0->getOperand(0), Ext1->getOperand(0)}, + R->getCondOp(), R->isOrdered(), + WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), + R->getDebugLoc()), + ExtOp(Ext0->getOpcode()), ResultTy(ResultTy) { + assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == + Instruction::Add && + "The reduction instruction in MulAccumulateteReductionRecipe must " + "be Add"); + // Only set the non-negative flag if the original recipe contains. + if (Ext0->hasNonNegFlag()) + IsNonNeg = Ext0->isNonNeg(); + } + + VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul) + : VPReductionRecipe( + VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), + {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)}, + R->getCondOp(), R->isOrdered(), + WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), + R->getDebugLoc()), + ExtOp(Instruction::CastOps::CastOpsEnd) { + assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == + Instruction::Add && + "The reduction instruction in MulAccumulateReductionRecipe must be " + "Add"); + } + + ~VPMulAccumulateReductionRecipe() override = default; + + VPMulAccumulateReductionRecipe *clone() override { + auto *Copy = new VPMulAccumulateReductionRecipe(this); + Copy->transferFlags(*this); + return Copy; + } + + VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC); + + void execute(VPTransformState &State) override { + llvm_unreachable("VPMulAccumulateReductionRecipe should transform to " + "VPWidenCastRecipe + " + "VPWidenRecipe + VPReductionRecipe before execution"); + } + + /// Return the cost of VPMulAccumulateReductionRecipe. 
+ InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + Type *getResultType() const { + assert(isExtended() && "Only support getResultType when this recipe " + "contains implicit extend."); + return ResultTy; + } + + /// The VPValue of the vector value to be extended and reduced. + VPValue *getVecOp0() const { return getOperand(1); } + VPValue *getVecOp1() const { return getOperand(2); } + + /// Return if this MulAcc recipe contains extended operands. + bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; } + + /// Return the opcode of the extends for the operands. + Instruction::CastOps getExtOpcode() const { return ExtOp; } + + /// Return if the operands are zero extended. + bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; } + + /// Return the non negative flag of the ext recipe. + bool isNonNeg() const { return IsNonNeg; } +}; + /// VPReplicateRecipe replicates a given instruction producing multiple scalar /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index c86815c84d8d9..7dcbd72c25191 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -273,6 +273,8 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { // TODO: Use info from interleave group. return V->getUnderlyingValue()->getType(); }) + .Case( + [](const auto *R) { return R->getResultType(); }) .Case([](const VPExpandSCEVRecipe *R) { return R->getSCEV()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3c7ab7d24bf6d..04eab649713bc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -73,6 +73,8 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: @@ -120,6 +122,8 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: @@ -157,6 +161,8 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPExtendedReductionSC: + case VPMulAccumulateReductionSC: case VPScalarIVStepsSC: case VPVectorPointerSC: case VPWidenCanonicalIVSC: @@ -2521,28 +2527,49 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, auto *VectorTy = cast(toVectorTy(ElementTy, VF)); unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); FastMathFlags FMFs = getFastMathFlags(); + std::optional OptionalFMF = + ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; - // TODO: Support any-of and in-loop reductions. + // TODO: Support any-of reductions. 
assert( (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) || ForceTargetInstructionCost.getNumOccurrences() > 0) && "Any-of reduction not implemented in VPlan-based cost model currently."); - assert( - (!cast(getOperand(0))->isInLoop() || - ForceTargetInstructionCost.getNumOccurrences() > 0) && - "In-loop reduction not implemented in VPlan-based cost model currently."); - // Cost = Reduction cost + BinOp cost - InstructionCost Cost = - Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); - return Cost + - Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); + return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); } - return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs, - Ctx.CostKind); + return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF, + Ctx.CostKind); +} + +InstructionCost +VPExtendedReductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind()); + Type *RedTy = Ctx.Types.inferScalarType(this); + auto *SrcVecTy = + cast(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF)); + assert(RedTy->isIntegerTy() && + "ExtendedReduction only support integer type currently."); + InstructionCost Cost = Ctx.TTI.getExtendedReductionCost( + Opcode, isZExt(), RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); + // The cost of this recipe should be decided by the legacy model. + return Cost.isValid() ? 0 : Cost; +} + +InstructionCost +VPMulAccumulateReductionRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + Type *RedTy = Ctx.Types.inferScalarType(this); + auto *SrcVecTy = + cast(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF)); + InstructionCost Cost = + Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy, Ctx.CostKind); + // The cost of this recipe should be decided by the legacy model. + return Cost.isValid() ? 0 : Cost; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2587,6 +2614,56 @@ void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent, } O << ")"; } + +void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EXTENDED-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " +"; + O << " reduce." + << Instruction::getOpcodeName( + RecurrenceDescriptor::getOpcode(getRecurrenceKind())) + << " ("; + getVecOp()->printAsOperand(O, SlotTracker); + O << " extended to " << *getResultType(); + if (isConditional()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} + +void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "MULACC-REDUCE "; + printAsOperand(O, SlotTracker); + O << " = "; + getChainOp()->printAsOperand(O, SlotTracker); + O << " + "; + O << "reduce." 
+ << Instruction::getOpcodeName( + RecurrenceDescriptor::getOpcode(getRecurrenceKind())) + << " ("; + O << "mul"; + printFlags(O); + if (isExtended()) + O << "("; + getVecOp0()->printAsOperand(O, SlotTracker); + if (isExtended()) + O << " extended to " << *getResultType() << "), ("; + else + O << ", "; + getVecOp1()->printAsOperand(O, SlotTracker); + if (isExtended()) + O << " extended to " << *getResultType() << ")"; + if (isConditional()) { + O << ", "; + getCondOp()->printAsOperand(O, SlotTracker); + } + O << ")"; +} #endif /// A helper function to scalarize a single Instruction in the innermost loop. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 806c20ef8cf73..5e56104c875af 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2392,6 +2392,82 @@ void VPlanTransforms::createInterleaveGroups( } } +// Expand VPExtendedReductionRecipe to VPWidenCastRecipe + VPReductionRecipe. +static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) { + VPWidenCastRecipe *Ext; + // Only ZExt contains non-neg flags. + if (ExtRed->isZExt()) + Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), + ExtRed->getResultType(), ExtRed->isNonNeg(), + ExtRed->getDebugLoc()); + else + Ext = new VPWidenCastRecipe(ExtRed->getExtOpcode(), ExtRed->getVecOp(), + ExtRed->getResultType(), ExtRed->getDebugLoc()); + + auto *Red = new VPReductionRecipe( + ExtRed->getRecurrenceKind(), FastMathFlags(), ExtRed->getChainOp(), Ext, + ExtRed->getCondOp(), ExtRed->isOrdered(), ExtRed->getDebugLoc()); + Ext->insertBefore(ExtRed); + Red->insertBefore(ExtRed); + ExtRed->replaceAllUsesWith(Red); + ExtRed->eraseFromParent(); +} + +// Expand VPMulAccumulateReductionRecipe to VPWidenRecipe (mul) + +// VPReductionRecipe (reduce.add) +// + VPWidenCastRecipe (optional). +static void +expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) { + // Generate inner VPWidenCastRecipes if necessary. + // Note that we will drop the extend after mul which transform + // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)). + VPValue *Op0, *Op1; + if (MulAcc->isExtended()) { + Type *RedTy = MulAcc->getResultType(); + if (MulAcc->isZExt()) + Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(), + RedTy, MulAcc->isNonNeg(), + MulAcc->getDebugLoc()); + else + Op0 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(), + RedTy, MulAcc->getDebugLoc()); + Op0->getDefiningRecipe()->insertBefore(MulAcc); + // Prevent reduce.add(mul(ext(A), ext(A))) generate duplicate + // VPWidenCastRecipe. + if (MulAcc->getVecOp0() == MulAcc->getVecOp1()) { + Op1 = Op0; + } else { + if (MulAcc->isZExt()) + Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(), + RedTy, MulAcc->isNonNeg(), + MulAcc->getDebugLoc()); + else + Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp1(), + RedTy, MulAcc->getDebugLoc()); + Op1->getDefiningRecipe()->insertBefore(MulAcc); + } + } else { + // No extends in this MulAccRecipe. 
+ Op0 = MulAcc->getVecOp0(); + Op1 = MulAcc->getVecOp1(); + } + + std::array MulOps = {Op0, Op1}; + auto *Mul = new VPWidenRecipe( + Instruction::Mul, make_range(MulOps.begin(), MulOps.end()), + MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap(), + MulAcc->getDebugLoc()); + Mul->insertBefore(MulAcc); + + auto *Red = new VPReductionRecipe( + MulAcc->getRecurrenceKind(), FastMathFlags(), MulAcc->getChainOp(), Mul, + MulAcc->getCondOp(), MulAcc->isOrdered(), MulAcc->getDebugLoc()); + Red->insertBefore(MulAcc); + + MulAcc->replaceAllUsesWith(Red); + MulAcc->eraseFromParent(); +} + void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { using namespace llvm::VPlanPatternMatch; @@ -2454,6 +2530,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, VPI->replaceAllUsesWith(VectorStep); ToRemove.push_back(VPI); } + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + if (auto *ExtRed = dyn_cast(&R)) + expandVPExtendedReduction(ExtRed); + if (auto *MulAcc = dyn_cast(&R)) + expandVPMulAccumulateReduction(MulAcc); + } } for (VPRecipeBase *R : ToRemove) @@ -2551,6 +2633,171 @@ void VPlanTransforms::handleUncountableEarlyExit( LatchExitingBranch->eraseFromParent(); } +/// This function tries convert extended in-loop reductions to +/// VPExtendedReductionRecipe and clamp the \p Range if it is beneficial and +/// valid. The created recipe must be lowered to concrete +/// recipes before execution. +static VPExtendedReductionRecipe * +tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, + VFRange &Range) { + using namespace VPlanPatternMatch; + + Type *RedTy = Ctx.Types.inferScalarType(Red); + VPValue *VecOp = Red->getVecOp(); + + // Clamp the range if using extended-reduction is profitable. + auto IsExtendedRedValidAndClampRange = [&](unsigned Opcode, bool isZExt, + Type *SrcTy) -> bool { + return LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + auto *SrcVecTy = cast(toVectorTy(SrcTy, VF)); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost ExtRedCost = Ctx.TTI.getExtendedReductionCost( + Opcode, isZExt, RedTy, SrcVecTy, Red->getFastMathFlags(), + CostKind); + InstructionCost ExtCost = + cast(VecOp)->computeCost(VF, Ctx); + InstructionCost RedCost = Red->computeCost(VF, Ctx); + return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost; + }, + Range); + }; + + VPValue *A; + // Match reduce(ext)). + if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) && + IsExtendedRedValidAndClampRange( + RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()), + cast(VecOp)->getOpcode() == + Instruction::CastOps::ZExt, + Ctx.Types.inferScalarType(A))) + return new VPExtendedReductionRecipe(Red, cast(VecOp)); + + return nullptr; +} + +/// This function tries convert extended in-loop reductions to +/// VPMulAccumulateReductionRecipe and clamp the \p Range if it is beneficial +/// and valid. The created VPExtendedReductionRecipe must be lower to concrete +/// recipes before execution. Patterns of MulAccumulateReduction: +/// reduce.add(mul(...)), +/// reduce.add(mul(ext(A), ext(B))), +/// reduce.add(ext(mul(ext(A), ext(B)))). +static VPMulAccumulateReductionRecipe * +tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, + VPCostContext &Ctx, VFRange &Range) { + using namespace VPlanPatternMatch; + + Type *RedTy = Ctx.Types.inferScalarType(Red); + + // Clamp the range if using multiply-accumulate-reduction is profitable. 
+ auto IsMulAccValidAndClampRange = + [&](bool isZExt, VPWidenRecipe *Mul, VPWidenCastRecipe *Ext0, + VPWidenCastRecipe *Ext1, VPWidenCastRecipe *OuterExt) -> bool { + return LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) { + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Type *SrcTy = + Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy; + auto *SrcVecTy = cast(toVectorTy(SrcTy, VF)); + InstructionCost MulAccCost = + Ctx.TTI.getMulAccReductionCost(isZExt, RedTy, SrcVecTy, CostKind); + InstructionCost MulCost = Mul->computeCost(VF, Ctx); + InstructionCost RedCost = Red->computeCost(VF, Ctx); + InstructionCost ExtCost = 0; + if (Ext0) + ExtCost += Ext0->computeCost(VF, Ctx); + if (Ext1) + ExtCost += Ext1->computeCost(VF, Ctx); + if (OuterExt) + ExtCost += OuterExt->computeCost(VF, Ctx); + + return MulAccCost.isValid() && + MulAccCost < ExtCost + MulCost + RedCost; + }, + Range); + }; + + unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); + if (Opcode != Instruction::Add) + return nullptr; + + VPValue *VecOp = Red->getVecOp(); + VPValue *A, *B; + // Try to match reduce.add(mul(...)) + if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { + auto *RecipeA = + dyn_cast_if_present(A->getDefiningRecipe()); + auto *RecipeB = + dyn_cast_if_present(B->getDefiningRecipe()); + auto *Mul = cast(VecOp->getDefiningRecipe()); + + // Match reduce.add(mul(ext, ext)) + if (RecipeA && RecipeB && + (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && + match(RecipeA, m_ZExtOrSExt(m_VPValue())) && + match(RecipeB, m_ZExtOrSExt(m_VPValue())) && + IsMulAccValidAndClampRange(RecipeA->getOpcode() == + Instruction::CastOps::ZExt, + Mul, RecipeA, RecipeB, nullptr)) + return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB, + RecipeA->getResultType()); + // Match reduce.add(mul) + if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) + return new VPMulAccumulateReductionRecipe(Red, Mul); + } + // Match reduce.add(ext(mul(ext(A), ext(B)))) + // All extend recipes must have same opcode or A == B + // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))). + if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()), + m_ZExtOrSExt(m_VPValue()))))) { + auto *Ext = cast(VecOp->getDefiningRecipe()); + auto *Mul = cast(Ext->getOperand(0)->getDefiningRecipe()); + auto *Ext0 = + cast(Mul->getOperand(0)->getDefiningRecipe()); + auto *Ext1 = + cast(Mul->getOperand(1)->getDefiningRecipe()); + if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) && + Ext0->getOpcode() == Ext1->getOpcode() && + IsMulAccValidAndClampRange(Ext0->getOpcode() == + Instruction::CastOps::ZExt, + Mul, Ext0, Ext1, Ext)) + return new VPMulAccumulateReductionRecipe(Red, Mul, Ext0, Ext1, + Ext->getResultType()); + } + return nullptr; +} + +/// This function tries to create abstract recipes from the reduction recipe for +/// following optimizations and cost estimation. +static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, + VPCostContext &Ctx, + VFRange &Range) { + VPReductionRecipe *AbstractR = nullptr; + + if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range)) + AbstractR = MulAcc; + else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range)) + AbstractR = ExtRed; + // Cannot create abstract inloop reduction recipes. 
+ if (!AbstractR) + return; + + AbstractR->insertBefore(Red); + Red->replaceAllUsesWith(AbstractR); +} + +void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_deep(Plan.getVectorLoopRegion()))) { + for (VPRecipeBase &R : *VPBB) { + if (auto *Red = dyn_cast(&R)) + tryToCreateAbstractReductionRecipe(Red, Ctx, Range); + } + } +} + void VPlanTransforms::materializeStepVectors(VPlan &Plan) { for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { auto *IVR = dyn_cast(&Phi); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index d284d916633c8..3a1ed7406b383 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -190,6 +190,13 @@ struct VPlanTransforms { /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy); + /// This function converts initial recipes to the abstract recipes and clamps + /// \p Range based on cost model for following optimizations and cost + /// estimations. The converted abstract recipes will lower to concrete + /// recipes before codegen. + static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); + /// Perform instcombine-like simplifications on recipes in \p Plan. Use \p /// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis. static void simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 638156eab7a84..64065edd315f9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -339,6 +339,8 @@ class VPDef { VPInterleaveSC, VPReductionEVLSC, VPReductionSC, + VPMulAccumulateReductionSC, + VPExtendedReductionSC, VPPartialReductionSC, VPReplicateSC, VPScalarIVStepsSC, diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll index 2078a10d04ce7..ce3b2a9f216f2 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -23,11 +23,11 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] 
= call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) @@ -105,11 +105,11 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP5]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <16 x i32> [[TMP6]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index a11cc15a8a85b..d021306b89aab 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -646,12 +646,11 @@ define i64 @mla_i16_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <8 x i64> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -726,12 +725,11 @@ define i64 @mla_i8_i64(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <8 x i32> [[TMP3]], [[TMP1]] -; CHECK-NEXT: 
[[TMP5:%.*]] = zext nneg <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw <8 x i64> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -855,10 +853,10 @@ define i32 @mla_i16_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i3 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) @@ -910,10 +908,10 @@ define i32 @mla_i8_i32(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <16 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) @@ -1016,10 +1014,10 @@ define signext i16 @mla_i8_i16(ptr nocapture readonly %x, ptr nocapture readonly ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP0]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i32 [[INDEX]] ; CHECK-NEXT: 
[[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw <16 x i16> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP4]], <16 x i16> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP5]]) @@ -1122,10 +1120,10 @@ define i32 @red_mla_ext_s8_s16_s32(ptr noalias nocapture readonly %A, ptr noalia ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP0]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) ; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <8 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP4]], <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) @@ -1459,9 +1457,8 @@ define i64 @mla_xx_sext_zext(ptr nocapture noundef readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP1]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -1528,11 +1525,11 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <8 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <8 x i64> [[TMP2]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]] +; CHECK-NEXT: 
[[TMP1:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -1667,24 +1664,55 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP11]]) +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], 2147483644 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[WIDE_LOAD]], splat (i32 8) +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) +; CHECK-NEXT: [[TMP8]] = add i64 [[TMP7]], [[VEC_PHI1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi i64 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3:%.*]], [[ADD:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = phi i64 [ [[ADD1:%.*]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD3:%.*]] = phi i64 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD3]], [[ADD]] ; CHECK-NEXT: ret i64 [[DIV]] ; CHECK: for.body: -; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY1]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ [[ADD3]], [[FOR_BODY1]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X:%.*]], i32 [[I_013]] +; CHECK-NEXT: [[S_014:%.*]] = phi i64 [ [[ADD1]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_013:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[T_012:%.*]] = phi i64 [ 
[[ADD5]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i32 [[I_013]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP0]], 8 ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[SHR]] to i64 -; CHECK-NEXT: [[ADD]] = add nsw i64 [[S_014]], [[CONV]] +; CHECK-NEXT: [[ADD1]] = add nsw i64 [[S_014]], [[CONV]] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV]] -; CHECK-NEXT: [[ADD3]] = add nuw nsw i64 [[MUL]], [[T_012]] +; CHECK-NEXT: [[ADD5]] = add nuw nsw i64 [[MUL]], [[T_012]] ; CHECK-NEXT: [[ADD4]] = add nuw nsw i32 [[I_013]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[ADD4]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY1]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; entry: %cmp11 = icmp sgt i32 %n, 0 @@ -1720,10 +1748,10 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[N]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 7 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 15 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[TMP2]], -8 ; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -1731,28 +1759,26 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32> +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i32 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i32> [[TMP7]] to <4 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[TMP11]] to <4 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call i64 
@llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP8]]) +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[STRIDED_VEC]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]]) ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP12]]) +; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw <8 x i64> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP13]]) ; CHECK-NEXT: [[TMP16]] = add i64 [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] @@ -1787,7 +1813,7 @@ define i64 @test_fir_q15(ptr %x, ptr %y, i32 %n) #0 { ; CHECK-NEXT: [[ADD12]] = add nsw i64 [[ADD]], [[CONV11]] ; CHECK-NEXT: [[ADD13]] = add nuw nsw i32 [[I_025]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD13]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP40:![0-9]+]] ; entry: %cmp23 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll index 4c7a74ed05b58..17e3bb3cce7eb 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -476,10 +476,10 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK: pred.load.continue8: ; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]] ; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP41]]) ; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]] ; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]]) ; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]] 
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index 0ad1581f0a4a1..9ca7a84b3ea1c 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -225,9 +225,9 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]]) ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP6]] = add i32 [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -1289,15 +1289,13 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[WIDE_LOAD]], splat (i8 31) ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i8> [[TMP2]], splat (i8 3) ; CHECK-NEXT: [[TMP4:%.*]] = udiv <4 x i8> [[TMP3]], splat (i8 31) ; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 2cf630de208c9..cf920c91913fb 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -268,3 +268,148 @@ loop: exit: ret i64 %cond } + +define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_extended_reduction' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: 
WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]> +; CHECK-NEXT: EXTENDED-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %x, i32 %iv + %load0 = load i32, ptr %arrayidx, align 4 + %conv0 = zext i32 %load0 to i64 + %rdx.next = add nsw i64 %rdx, %conv0 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} + +define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_mulacc' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> +; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> +; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> +; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i64, ptr %x, i32 %iv + %load0 = load i64, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i64, ptr %y, i32 %iv + %load1 = load i64, ptr %arrayidx1, align 4 + %mul = mul nsw i64 %load0, %load1 + %rdx.next = add nsw i64 %rdx, %mul + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} + +define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_mulacc_extended' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; 
CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> +; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> +; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> +; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64)) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv + %load0 = load i16, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i16, ptr %y, i32 %iv + %load1 = load i16, ptr %arrayidx1, align 4 + %conv0 = sext i16 %load0 to i32 + %conv1 = sext i16 %load1 to i32 + %mul = mul nsw i32 %conv0, %conv1 + %conv = sext i32 %mul to i64 + %rdx.next = add nsw i64 %rdx, %conv + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} From a4077bce3c5dc2e692478c6483381c3d0c665f66 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Sun, 4 May 2025 16:16:25 -0700 Subject: [PATCH 2/9] Fixup, Address comments. --- llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 598413d7ddb74..b3566f8ea4976 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2635,6 +2635,10 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { {R->getChainOp(), Ext->getOperand(0)}, R->getCondOp(), R->isOrdered(), Ext->getDebugLoc()), ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) { + assert((ExtOp == Instruction::CastOps::ZExt || + ExtOp == Instruction::CastOps::SExt) && + "VPExtendedReductionRecipe only support zext and sext."); + // Not all WidenCastRecipes contain nneg flag. Need to transfer flags from // the original recipe to prevent setting wrong flags. 
transferFlags(*Ext); @@ -2643,9 +2647,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { ~VPExtendedReductionRecipe() override = default; VPExtendedReductionRecipe *clone() override { - auto *Copy = new VPExtendedReductionRecipe(this); - Copy->transferFlags(*this); - return Copy; + return new VPExtendedReductionRecipe(this); } VP_CLASSOF_IMPL(VPDef::VPExtendedReductionSC); @@ -2715,6 +2717,9 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { Instruction::Add && "The reduction instruction in MulAccumulateteReductionRecipe must " "be Add"); + assert((ExtOp == Instruction::CastOps::ZExt || + ExtOp == Instruction::CastOps::SExt) && + "VPMulAccumulateReductionRecipe only support zext and sext."); // Only set the non-negative flag if the original recipe contains. if (Ext0->hasNonNegFlag()) IsNonNeg = Ext0->isNonNeg(); From ce95f18efeb972629cfe0b415475f3726e91fd52 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Mon, 5 May 2025 07:22:17 -0700 Subject: [PATCH 3/9] !fixup, Remove `computeCost()` for new recipes. --- llvm/lib/Transforms/Vectorize/VPlan.h | 11 ++---- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 39 ++++--------------- .../vplan-printing-reductions.ll | 12 +++--- 3 files changed, 16 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index b3566f8ea4976..5de910046ef89 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2642,6 +2642,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { // Not all WidenCastRecipes contain nneg flag. Need to transfer flags from // the original recipe to prevent setting wrong flags. transferFlags(*Ext); + setUnderlyingValue(R->getUnderlyingValue()); } ~VPExtendedReductionRecipe() override = default; @@ -2657,10 +2658,6 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { "VPExtendedRecipe + VPReductionRecipe before execution."); }; - /// Return the cost of VPExtendedReductionRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -2720,6 +2717,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { assert((ExtOp == Instruction::CastOps::ZExt || ExtOp == Instruction::CastOps::SExt) && "VPMulAccumulateReductionRecipe only support zext and sext."); + setUnderlyingValue(R->getUnderlyingValue()); // Only set the non-negative flag if the original recipe contains. if (Ext0->hasNonNegFlag()) IsNonNeg = Ext0->isNonNeg(); @@ -2737,6 +2735,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { Instruction::Add && "The reduction instruction in MulAccumulateReductionRecipe must be " "Add"); + setUnderlyingValue(R->getUnderlyingValue()); } ~VPMulAccumulateReductionRecipe() override = default; @@ -2755,10 +2754,6 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { "VPWidenRecipe + VPReductionRecipe before execution"); } - /// Return the cost of VPMulAccumulateReductionRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. 
void print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 04eab649713bc..f4093af7377f8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2527,8 +2527,6 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, auto *VectorTy = cast(toVectorTy(ElementTy, VF)); unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind); FastMathFlags FMFs = getFastMathFlags(); - std::optional OptionalFMF = - ElementTy->isFloatingPointTy() ? std::make_optional(FMFs) : std::nullopt; // TODO: Support any-of reductions. assert( @@ -2536,40 +2534,17 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF, ForceTargetInstructionCost.getNumOccurrences() > 0) && "Any-of reduction not implemented in VPlan-based cost model currently."); + // Cost = Reduction cost + BinOp cost + InstructionCost Cost = + Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind); if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) { Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind); - return Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); + return Cost + + Ctx.TTI.getMinMaxReductionCost(Id, VectorTy, FMFs, Ctx.CostKind); } - return Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, OptionalFMF, - Ctx.CostKind); -} - -InstructionCost -VPExtendedReductionRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - unsigned Opcode = RecurrenceDescriptor::getOpcode(getRecurrenceKind()); - Type *RedTy = Ctx.Types.inferScalarType(this); - auto *SrcVecTy = - cast(toVectorTy(Ctx.Types.inferScalarType(getVecOp()), VF)); - assert(RedTy->isIntegerTy() && - "ExtendedReduction only support integer type currently."); - InstructionCost Cost = Ctx.TTI.getExtendedReductionCost( - Opcode, isZExt(), RedTy, SrcVecTy, std::nullopt, Ctx.CostKind); - // The cost of this recipe should be decided by the legacy model. - return Cost.isValid() ? 0 : Cost; -} - -InstructionCost -VPMulAccumulateReductionRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - Type *RedTy = Ctx.Types.inferScalarType(this); - auto *SrcVecTy = - cast(toVectorTy(Ctx.Types.inferScalarType(getVecOp0()), VF)); - InstructionCost Cost = - Ctx.TTI.getMulAccReductionCost(isZExt(), RedTy, SrcVecTy, Ctx.CostKind); - // The cost of this recipe should be decided by the legacy model. - return Cost.isValid() ? 
0 : Cost; + return Cost + Ctx.TTI.getArithmeticReductionCost(Opcode, VectorTy, FMFs, + Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index cf920c91913fb..307228220cb15 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -283,12 +283,12 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]> -; CHECK-NEXT: EXTENDED-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64) +; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64) ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -327,7 +327,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> @@ -335,7 +335,7 @@ define i64 @print_mulacc(ptr nocapture readonly %x, ptr nocapture readonly %y, i ; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> ; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> -; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>) +; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT]]> = ir<[[RDX]]> + reduce.add (mul nsw ir<[[LOAD0]]>, ir<[[LOAD1]]>) ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -376,7 +376,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi ir<0>, ir<[[RDX_NEXT:%.+]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> @@ -384,7 +384,7 @@ define i64 
@print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado ; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> ; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> -; CHECK-NEXT: MULACC-REDUCE vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64)) +; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64)) ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors From d2674119cf9fd8377563933ce54cff82cb1ee545 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Wed, 7 May 2025 16:26:53 -0700 Subject: [PATCH 4/9] !fixup, address comments. --- llvm/lib/Transforms/Vectorize/VPlan.h | 34 ++++++++++--------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +++-- .../Transforms/Vectorize/VPlanTransforms.cpp | 23 ++++++------- .../vplan-printing-reductions.ll | 4 +-- 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 5de910046ef89..cc81f376c41ec 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1311,8 +1311,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { Opcode(I.getOpcode()) {} template - VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, - iterator_range Operands, bool NUW, bool NSW, DebugLoc DL) + VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, ArrayRef Operands, + bool NUW, bool NSW, DebugLoc DL) : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL), Opcode(Opcode) {} @@ -1321,8 +1321,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} template - VPWidenRecipe(unsigned Opcode, iterator_range Operands, bool NUW, - bool NSW, DebugLoc DL) + VPWidenRecipe(unsigned Opcode, ArrayRef Operands, bool NUW, bool NSW, + DebugLoc DL) : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {} ~VPWidenRecipe() override = default; @@ -2614,9 +2614,10 @@ class VPReductionEVLRecipe : public VPReductionRecipe { /// concrete recipes before codegen. The operands are {ChainOp, VecOp, /// [Condition]}. class VPExtendedReductionRecipe : public VPReductionRecipe { - /// Opcode of the extend recipe will be lowered to. + /// Opcode of the extend for VecOp. Instruction::CastOps ExtOp; + /// The scalar type after extending. Type *ResultTy; /// For cloning VPExtendedReductionRecipe. @@ -2637,10 +2638,8 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { ExtOp(Ext->getOpcode()), ResultTy(Ext->getResultType()) { assert((ExtOp == Instruction::CastOps::ZExt || ExtOp == Instruction::CastOps::SExt) && - "VPExtendedReductionRecipe only support zext and sext."); + "VPExtendedReductionRecipe only supports zext and sext."); - // Not all WidenCastRecipes contain nneg flag. Need to transfer flags from - // the original recipe to prevent setting wrong flags. transferFlags(*Ext); setUnderlyingValue(R->getUnderlyingValue()); } @@ -2670,7 +2669,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { /// Is the extend ZExt? bool isZExt() const { return getExtOpcode() == Instruction::ZExt; } - /// The opcode of extend recipe. 
+ /// Get the opcode of the extend for VecOp. Instruction::CastOps getExtOpcode() const { return ExtOp; } }; @@ -2680,12 +2679,13 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { /// recipe is abstract and needs to be lowered to concrete recipes before /// codegen. The operands are {ChainOp, VecOp1, VecOp2, [Condition]}. class VPMulAccumulateReductionRecipe : public VPReductionRecipe { - /// Opcode of the extend recipe. + /// Opcode of the extend for VecOp1 and VecOp2. Instruction::CastOps ExtOp; /// Non-neg flag of the extend recipe. bool IsNonNeg = false; + /// The scalar type after extending. Type *ResultTy; /// For cloning VPMulAccumulateReductionRecipe. @@ -2716,7 +2716,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { "be Add"); assert((ExtOp == Instruction::CastOps::ZExt || ExtOp == Instruction::CastOps::SExt) && - "VPMulAccumulateReductionRecipe only support zext and sext."); + "VPMulAccumulateReductionRecipe only supports zext and sext."); setUnderlyingValue(R->getUnderlyingValue()); // Only set the non-negative flag if the original recipe contains. if (Ext0->hasNonNegFlag()) @@ -2762,24 +2762,26 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { Type *getResultType() const { assert(isExtended() && "Only support getResultType when this recipe " - "contains implicit extend."); + "is implicitly extend."); return ResultTy; } - /// The VPValue of the vector value to be extended and reduced. + /// The first vector value to be extended and reduced. VPValue *getVecOp0() const { return getOperand(1); } + + /// The second vector value to be extended and reduced. VPValue *getVecOp1() const { return getOperand(2); } - /// Return if this MulAcc recipe contains extended operands. + /// Return true if this recipe contains extended operands. bool isExtended() const { return ExtOp != Instruction::CastOps::CastOpsEnd; } /// Return the opcode of the extends for the operands. Instruction::CastOps getExtOpcode() const { return ExtOp; } - /// Return if the operands are zero extended. + /// Return if the operands are zero-extended. bool isZExt() const { return ExtOp == Instruction::CastOps::ZExt; } - /// Return the non negative flag of the ext recipe. + /// Return true if the operand extends have the non-negative flag. 
bool isNonNeg() const { return IsNonNeg; } }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f4093af7377f8..fc1ee89e81c75 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2602,7 +2602,8 @@ void VPExtendedReductionRecipe::print(raw_ostream &O, const Twine &Indent, RecurrenceDescriptor::getOpcode(getRecurrenceKind())) << " ("; getVecOp()->printAsOperand(O, SlotTracker); - O << " extended to " << *getResultType(); + printFlags(O); + O << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType(); if (isConditional()) { O << ", "; getCondOp()->printAsOperand(O, SlotTracker); @@ -2627,12 +2628,14 @@ void VPMulAccumulateReductionRecipe::print(raw_ostream &O, const Twine &Indent, O << "("; getVecOp0()->printAsOperand(O, SlotTracker); if (isExtended()) - O << " extended to " << *getResultType() << "), ("; + O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType() + << "), ("; else O << ", "; getVecOp1()->printAsOperand(O, SlotTracker); if (isExtended()) - O << " extended to " << *getResultType() << ")"; + O << " " << Instruction::getOpcodeName(ExtOp) << " to " << *getResultType() + << ")"; if (isConditional()) { O << ", "; getCondOp()->printAsOperand(O, SlotTracker); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 5e56104c875af..87fa4f268ea15 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2419,7 +2419,7 @@ static void expandVPExtendedReduction(VPExtendedReductionRecipe *ExtRed) { static void expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) { // Generate inner VPWidenCastRecipes if necessary. - // Note that we will drop the extend after mul which transform + // Note that we will drop the extend after mul which transforms // reduce.add(ext(mul(ext, ext))) to reduce.add(mul(ext, ext)). VPValue *Op0, *Op1; if (MulAcc->isExtended()) { @@ -2454,9 +2454,8 @@ expandVPMulAccumulateReduction(VPMulAccumulateReductionRecipe *MulAcc) { std::array MulOps = {Op0, Op1}; auto *Mul = new VPWidenRecipe( - Instruction::Mul, make_range(MulOps.begin(), MulOps.end()), - MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap(), - MulAcc->getDebugLoc()); + Instruction::Mul, ArrayRef(MulOps), MulAcc->hasNoUnsignedWrap(), + MulAcc->hasNoSignedWrap(), MulAcc->getDebugLoc()); Mul->insertBefore(MulAcc); auto *Red = new VPReductionRecipe( @@ -2688,6 +2687,10 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range) { using namespace VPlanPatternMatch; + unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); + if (Opcode != Instruction::Add) + return nullptr; + Type *RedTy = Ctx.Types.inferScalarType(Red); // Clamp the range if using multiply-accumulate-reduction is profitable. @@ -2718,13 +2721,9 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Range); }; - unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()); - if (Opcode != Instruction::Add) - return nullptr; - VPValue *VecOp = Red->getVecOp(); VPValue *A, *B; - // Try to match reduce.add(mul(...)) + // Try to match reduce.add(mul(...)). 
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) { auto *RecipeA = dyn_cast_if_present(A->getDefiningRecipe()); @@ -2732,7 +2731,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, dyn_cast_if_present(B->getDefiningRecipe()); auto *Mul = cast(VecOp->getDefiningRecipe()); - // Match reduce.add(mul(ext, ext)) + // Match reduce.add(mul(ext, ext)). if (RecipeA && RecipeB && (RecipeA->getOpcode() == RecipeB->getOpcode() || A == B) && match(RecipeA, m_ZExtOrSExt(m_VPValue())) && @@ -2742,11 +2741,11 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, Mul, RecipeA, RecipeB, nullptr)) return new VPMulAccumulateReductionRecipe(Red, Mul, RecipeA, RecipeB, RecipeA->getResultType()); - // Match reduce.add(mul) + // Match reduce.add(mul). if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) return new VPMulAccumulateReductionRecipe(Red, Mul); } - // Match reduce.add(ext(mul(ext(A), ext(B)))) + // Match reduce.add(ext(mul(ext(A), ext(B)))). // All extend recipes must have same opcode or A == B // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))). if (match(VecOp, m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()), diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 307228220cb15..da42d62d39c2e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -288,7 +288,7 @@ define i64 @print_extended_reduction(ptr nocapture readonly %x, ptr nocapture re ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[ADDR:%.+]]> = vector-pointer ir<%arrayidx> ; CHECK-NEXT: WIDEN ir<[[LOAD:%.+]]> = load vp<[[ADDR]]> -; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> extended to i64) +; CHECK-NEXT: EXTENDED-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (ir<[[LOAD]]> zext to i64) ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -384,7 +384,7 @@ define i64 @print_mulacc_extended(ptr nocapture readonly %x, ptr nocapture reado ; CHECK-NEXT: CLONE ir<[[ARRAYIDX1:%.+]]> = getelementptr inbounds ir<%y>, vp<[[STEPS]]> ; CHECK-NEXT: vp<[[ADDR1:%.+]]> = vector-pointer ir<[[ARRAYIDX1]]> ; CHECK-NEXT: WIDEN ir<[[LOAD1:%.+]]> = load vp<[[ADDR1]]> -; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> extended to i64), (ir<[[LOAD1]]> extended to i64)) +; CHECK-NEXT: MULACC-REDUCE ir<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.add (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD1]]> sext to i64)) ; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors From 06ef08793483c35ee0e9ac94cadad149654deca4 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Wed, 7 May 2025 17:21:27 -0700 Subject: [PATCH 5/9] !fixup, fix assertion of getResultType(). 
--- llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index cc81f376c41ec..d430fb1969a9a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2696,8 +2696,10 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { MulAcc->getCondOp(), MulAcc->isOrdered(), WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), MulAcc->getDebugLoc()), - ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), - ResultTy(MulAcc->getResultType()) {} + ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()) { + if (MulAcc->isExtended()) + ResultTy = MulAcc->getResultType(); + } public: VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, From 34a6f3b5c4a876bf4ddb1b7d86686584e60a7628 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Thu, 8 May 2025 17:15:04 -0700 Subject: [PATCH 6/9] !fixup getResultType() in VPMulAccumulateReductionRecipe. --- llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++------ llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 6 +++++- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 +++- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d430fb1969a9a..973ef4de44efb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2686,7 +2686,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { bool IsNonNeg = false; /// The scalar type after extending. - Type *ResultTy; + Type *ResultTy = nullptr; /// For cloning VPMulAccumulateReductionRecipe. VPMulAccumulateReductionRecipe(VPMulAccumulateReductionRecipe *MulAcc) @@ -2762,11 +2762,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { VPSlotTracker &SlotTracker) const override; #endif - Type *getResultType() const { - assert(isExtended() && "Only support getResultType when this recipe " - "is implicitly extend."); - return ResultTy; - } + Type *getResultType() const { return ResultTy; } /// The first vector value to be extended and reduced. VPValue *getVecOp0() const { return getOperand(1); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 7dcbd72c25191..cd8878fce26d9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -273,8 +273,12 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { // TODO: Use info from interleave group. return V->getUnderlyingValue()->getType(); }) - .Case( + .Case( [](const auto *R) { return R->getResultType(); }) + .Case([this](const auto *R) { + return R->isExtended() ? 
R->getResultType() + : inferScalarType(R->getOperand(0)); + }) .Case([](const VPExpandSCEVRecipe *R) { return R->getSCEV()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 87fa4f268ea15..98af08fc65426 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2530,8 +2530,10 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, ToRemove.push_back(VPI); } for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - if (auto *ExtRed = dyn_cast(&R)) + if (auto *ExtRed = dyn_cast(&R)) { expandVPExtendedReduction(ExtRed); + continue; + } if (auto *MulAcc = dyn_cast(&R)) expandVPMulAccumulateReduction(MulAcc); } From bfc5fc2e2f85d6efefcec81d726f5df7198b8358 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Mon, 12 May 2025 09:04:08 -0700 Subject: [PATCH 7/9] Fixup! Remove IterT and always add result type in VPMulAccumulateReductionRecipe. --- llvm/lib/Transforms/Vectorize/VPlan.h | 15 +++++++-------- llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 6 +----- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 973ef4de44efb..ecc79653df296 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1310,9 +1310,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), VPIRMetadata(I), Opcode(I.getOpcode()) {} - template - VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, ArrayRef Operands, - bool NUW, bool NSW, DebugLoc DL) + VPWidenRecipe(unsigned VPDefOpcode, unsigned Opcode, + ArrayRef Operands, bool NUW, bool NSW, DebugLoc DL) : VPRecipeWithIRFlags(VPDefOpcode, Operands, WrapFlagsTy(NUW, NSW), DL), Opcode(Opcode) {} @@ -1320,9 +1319,8 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenRecipe(Instruction &I, ArrayRef Operands) : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} - template - VPWidenRecipe(unsigned Opcode, ArrayRef Operands, bool NUW, bool NSW, - DebugLoc DL) + VPWidenRecipe(unsigned Opcode, ArrayRef Operands, bool NUW, + bool NSW, DebugLoc DL) : VPWidenRecipe(VPDef::VPWidenSC, Opcode, Operands, NUW, NSW, DL) {} ~VPWidenRecipe() override = default; @@ -2725,14 +2723,15 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { IsNonNeg = Ext0->isNonNeg(); } - VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul) + VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, + Type *ResultTy) : VPReductionRecipe( VPDef::VPMulAccumulateReductionSC, R->getRecurrenceKind(), {R->getChainOp(), Mul->getOperand(0), Mul->getOperand(1)}, R->getCondOp(), R->isOrdered(), WrapFlagsTy(Mul->hasNoUnsignedWrap(), Mul->hasNoSignedWrap()), R->getDebugLoc()), - ExtOp(Instruction::CastOps::CastOpsEnd) { + ExtOp(Instruction::CastOps::CastOpsEnd), ResultTy(ResultTy) { assert(RecurrenceDescriptor::getOpcode(getRecurrenceKind()) == Instruction::Add && "The reduction instruction in MulAccumulateReductionRecipe must be " diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index cd8878fce26d9..7dcbd72c25191 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -273,12 +273,8 @@ Type 
*VPTypeAnalysis::inferScalarType(const VPValue *V) { // TODO: Use info from interleave group. return V->getUnderlyingValue()->getType(); }) - .Case( + .Case( [](const auto *R) { return R->getResultType(); }) - .Case([this](const auto *R) { - return R->isExtended() ? R->getResultType() - : inferScalarType(R->getOperand(0)); - }) .Case([](const VPExpandSCEVRecipe *R) { return R->getSCEV()->getType(); }) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 98af08fc65426..81accab2be54e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2745,7 +2745,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, RecipeA->getResultType()); // Match reduce.add(mul). if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr)) - return new VPMulAccumulateReductionRecipe(Red, Mul); + return new VPMulAccumulateReductionRecipe(Red, Mul, RedTy); } // Match reduce.add(ext(mul(ext(A), ext(B)))). // All extend recipes must have same opcode or A == B From a0515c33c4014299d22f29e6f5be4233402412c1 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Mon, 12 May 2025 09:19:36 -0700 Subject: [PATCH 8/9] !fixup update VPMulAccumulateReduction::clone(). --- llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index ecc79653df296..4cbf0e82ea701 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2694,10 +2694,8 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { MulAcc->getCondOp(), MulAcc->isOrdered(), WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), MulAcc->getDebugLoc()), - ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()) { - if (MulAcc->isExtended()) - ResultTy = MulAcc->getResultType(); - } + ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), + ResultTy(MulAcc->getResultType()) {} public: VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, From fca5a28590c49896098a7ab1763c4943be81c041 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Wed, 14 May 2025 23:21:58 -0700 Subject: [PATCH 9/9] Address comments. --- llvm/lib/Transforms/Vectorize/VPlan.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 4cbf0e82ea701..0c549af66a751 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2626,6 +2626,7 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { ExtRed->isOrdered(), ExtRed->getDebugLoc()), ExtOp(ExtRed->getExtOpcode()), ResultTy(ExtRed->getResultType()) { transferFlags(*ExtRed); + setUnderlyingValue(ExtRed->getUnderlyingValue()); } public: @@ -2671,11 +2672,11 @@ class VPExtendedReductionRecipe : public VPReductionRecipe { Instruction::CastOps getExtOpcode() const { return ExtOp; } }; -/// A recipe to represent inloop MulAccumulateReduction operations, performing a -/// reduction.add on the result of vector operands (might be extended) -/// multiplication into a scalar value, and adding the result to a chain. This -/// recipe is abstract and needs to be lowered to concrete recipes before -/// codegen. The operands are {ChainOp, VecOp1, VecOp2, [Condition]}. 
+/// A recipe to represent inloop MulAccumulateReduction operations, multiplying +/// the vector operands (which may be extended), performing a reduction.add on +/// the result, and adding the scalar result to a chain. This recipe is abstract +/// and needs to be lowered to concrete recipes before codegen. The operands are +/// {ChainOp, VecOp1, VecOp2, [Condition]}. class VPMulAccumulateReductionRecipe : public VPReductionRecipe { /// Opcode of the extend for VecOp1 and VecOp2. Instruction::CastOps ExtOp; @@ -2695,7 +2696,10 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { WrapFlagsTy(MulAcc->hasNoUnsignedWrap(), MulAcc->hasNoSignedWrap()), MulAcc->getDebugLoc()), ExtOp(MulAcc->getExtOpcode()), IsNonNeg(MulAcc->isNonNeg()), - ResultTy(MulAcc->getResultType()) {} + ResultTy(MulAcc->getResultType()) { + transferFlags(*MulAcc); + setUnderlyingValue(MulAcc->getUnderlyingValue()); + } public: VPMulAccumulateReductionRecipe(VPReductionRecipe *R, VPWidenRecipe *Mul, @@ -2740,9 +2744,7 @@ class VPMulAccumulateReductionRecipe : public VPReductionRecipe { ~VPMulAccumulateReductionRecipe() override = default; VPMulAccumulateReductionRecipe *clone() override { - auto *Copy = new VPMulAccumulateReductionRecipe(this); - Copy->transferFlags(*this); - return Copy; + return new VPMulAccumulateReductionRecipe(this); } VP_CLASSOF_IMPL(VPDef::VPMulAccumulateReductionSC);
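A minimal sketch, for reference only, of the concrete IR a single MULACC-REDUCE step lowers to once convertToConcreteRecipes expands it back into widen-cast + widen-mul + reduction, assuming VF=4 and i16 operands sign-extended to i64 (mirroring the mve-reductions.ll and vplan-printing-reductions.ll checks above); the function name and value names here are illustrative, not taken from the patch:

; Illustrative lowering of one MULACC-REDUCE step (ChainOp %chain,
; VecOp1 %a, VecOp2 %b), assuming VF=4 and sext i16 -> i64.
define i64 @mulacc_step(<4 x i16> %a, <4 x i16> %b, i64 %chain) {
  %ext.a = sext <4 x i16> %a to <4 x i64>                        ; widen-cast of VecOp1
  %ext.b = sext <4 x i16> %b to <4 x i64>                        ; widen-cast of VecOp2
  %mul = mul nsw <4 x i64> %ext.a, %ext.b                        ; widen-mul
  %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %mul)  ; in-loop reduce.add
  %next = add i64 %rdx, %chain                                   ; accumulate into ChainOp
  ret i64 %next
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)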