From e08017c1e1935de4399933f08b63f901b6e03336 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 26 Mar 2025 11:35:01 +0000 Subject: [PATCH 01/24] [LV] Vectorize conditional scalar assignments Based on Michael Maitland's previous work: https://github.com/llvm/llvm-project/pull/121222 This PR uses the existing recurrences code instead of introducing a new pass just for CSA autovec. I've also made recipes that are more generic. I've enabled it by default to see the impact on tests; if there are regressions we can put it behind a cli option. --- llvm/include/llvm/Analysis/IVDescriptors.h | 23 +- llvm/lib/Analysis/IVDescriptors.cpp | 45 +- .../AArch64/AArch64TargetTransformInfo.cpp | 1 + .../Transforms/Vectorize/LoopVectorize.cpp | 53 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 3 + llvm/lib/Transforms/Vectorize/VPlan.h | 37 ++ .../Transforms/Vectorize/VPlanAnalysis.cpp | 19 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 47 ++ .../Transforms/Vectorize/VPlanTransforms.cpp | 78 +++ .../Transforms/Vectorize/VPlanTransforms.h | 8 + llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../AArch64/conditional-scalar-assignment.ll | 397 ++++++++++++ .../conditional-scalar-assignment-vplan.ll | 123 ++++ .../LoopVectorize/iv-select-cmp-decreasing.ll | 339 +++++++++-- .../LoopVectorize/iv-select-cmp-no-wrap.ll | 88 ++- .../iv-select-cmp-non-const-iv-start.ll | 373 ++++++++++-- .../LoopVectorize/iv-select-cmp-trunc.ll | 570 ++++++++++++++---- .../Transforms/LoopVectorize/iv-select-cmp.ll | 190 +++++- .../Transforms/LoopVectorize/select-cmp.ll | 141 ++++- 20 files changed, 2238 insertions(+), 300 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll create mode 100644 llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 
fc141ed6d96fe..f9376c1c2a06b 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -70,6 +70,9 @@ enum class RecurKind { FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of ///< (x,y) is increasing loop induction, and both x and y ///< are integer type, producing a UMax reduction. + FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y + ///< are an integer type, one is the current recurrence value, + ///< and the other is an arbitrary value. // clang-format on // TODO: Any_of and FindLast reduction need not be restricted to integer type // only. @@ -180,13 +183,12 @@ class RecurrenceDescriptor { /// Returns a struct describing whether the instruction is either a /// Select(ICmp(A, B), X, Y), or /// Select(FCmp(A, B), X, Y) - /// where one of (X, Y) is an increasing (FindLast) or decreasing (FindFirst) - /// loop induction variable, and the other is a PHI value. - // TODO: Support non-monotonic variable. FindLast does not need be restricted - // to increasing loop induction variables. - LLVM_ABI static InstDesc isFindIVPattern(RecurKind Kind, Loop *TheLoop, - PHINode *OrigPhi, Instruction *I, - ScalarEvolution &SE); + /// where one of (X, Y) is an increasing (FindLastIV) or decreasing + /// (FindFirstIV) loop induction variable, or an arbitrary integer value + /// (FindLast), and the other is a PHI value. + LLVM_ABI static InstDesc isFindPattern(RecurKind Kind, Loop *TheLoop, + PHINode *OrigPhi, Instruction *I, + ScalarEvolution &SE); /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. @@ -310,6 +312,13 @@ class RecurrenceDescriptor { isFindLastIVRecurrenceKind(Kind); } + /// Returns true if the recurrence kind is of the form + /// select(cmp(),x,y) where one of (x,y) is an arbitrary value and the + /// other is a recurrence. 
+ static bool isFindLastRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::FindLast; + } + /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. Type *getRecurrenceType() const { return RecurrenceType; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 7624e0ed6f2b0..c6e712090e942 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -58,6 +58,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + // TODO: Make type-agnostic. + case RecurKind::FindLast: return true; } return false; @@ -746,9 +748,9 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // value of the data type or a non-constant value by using mask and multiple // reduction operations. RecurrenceDescriptor::InstDesc -RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, - PHINode *OrigPhi, Instruction *I, - ScalarEvolution &SE) { +RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, + PHINode *OrigPhi, Instruction *I, + ScalarEvolution &SE) { // TODO: Support the vectorization of FindLastIV when the reduction phi is // used by more than one select instruction. This vectorization is only // performed when the SCEV of each increasing induction variable used by the @@ -757,8 +759,10 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, return InstDesc(false, I); // We are looking for selects of the form: - // select(cmp(), phi, loop_induction) or - // select(cmp(), loop_induction, phi) + // select(cmp(), phi, value) or + // select(cmp(), value, phi) + // where 'value' is be a loop induction variable + // (for FindFirstIV/FindLastIV) or an arbitrary value (for FindLast). // TODO: Match selects with multi-use cmp conditions. 
Value *NonRdxPhi = nullptr; if (!match(I, m_CombineOr(m_Select(m_OneUse(m_Cmp()), m_Value(NonRdxPhi), @@ -767,6 +771,25 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, m_Value(NonRdxPhi))))) return InstDesc(false, I); + if (isFindLastRecurrenceKind(Kind)) { + // Must be an integer scalar. + Type *Type = OrigPhi->getType(); + if (!Type->isIntegerTy()) + return InstDesc(false, I); + + // FIXME: Support more complex patterns, including multiple selects. + // The Select must be used only outside the loop and by the PHI. + for (User *U : I->users()) { + if (U == OrigPhi) + continue; + if (auto *UI = dyn_cast(U); UI && !TheLoop->contains(UI)) + continue; + return InstDesc(false, I); + } + + return InstDesc(I, RecurKind::FindLast); + } + // Returns either FindFirstIV/FindLastIV, if such a pattern is found, or // std::nullopt. auto GetRecurKind = [&](Value *V) -> std::optional { @@ -976,8 +999,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( Kind == RecurKind::Add || Kind == RecurKind::Mul || Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs) return isConditionalRdxPattern(I); - if (isFindIVRecurrenceKind(Kind) && SE) - return isFindIVPattern(Kind, L, OrigPhi, I, *SE); + if ((isFindIVRecurrenceKind(Kind) || isFindLastRecurrenceKind(Kind)) && SE) + return isFindPattern(Kind, L, OrigPhi, I, *SE); [[fallthrough]]; case Instruction::FCmp: case Instruction::ICmp: @@ -1174,7 +1197,11 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << "\n"); return true; } - + if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, + DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n"); + return true; + } // Not a reduction of known type. 
return false; } @@ -1299,6 +1326,8 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: return Instruction::FCmp; + case RecurKind::FindLast: + return Instruction::Select; case RecurKind::AnyOf: case RecurKind::FindFirstIVSMin: case RecurKind::FindFirstIVUMin: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 3a5f1499f9d2d..c704c434176be 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5488,6 +5488,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( case RecurKind::FMax: case RecurKind::FMulAdd: case RecurKind::AnyOf: + case RecurKind::FindLast: return true; default: return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9b727a7998392..4f924ad4cfc3b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1293,6 +1293,7 @@ class LoopVectorizationCostModel { "from latch block\n"); return true; } + if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " "interleaved group requires scalar epilogue\n"); @@ -4084,6 +4085,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, continue; case VPDef::VPReductionSC: case VPDef::VPActiveLaneMaskPHISC: + case VPDef::VPLastActiveMaskPHISC: case VPDef::VPWidenCallSC: case VPDef::VPWidenCanonicalIVSC: case VPDef::VPWidenCastSC: @@ -4302,11 +4304,15 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum // reductions need special handling and are currently unsupported. + // FindLast reductions also require special handling for the synthesized + // mask PHI. 
if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { if (!Legal->isReductionVariable(&Phi)) return Legal->isFixedOrderRecurrence(&Phi); - return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( - Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind()); + RecurKind Kind = + Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); + return RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) || + RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind); })) return false; @@ -4612,6 +4618,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), IsaPred); + // FIXME: implement interleaving for FindLast transform correctly. + for (auto &[_, RdxDesc] : Legal->getReductionVars()) + if (RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind())) + return 1; + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { @@ -8624,6 +8636,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( *Plan, Builder)) return nullptr; + // Create whole-vector selects for find-last recurrences. + VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan, + RecipeBuilder, Legal); + if (useActiveLaneMask(Style)) { // TODO: Move checks to VPlanTransforms::addActiveLaneMask once // TailFoldingStyle is visible there. 
@@ -8707,10 +8723,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( continue; RecurKind Kind = PhiR->getRecurrenceKind(); - assert( - !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && - "AnyOf and FindIV reductions are not allowed for in-loop reductions"); + assert(!RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) && + !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && + "AnyOf, FindIV, and FindLast reductions are not allowed for in-loop " + "reductions"); bool IsFPRecurrence = RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind); @@ -9017,7 +9034,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( RecurKind RK = RdxDesc.getRecurrenceKind(); if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) && !RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && - !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) { + !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) && + !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) { VPBuilder PHBuilder(Plan->getVectorPreheader()); VPValue *Iden = Plan->getOrAddLiveIn( getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags())); @@ -9430,7 +9448,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { SmallPtrSet EpiWidenedPhis; for (VPRecipeBase &R : EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - if (isa(&R)) + if (isa(&R)) continue; EpiWidenedPhis.insert( cast(R.getVPSingleValue()->getUnderlyingValue())); @@ -9627,6 +9645,10 @@ static SmallVector preparePlanForEpilogueVectorLoop( continue; } } + } else if (isa(R)) { + // LastActiveMasks are only used as part of FindLast reductions, + // and aren't passed to the scalar loop. + continue; } else { // Retrieve the induction resume values for wide inductions from // their original phi nodes in the scalar loop. 
@@ -10148,6 +10170,21 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Override IC if user provided an interleave count. IC = UserIC > 0 ? UserIC : IC; + // FIXME: Enable interleaving for last_active reductions. + if (any_of(make_second_range(LVL.getReductionVars()), [&](auto &RdxDesc) { + return RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind()); + })) { + LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " + << "to conditional scalar assignments.\n"); + IntDiagMsg = { + "ConditionalAssignmentPreventsScalarInterleaving", + "Unable to interleave without vectorization due to conditional " + "assignments"}; + InterleaveLoop = false; + IC = 1; + } + // Emit diagnostic messages, if any. const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0eb8ad8d3c93d..ccdb0be05f6d4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -25401,6 +25401,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: @@ -25542,6 +25543,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: @@ -25648,6 +25650,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6ca750fc53279..68dacb813e4fd 
100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -562,6 +562,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPPredInstPHISC: case VPRecipeBase::VPCanonicalIVPHISC: case VPRecipeBase::VPActiveLaneMaskPHISC: + case VPRecipeBase::VPLastActiveMaskPHISC: case VPRecipeBase::VPFirstOrderRecurrencePHISC: case VPRecipeBase::VPWidenPHISC: case VPRecipeBase::VPWidenIntOrFpInductionSC: @@ -1128,6 +1129,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, /// Returns the value for vscale. VScale, OpsEnd = VScale, + /// Extracts the last active lane based on a predicate vector operand. + ExtractLastActive, }; /// Returns true if this VPInstruction generates scalar values for all lanes. @@ -3635,6 +3638,40 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { #endif }; +// TODO: Can we unify the PHI recipe hierarchy a bit? VPPredInstPHISC is close +// to this (just a PHI of a predicate), but isn't a header phi so can't +// be used for the mask of FindLastActive reductions. +// +// This is basically a clone of VPActiveLaneMaskPHIRecipe, but won't run into +// problems with transforms that expect there to only be a single ALM PHI, and +// can be ignored by other code looking for a (non-existent) underlying value. 
+class VPLastActiveMaskPHIRecipe : public VPHeaderPHIRecipe { +public: + VPLastActiveMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) + : VPHeaderPHIRecipe(VPDef::VPLastActiveMaskPHISC, nullptr, StartMask, + DL) {} + + ~VPLastActiveMaskPHIRecipe() override = default; + + VPLastActiveMaskPHIRecipe *clone() override { + auto *R = new VPLastActiveMaskPHIRecipe(getOperand(0), getDebugLoc()); + if (getNumOperands() == 2) + R->addOperand(getOperand(1)); + return R; + } + + VP_CLASSOF_IMPL(VPDef::VPLastActiveMaskPHISC); + + /// Generate the mask phi + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for generating the phi node for the current index of elements, /// adjusted in accordance with EVL value. It starts at the start value of the /// canonical induction and gets incremented by EVL in each iteration of the diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index ea38a8b16ebc7..1dd26ee9da3fe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -121,7 +121,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: case VPInstruction::ExtractLastLanePerPart: - case VPInstruction::ExtractPenultimateElement: { + case VPInstruction::ExtractPenultimateElement: + case VPInstruction::ExtractLastActive: { Type *BaseTy = inferScalarType(R->getOperand(0)); if (auto *VecTy = dyn_cast(BaseTy)) return VecTy->getElementType(); @@ -279,14 +280,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { TypeSwitch(V->getDefiningRecipe()) .Case( - [this](const auto *R) { - // Handle header phi recipes, except VPWidenIntOrFpInduction - // which needs special handling due it 
being possibly truncated. - // TODO: consider inferring/caching type of siblings, e.g., - // backedge value, here and in cases below. - return inferScalarType(R->getStartValue()); - }) + VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe, + VPLastActiveMaskPHIRecipe>([this](const auto *R) { + // Handle header phi recipes, except VPWidenIntOrFpInduction + // which needs special handling due it being possibly truncated. + // TODO: consider inferring/caching type of siblings, e.g., + // backedge value, here and in cases below. + return inferScalarType(R->getStartValue()); + }) .Case( [](const auto *R) { return R->getScalarType(); }) .CasegetType(); + + Module *M = State.Builder.GetInsertBlock()->getModule(); + Function *ExtractLast = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_vector_extract_last_active, {VTy}); + return Builder.CreateCall(ExtractLast, {Data, Mask, Default}); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -1074,6 +1086,15 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind); return Cost; } + case VPInstruction::ExtractLastActive: { + Type *ScalarTy = Ctx.Types.inferScalarType(this); + Type *VecTy = toVectorTy(ScalarTy, VF); + Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); + IntrinsicCostAttributes ICA( + Intrinsic::experimental_vector_extract_last_active, ScalarTy, + {VecTy, MaskTy, ScalarTy}); + return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind); + } case VPInstruction::FirstOrderRecurrenceSplice: { assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); SmallVector Mask(VF.getKnownMinValue()); @@ -1131,6 +1152,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::LastActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || + getOpcode() == VPInstruction::ExtractLastActive || getOpcode() == 
VPInstruction::ComputeReductionResult || getOpcode() == VPInstruction::AnyOf; } @@ -1197,6 +1219,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExplicitVectorLength: case VPInstruction::FirstActiveLane: case VPInstruction::LastActiveLane: + case VPInstruction::ExtractLastActive: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -1385,6 +1408,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, case VPInstruction::Unpack: O << "unpack"; break; + case VPInstruction::ExtractLastActive: + O << "extract-last-active"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -4413,6 +4439,27 @@ void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, } #endif +void VPLastActiveMaskPHIRecipe::execute(VPTransformState &State) { + BasicBlock *VectorPH = + State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); + Value *StartMask = State.get(getOperand(0)); + PHINode *Phi = + State.Builder.CreatePHI(StartMask->getType(), 2, "last.active.mask"); + Phi->addIncoming(StartMask, VectorPH); + State.set(this, Phi); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPLastActiveMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "LAST-ACTIVE-MASK-PHI "; + + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 38024aa6897fc..b63dcb1d136e8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -41,6 +41,7 @@ #include "llvm/Support/Casting.h" #include 
"llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" using namespace llvm; using namespace VPlanPatternMatch; @@ -5106,3 +5107,80 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan, } } } + +void VPlanTransforms::convertFindLastRecurrences( + VPlan &Plan, VPRecipeBuilder &RecipeBuilder, + LoopVectorizationLegality *Legal) { + assert(Legal && "Need valid LoopVecLegality"); + + // May need to do something better than this? + if (Plan.hasScalarVFOnly()) + return; + + // We want to create the following nodes: + // vec.body: + // mask.phi = phi [ all.false, vec.ph ], [ new.mask, vec.body ] + // ...data.phi already exists, but needs updating... + // data.phi = phi [ default.val, vec.ph ], [ new.data, vec.body ] + // + // ...'data' and 'compare' created by existing nodes... + // + // any_active = i1 any_of_reduction(compare) + // new.mask = select any_active, compare, mask.phi + // new.data = select any_active, data, data.phi + // + // middle.block: + // result = extract-last-active new.data, new.mask, default.val + + for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { + if (RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi); + VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); + + // Add mask phi + VPValue *False = + Plan.getOrAddLiveIn(ConstantInt::getFalse(Phi->getContext())); + auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc()); + Builder.insert(MaskPHI); + + // Find the condition for the select + SelectInst *Select = cast(RdxDesc.getLoopExitInstr()); + auto *SR = cast(RecipeBuilder.getRecipe(Select)); + VPValue *Cond = SR->getCond(); + + // Add select for mask + Builder.setInsertPoint(SR); + VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); + VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); + 
MaskPHI->addOperand(MaskSelect); + + // Replace select for data + VPValue *DataSelect = Builder.createSelect( + AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc()); + SR->replaceAllUsesWith(DataSelect); + SR->eraseFromParent(); + + // Find final reduction and replace it with an + // extract.last.active intrinsic. + VPInstruction *RdxResult = nullptr; + for (VPUser *U : DataSelect->users()) { + VPInstruction *I = dyn_cast(U); + if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) { + RdxResult = I; + break; + } + } + + assert(RdxResult); + Builder.setInsertPoint(RdxResult); + VPValue *Default = RecipeBuilder.getVPValueOrAddLiveIn( + RdxDesc.getRecurrenceStartValue()); + auto *ExtractLastActive = Builder.createNaryOp( + VPInstruction::ExtractLastActive, {DataSelect, MaskSelect, Default}, + RdxResult->getDebugLoc()); + RdxResult->replaceAllUsesWith(ExtractLastActive); + RdxResult->eraseFromParent(); + } + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index afdf1655b4622..a479d2b49e665 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -24,6 +24,7 @@ namespace llvm { class InductionDescriptor; class Instruction; class LoopVersioning; +class LoopVectorizationLegality; class PHINode; class ScalarEvolution; class PredicatedScalarEvolution; @@ -402,6 +403,13 @@ struct VPlanTransforms { /// users in the original exit block using the VPIRInstruction wrapping to the /// LCSSA phi. static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range); + + /// Change FindLast reductions to save the appropriate state using selects + /// for entire vectors for both the latest mask containing at least one active + /// element and the corresponding data vector. 
+ static void convertFindLastRecurrences(VPlan &Plan, + VPRecipeBuilder &RecipeBuilder, + LoopVectorizationLegality *Legal); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index d36975699c4a8..e6cf992488826 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -48,7 +48,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { } bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { - if (isa(V)) + if (isa(V)) return true; auto IsWideCanonicalIV = [](VPValue *A) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index b9f5847ec731c..7a488973010b9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -373,6 +373,7 @@ class VPDef { // VPHeaderPHIRecipe need to be kept together. VPCanonicalIVPHISC, VPActiveLaneMaskPHISC, + VPLastActiveMaskPHISC, VPEVLBasedIVPHISC, VPFirstOrderRecurrencePHISC, VPWidenIntOrFpInductionSC, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll new file mode 100644 index 0000000000000..25c698f3df245 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -0,0 +1,397 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize,instcombine -S < %s 2>&1 | FileCheck %s --check-prefix=NEON +; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE + +;; The following run line caused an ICE before using a dedicated FindLast PHI recipe. +;; We're not looking at the resulting IR, just confirming it doesn't crash. 
+; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null + +target triple = "aarch64-linux-gnu" + +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +; NEON-LABEL: define i32 @simple_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret i32 [[SELECT_DATA]] +; +; SVE-LABEL: define i32 @simple_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: [[A_FR:%.*]] = freeze i32 [[A]] +; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SVE: [[VECTOR_PH]]: +; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement zeroinitializer, i32 [[A_FR]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = 
shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; SVE: [[VECTOR_BODY]]: +; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; SVE-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze [[WIDE_LOAD]] +; SVE-NEXT: [[TMP7:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD_FR]] +; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP7]]) +; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], [[TMP7]], [[LAST_ACTIVE_MASK]] +; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], [[WIDE_LOAD_FR]], [[VEC_PHI]] +; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SVE: [[MIDDLE_BLOCK]]: +; SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP10]], [[TMP9]], i32 -1) +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; SVE: [[SCALAR_PH]]: +; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = 
getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A_FR]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) { +; NEON-LABEL: define ptr @simple_csa_ptr_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]] +; 
NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret ptr [[SELECT_DATA]] +; +; SVE-LABEL: define ptr @simple_csa_ptr_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: ret ptr [[SELECT_DATA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi ptr [ %init, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds ptr, ptr %data, i64 %iv + %ld = load ptr, ptr %ld.addr, align 4 + %ld.i64 = ptrtoint ptr %ld to i64 + %select.cmp = icmp slt i64 %a, %ld.i64 + %select.data = select i1 %select.cmp, ptr %ld, ptr %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret ptr %select.data +} + +define float @simple_csa_float_select(i64 %N, ptr %data, float %a) { +; NEON-LABEL: define float @simple_csa_float_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) { +; 
NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret float [[SELECT_DATA]] +; +; SVE-LABEL: define float @simple_csa_float_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: ret float [[SELECT_DATA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi float [ -1.0, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds float, ptr %data, i64 %iv 
+ %ld = load float, ptr %ld.addr, align 4 + %select.cmp = fcmp olt float %a, %ld + %select.data = select i1 %select.cmp, float %ld, float %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret float %select.data +} + +define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) { +; NEON-LABEL: define i32 @multi_user_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]] +; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret i32 [[SELECT_DATA]] +; +; SVE-LABEL: define i32 @multi_user_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; 
SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]] +; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: ret i32 [[SELECT_DATA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %res.addr = getelementptr inbounds i32, ptr %results, i64 %iv + store i32 %select.data, ptr %res.addr, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + + +define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) { +; NEON-LABEL: define i32 @multi_use_cmp_for_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; 
NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32 +; NEON-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]] +; NEON-NEXT: ret i32 [[RES]] +; +; SVE-LABEL: define i32 @multi_use_cmp_for_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32 +; SVE-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]] +; SVE-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %idx.phi = phi i64 [ -1, %entry ], [ %select.idx, %loop ] + %ld.addr = getelementptr inbounds i32, 
ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %select.idx = select i1 %select.cmp, i64 %iv, i64 %idx.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + %idx = trunc i64 %select.idx to i32 + %res = add i32 %idx, %select.data + ret i32 %res +} + + +define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i32 %a, i32 %b) { +; NEON-LABEL: define i32 @chained_select_for_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]] +; NEON-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] +; NEON-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] +; NEON-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]] +; NEON-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: ret i32 [[SELECT_DATA]] +; +; SVE-LABEL: define i32 @chained_select_for_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) 
#[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]] +; SVE-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] +; SVE-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] +; SVE-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]] +; SVE-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: ret i32 [[SELECT_DATA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld1.addr = getelementptr inbounds i32, ptr %data1, i64 %iv + %ld1 = load i32, ptr %ld1.addr, align 4 + %select.cmp1 = icmp slt i32 %a, %ld1 + %select.ld1 = select i1 %select.cmp1, i32 %ld1, i32 %data.phi + %ld2.addr = getelementptr inbounds i32, ptr %data2, i64 %iv + %ld2 = load i32, ptr %ld2.addr, align 4 + %select.cmp2 = icmp sgt i32 %b, %ld2 + %select.data = select i1 %select.cmp2, i32 %ld2, i32 %select.ld1 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll 
b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll new file mode 100644 index 0000000000000..e802093fc7886 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ -0,0 +1,123 @@ +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors \ +; RUN: -disable-output 2>&1 < %s | FileCheck %s + + +; This function is derived from the following C program: +; int simple_csa_int_select(int N, int *data, int a) { +; int t = -1; +; for (int i = 0; i < N; i++) { +; if (a < data[i]) +; t = data[i]; +; } +; return t; +; } +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + + +; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in ir<%N> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> +; CHECK-NEXT: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir, vp<%8> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> +; CHECK-NEXT: CLONE 
ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> +; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr> +; CHECK-NEXT: WIDEN ir<%ld> = load vp<%6> +; CHECK-NEXT: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld> +; CHECK-NEXT: EMIT vp<%7> = any-of ir<%select.cmp> +; CHECK-NEXT: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4> +; CHECK-NEXT: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv +; CHECK-NEXT: IR %ld = load i32, ptr %ld.addr, align 4 +; CHECK-NEXT: IR %select.cmp = icmp slt i32 %a, %ld +; CHECK-NEXT: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi +; CHECK-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: IR %exit.cmp = icmp eq i64 %iv.next, %N +; CHECK-NEXT: No 
successors +; CHECK-NEXT: } + +; CHECK: Cost of 1 for VF vscale x 1: induction instruction %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exit.cmp = icmp eq i64 %iv.next, %N +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir, vp<%8> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = vector-pointer ir<%ld.addr> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%ld> = load vp<%6> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%7> = any-of ir<%select.cmp> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; 
CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld = load i32, ptr %ld.addr, align 4 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.cmp = icmp slt i32 %a, %ld +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv.next = add nuw nsw i64 %iv, 1 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %exit.cmp = icmp eq i64 %iv.next, %N +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block) diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index 2200a7d0431d2..503837894a7b4 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC1VF4 %s -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF4 %s -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF1 %s +; RUN: opt 
-passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefix=IC1VF4 %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefix=IC4VF4 %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefix=IC4VF1 %s define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_const_start( @@ -961,29 +961,142 @@ exit: ; preds = %loop } ; The unsigned sentinel value for decreasing-IV vectorization is ULONG_MAX, -; and since the IV hits this value, it is impossible to vectorize this case. +; and since the IV hits this value, it cannot be vectorized as a FindLastIV +; reduction. Instead, it is recognized and vectorized as a generic FindLast. ; In this test, %iv's range will include both signed and unsigned ; maximum (sentinel) values. -define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 -; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 -; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 
[[RDX]] -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +define i64 @select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) { +; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*:]] +; IC1VF4-NEXT: br label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; IC1VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr 
inbounds i8, ptr [[B]], i64 [[TMP1]] +; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] +; IC1VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]] +; IC1VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) +; IC1VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC1VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4 +; IC1VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC1VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]]) +; IC1VF4-NEXT: br label %[[SCALAR_PH:.*]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_B:%.*]] = load i8, 
ptr [[GEP_B_IV]], align 1 +; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC1VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*:]] +; IC4VF4-NEXT: br label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 +; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; IC4VF4-NEXT: [[REVERSE:%.*]] = 
shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; IC4VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] +; IC4VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]] +; IC4VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) +; IC4VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC4VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC4VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4 +; IC4VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC4VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]]) +; IC4VF4-NEXT: br label %[[SCALAR_PH:.*]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC4VF4-NEXT: 
[[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF1-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %loop @@ -1005,26 +1118,164 @@ exit: ret i64 %cond } -define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 
@not_vectorized_select_decreasing_induction_icmp_non_const_start( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 -; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { +; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start( +; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; IC1VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1) +; IC1VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]] +; IC1VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; IC1VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; IC1VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC1VF4-NEXT: [[TMP2:%.*]] = 
sub i64 [[N]], [[N_VEC]] +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3 +; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr 
[[TMP10]], align 8 +; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] +; IC1VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]] +; IC1VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) +; IC1VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC1VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC1VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]]) +; IC1VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC1VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ] +; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC1VF4-NEXT: 
[[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; IC1VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start( +; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*]]: +; IC4VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; IC4VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1) +; IC4VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]] +; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; IC4VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; IC4VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC4VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]] +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 +; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3 +; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] +; IC4VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]] +; IC4VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) +; IC4VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC4VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; 
IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]]) +; IC4VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC4VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ] +; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; IC4VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF1-LABEL: define i64 
@select_decreasing_induction_icmp_non_const_start( +; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF1-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll index 21ef1885b75b9..18f1470aba3a5 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll @@ -145,10 +145,44 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_nuw( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: 
[[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 
@llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -157,9 +191,9 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -186,10 +220,44 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_noflag( ; CHECK-SAME: ptr 
[[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) 
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -198,9 +266,9 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ 
[[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -229,4 +297,8 @@ exit: ; preds = %for.body ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll index 72ed6537ef640..7a89c32b197d3 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll @@ -1,32 +1,156 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 define i64 @select_non_const_iv_start_signed_guard(ptr 
%a, i64 %rdx_start, i64 %iv_start ,i64 %n) { -; CHECK-LABEL: define i64 @select_non_const_iv_start_signed_guard( -; CHECK-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] -; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i64 [[IDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]] +; 
CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC1-NEXT: 
[[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add 
nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[IDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; 
CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC4-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[IDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF1IC4-NEXT: br 
i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[IDX_0_LCSSA]] ; entry: %guard = icmp slt i64 %iv_start, %n @@ -49,32 +173,162 @@ exit: } define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, i32 %iv_start ,i32 %n) { -; CHECK-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( -; CHECK-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] -; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 
-; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; 
CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP8]] = select 
i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3 +; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 
[[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[IDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 
x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]]) +; 
CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3 +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[IDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: 
[[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[IDX_0_LCSSA]] ; entry: %guard = icmp slt i32 %iv_start, %n @@ -101,3 +355,18 @@ exit: %idx.0.lcssa = phi i32 [ %rdx_start, %entry ], [ %cond, %for.body ] ret i32 %idx.0.lcssa } +;. 
+; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. +; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index 45c2abd43c36a..839ea7ce7e7a4 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -674,65 +674,125 @@ exit: ; preds = %for.body ; Negative tests -; This test can theoretically be vectorized, but only with a runtime-check. -; The construct that are introduced by IndVarSimplify is: +; This test can theoretically be vectorized as a FindLastIV reduction, but only +; with a runtime-check. It will vectorize as a generic FindLast reduction. +; +; For FindLastIV, the construct that is introduced by IndVarSimplify is: ; %1 = trunc i64 %iv to i32 ; However, the loop guard is an i64: ; %cmp.sgt = icmp sgt i64 %n, 0 ; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the ; sentinel value), and need a runtime-check to vectorize this case. 
-define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { -; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { +; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 ; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x 
i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 
[[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 ; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, 
%[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 3 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP9]], i32 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; 
CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF1IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 @@ -778,67 +838,127 @@ exit: ; preds = %for.body, %entry ret i32 %rdx.lcssa } -; This test can theoretically be vectorized, but only with a runtime-check. -; The construct that are introduced by IndVarSimplify is: +; This test can theoretically be vectorized as a FindLastIV reduction, but only +; with a runtime-check. It will vectorize as a generic FindLast reduction. +; +; For FindLastIV, the construct that is introduced by IndVarSimplify is: ; %1 = trunc i64 %iv to i32 ; However, the loop guard is unsigned: ; %cmp.not = icmp eq i32 %n, 0 ; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the ; sentinel value), and need a runtime-check to vectorize this case. 
-define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { -; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { +; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 ; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: 
[[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, 
ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 ; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; 
CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 
[[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP8]], 3 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP9]], i32 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], 
%[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF1IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 ; CHECK-VF1IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] @@ -899,41 +1019,61 @@ exit: ; preds = %for.body, %entry define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: -; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: 
[[IV:%.*]] = add i64 4294967294, [[INDEX]] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-NEXT: ret i32 [[TMP7]] ; ; CHECK-VF4IC4-LABEL: define i32 
@not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: -; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> 
[[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC4-NEXT: ret i32 [[TMP7]] ; ; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { @@ -980,44 +1120,112 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; 
CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] 
= call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 
1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], 
%[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] 
] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( @@ -1071,38 +1279,84 @@ exit: ; preds = %for.body define i32 
@not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x 
i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1) +; CHECK-VF4IC1-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK-VF4IC1: [[SCALAR_PH]]: ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] +; 
CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1) +; CHECK-VF4IC4-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK-VF4IC4: [[SCALAR_PH]]: ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, 
%[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] @@ -1156,22 +1410,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC1-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
%[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; 
CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load 
i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16 ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] @@ -1184,22 +1472,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC4-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> 
[[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x 
i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16 ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; 
CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll index a071949f82062..6001ee32ca62a 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll @@ -1948,16 +1948,52 @@ exit: ; preds = %for.body } ; The sentinel value for increasing-IV vectorization is -LONG_MAX, and since -; the IV hits this value, it is impossible to vectorize this case. +; the IV hits this value, it is vectorized as a generic last-active reduction. 
define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] 
= freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: 
[[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] @@ -1967,19 +2003,55 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. ; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 ; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector 
<4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 
@llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] ; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] @@ -1989,9 +2061,9 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. 
; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 ; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( @@ -2042,10 +2114,50 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> 
[[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; 
CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -2054,18 +2166,58 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = 
phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 
x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: 
br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -2074,9 +2226,9 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index 2b352abe9f7a1..e19ebb9a3251c 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -1128,27 +1128,124 @@ exit: ; preds = %loop ret float %sel } -; We don't support selecting loop-variant values. 
define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) { -; CHECK-LABEL: define i32 @select_variant_i32_from_icmp( -; CHECK-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 -; CHECK-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 -; CHECK-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 -; CHECK-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[SEL_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF4IC1-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), 
%[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 3) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; 
CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF4IC1-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF4IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD9]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP27]] = select i1 [[TMP19]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP27]], <4 x i1> [[TMP23]], i32 3) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC4: [[LOOP]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr 
[[V1]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF4IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF1IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC4: [[LOOP]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF1IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label 
%[[LOOP]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]] ; entry: br label %loop @@ -1220,6 +1317,8 @@ exit: ; preds = %loop ; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. ; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -1235,6 +1334,8 @@ exit: ; preds = %loop ; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. 
; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} From ee66898ae6cc4c128a02c48e638b98d943abdbf5 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 11 Nov 2025 10:26:04 +0000 Subject: [PATCH 02/24] Apply suggestion from @MacDue Co-authored-by: Benjamin Maxwell --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4f924ad4cfc3b..7c3a396a91e31 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1293,7 +1293,6 @@ class LoopVectorizationCostModel { "from latch block\n"); return true; } - if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " "interleaved group requires scalar epilogue\n"); From da74e57509fce242329cecabe96d343c4cc01392 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 11 Nov 2025 16:02:27 +0000 Subject: [PATCH 03/24] Remove LVL param --- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 101 +++++++++--------- .../Transforms/Vectorize/VPlanTransforms.h | 4 +- 3 files changed, 51 insertions(+), 60 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7c3a396a91e31..b4d8c081e576b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8637,7 +8637,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Create whole-vector selects for find-last recurrences. 
VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan, - RecipeBuilder, Legal); + RecipeBuilder); if (useActiveLaneMask(Style)) { // TODO: Move checks to VPlanTransforms::addActiveLaneMask once @@ -9644,10 +9644,6 @@ static SmallVector preparePlanForEpilogueVectorLoop( continue; } } - } else if (isa(R)) { - // LastActiveMasks are only used as part of FindLast reductions, - // and aren't passed to the scalar loop. - continue; } else { // Retrieve the induction resume values for wide inductions from // their original phi nodes in the scalar loop. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b63dcb1d136e8..28fec1e3da100 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -41,7 +41,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" using namespace llvm; using namespace VPlanPatternMatch; @@ -5109,9 +5108,7 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan, } void VPlanTransforms::convertFindLastRecurrences( - VPlan &Plan, VPRecipeBuilder &RecipeBuilder, - LoopVectorizationLegality *Legal) { - assert(Legal && "Need valid LoopVecLegality"); + VPlan &Plan, VPRecipeBuilder &RecipeBuilder) { // May need to do something better than this? 
if (Plan.hasScalarVFOnly()) @@ -5132,55 +5129,55 @@ void VPlanTransforms::convertFindLastRecurrences( // middle.block: // result = extract-last-active new.data, new.mask, default.val - for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { - if (RecurrenceDescriptor::isFindLastRecurrenceKind( - RdxDesc.getRecurrenceKind())) { - VPRecipeBase *PhiR = RecipeBuilder.getRecipe(Phi); - VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); - - // Add mask phi - VPValue *False = - Plan.getOrAddLiveIn(ConstantInt::getFalse(Phi->getContext())); - auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc()); - Builder.insert(MaskPHI); - - // Find the condition for the select - SelectInst *Select = cast(RdxDesc.getLoopExitInstr()); - auto *SR = cast(RecipeBuilder.getRecipe(Select)); - VPValue *Cond = SR->getCond(); - - // Add select for mask - Builder.setInsertPoint(SR); - VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); - VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); - MaskPHI->addOperand(MaskSelect); - - // Replace select for data - VPValue *DataSelect = Builder.createSelect( - AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc()); - SR->replaceAllUsesWith(DataSelect); - SR->eraseFromParent(); - - // Find final reduction and replace it with an - // extract.last.active intrinsic. 
- VPInstruction *RdxResult = nullptr; - for (VPUser *U : DataSelect->users()) { - VPInstruction *I = dyn_cast(U); - if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) { - RdxResult = I; - break; - } - } + for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + auto *PhiR = dyn_cast(&Phi); + if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind( + PhiR->getRecurrenceKind())) + continue; - assert(RdxResult); - Builder.setInsertPoint(RdxResult); - VPValue *Default = RecipeBuilder.getVPValueOrAddLiveIn( - RdxDesc.getRecurrenceStartValue()); - auto *ExtractLastActive = Builder.createNaryOp( - VPInstruction::ExtractLastActive, {DataSelect, MaskSelect, Default}, - RdxResult->getDebugLoc()); - RdxResult->replaceAllUsesWith(ExtractLastActive); - RdxResult->eraseFromParent(); + // Find the condition for the select + auto *SR = dyn_cast(&PhiR->getBackedgeRecipe()); + if (!SR) + continue; + VPValue *Cond = SR->getCond(); + + // Add mask phi + VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); + VPValue *False = Plan.getOrAddLiveIn( + ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext())); + auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc()); + Builder.insert(MaskPHI); + + // Add select for mask + Builder.setInsertPoint(SR); + VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); + VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); + MaskPHI->addOperand(MaskSelect); + + // Replace select for data + VPValue *DataSelect = Builder.createSelect( + AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc()); + SR->replaceAllUsesWith(DataSelect); + SR->eraseFromParent(); + + // Find final reduction and replace it with an + // extract.last.active intrinsic. 
+ VPInstruction *RdxResult = nullptr; + for (VPUser *U : DataSelect->users()) { + VPInstruction *I = dyn_cast(U); + if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) { + RdxResult = I; + break; + } } + + assert(RdxResult); + Builder.setInsertPoint(RdxResult); + auto *ExtractLastActive = + Builder.createNaryOp(VPInstruction::ExtractLastActive, + {DataSelect, MaskSelect, PhiR->getStartValue()}, + RdxResult->getDebugLoc()); + RdxResult->replaceAllUsesWith(ExtractLastActive); + RdxResult->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index a479d2b49e665..d3b9d24eb5689 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -24,7 +24,6 @@ namespace llvm { class InductionDescriptor; class Instruction; class LoopVersioning; -class LoopVectorizationLegality; class PHINode; class ScalarEvolution; class PredicatedScalarEvolution; @@ -408,8 +407,7 @@ struct VPlanTransforms { /// for entire vectors for both the latest mask containing at least one active /// element and the corresponding data vector. 
static void convertFindLastRecurrences(VPlan &Plan, - VPRecipeBuilder &RecipeBuilder, - LoopVectorizationLegality *Legal); + VPRecipeBuilder &RecipeBuilder); }; } // namespace llvm From 016adab2e557bc098271c8c7342f7e78232fd757 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 12 Nov 2025 11:40:02 +0000 Subject: [PATCH 04/24] Remove VPLastActiveMaskPHIRecipe --- llvm/lib/Analysis/IVDescriptors.cpp | 8 ---- .../Transforms/Vectorize/LoopVectorize.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlan.h | 43 +++---------------- .../Transforms/Vectorize/VPlanAnalysis.cpp | 16 +++---- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 26 +++-------- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 2 +- .../conditional-scalar-assignment-vplan.ll | 4 +- 8 files changed, 25 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index c6e712090e942..f4130440a2f96 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -779,14 +779,6 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, // FIXME: Support more complex patterns, including multiple selects. // The Select must be used only outside the loop and by the PHI. 
- for (User *U : I->users()) { - if (U == OrigPhi) - continue; - if (auto *UI = dyn_cast(U); UI && !TheLoop->contains(UI)) - continue; - return InstDesc(false, I); - } - return InstDesc(I, RecurKind::FindLast); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index b4d8c081e576b..d6133ea6eca8f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9447,7 +9447,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { SmallPtrSet EpiWidenedPhis; for (VPRecipeBase &R : EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - if (isa(&R)) + if (isa(&R)) continue; EpiWidenedPhis.insert( cast(R.getVPSingleValue()->getUnderlyingValue())); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 68dacb813e4fd..b41853a9c972e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2346,8 +2346,9 @@ class LLVM_ABI_FOR_TEST VPWidenPHIRecipe : public VPSingleDefRecipe, } VPWidenPHIRecipe *clone() override { - auto *C = new VPWidenPHIRecipe(cast(getUnderlyingValue()), - getOperand(0), getDebugLoc(), Name); + auto *C = + new VPWidenPHIRecipe(cast_if_present(getUnderlyingValue()), + getOperand(0), getDebugLoc(), Name); for (VPValue *Op : llvm::drop_begin(operands())) C->addOperand(Op); return C; @@ -2360,6 +2361,10 @@ class LLVM_ABI_FOR_TEST VPWidenPHIRecipe : public VPSingleDefRecipe, /// Generate the phi/select nodes. void execute(VPTransformState &State) override; + /// Return the cost of this VPWidenPHIRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -3638,40 +3643,6 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { #endif }; -// TODO: Can we unify the PHI recipe hierarchy a bit? 
VPPredInstPHISC is close -// to this (just a PHI of a predicate), but isn't a header phi so can't -// be used for the mask of FindLastActive reductions. -// -// This is basically a clone of VPActiveLaneMaskPHIRecipe, but won't run into -// problems with transforms that expect there to only be a single ALM PHI, and -// can be ignored by other code looking for a (non-existent) underlying value. -class VPLastActiveMaskPHIRecipe : public VPHeaderPHIRecipe { -public: - VPLastActiveMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) - : VPHeaderPHIRecipe(VPDef::VPLastActiveMaskPHISC, nullptr, StartMask, - DL) {} - - ~VPLastActiveMaskPHIRecipe() override = default; - - VPLastActiveMaskPHIRecipe *clone() override { - auto *R = new VPLastActiveMaskPHIRecipe(getOperand(0), getDebugLoc()); - if (getNumOperands() == 2) - R->addOperand(getOperand(1)); - return R; - } - - VP_CLASSOF_IMPL(VPDef::VPLastActiveMaskPHISC); - - /// Generate the mask phi - void execute(VPTransformState &State) override; - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif -}; - /// A recipe for generating the phi node for the current index of elements, /// adjusted in accordance with EVL value. It starts at the start value of the /// canonical induction and gets incremented by EVL in each iteration of the diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 1dd26ee9da3fe..a135f827e4abf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -280,14 +280,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { TypeSwitch(V->getDefiningRecipe()) .Case([this](const auto *R) { - // Handle header phi recipes, except VPWidenIntOrFpInduction - // which needs special handling due it being possibly truncated. 
- // TODO: consider inferring/caching type of siblings, e.g., - // backedge value, here and in cases below. - return inferScalarType(R->getStartValue()); - }) + VPWidenPointerInductionRecipe, VPEVLBasedIVPHIRecipe>( + [this](const auto *R) { + // Handle header phi recipes, except VPWidenIntOrFpInduction + // which needs special handling due it being possibly truncated. + // TODO: consider inferring/caching type of siblings, e.g., + // backedge value, here and in cases below. + return inferScalarType(R->getStartValue()); + }) .Case( [](const auto *R) { return R->getScalarType(); }) .CasegetCFGPredecessor(0)); - Value *StartMask = State.get(getOperand(0)); - PHINode *Phi = - State.Builder.CreatePHI(StartMask->getType(), 2, "last.active.mask"); - Phi->addIncoming(StartMask, VectorPH); - State.set(this, Phi); -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void VPLastActiveMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << "LAST-ACTIVE-MASK-PHI "; - - printAsOperand(O, SlotTracker); - O << " = phi "; - printOperands(O, SlotTracker); -} -#endif - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 28fec1e3da100..141e138d0a58c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -5145,7 +5145,7 @@ void VPlanTransforms::convertFindLastRecurrences( VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); VPValue *False = Plan.getOrAddLiveIn( ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext())); - auto *MaskPHI = new VPLastActiveMaskPHIRecipe(False, DebugLoc()); + auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc()); Builder.insert(MaskPHI); // Add select for mask diff --git 
a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index e6cf992488826..d36975699c4a8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -48,7 +48,7 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { } bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) { - if (isa(V)) + if (isa(V)) return true; auto IsWideCanonicalIV = [](VPValue *A) { diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll index e802093fc7886..6d63e2f927df1 100644 --- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ -48,7 +48,7 @@ exit: ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> -; CHECK-NEXT: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir, vp<%8> +; CHECK-NEXT: WIDEN-PHI vp<%4> = phi [ ir, vector.ph ], [ vp<%8>, vector.body ] ; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> ; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> ; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr> @@ -95,7 +95,7 @@ exit: ; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exit.cmp = icmp eq i64 %iv.next, %N ; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: LAST-ACTIVE-MASK-PHI vp<%4> = phi ir, vp<%8> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-PHI vp<%4> = phi [ ir, vector.ph ], [ vp<%8>, vector.body ] ; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> ; CHECK-NEXT: 
Cost of 0 for VF vscale x 1: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> ; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = vector-pointer ir<%ld.addr> From 2c3fb8fd1d9e2aed5b10822c59dbca27a736fdd3 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 12 Nov 2025 14:25:37 +0000 Subject: [PATCH 05/24] Add regex matches to vplan test --- .../conditional-scalar-assignment-vplan.ll | 122 +++++++++--------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll index 6d63e2f927df1..23964f65b7aae 100644 --- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ -33,10 +33,10 @@ exit: ; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' { -; CHECK-NEXT: Live-in vp<%0> = VF -; CHECK-NEXT: Live-in vp<%1> = VF * UF -; CHECK-NEXT: Live-in vp<%2> = vector-trip-count -; CHECK-NEXT: Live-in ir<%N> = original trip-count +; CHECK-NEXT: Live-in vp<[[VF:%.*]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.*]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VECTC:%.*]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<[[ORIGTC:%.*]]> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph @@ -46,78 +46,78 @@ exit: ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> -; CHECK-NEXT: WIDEN-PHI vp<%4> = phi [ ir, vector.ph ], [ vp<%8>, vector.body ] -; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> -; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> -; CHECK-NEXT: vp<%6> = vector-pointer ir<%ld.addr> -; CHECK-NEXT: WIDEN ir<%ld> = load vp<%6> -; CHECK-NEXT: WIDEN ir<%select.cmp> = 
icmp slt ir<%a>, ir<%ld> -; CHECK-NEXT: EMIT vp<%7> = any-of ir<%select.cmp> -; CHECK-NEXT: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4> -; CHECK-NEXT: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi> -; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: EMIT vp<[[CIV:%.*]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEXNEXT:%.*]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[DATAPHI:%.*]]> = phi ir<-1>, vp<[[DATASELECT:%.*]]> +; CHECK-NEXT: WIDEN-PHI vp<[[MASKPHI:%.*]]> = phi [ ir, vector.ph ], [ vp<[[MASKSELECT:%.*]]>, vector.body ] +; CHECK-NEXT: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: CLONE ir<[[LDADDR:%.*]]> = getelementptr inbounds ir<%data>, vp<[[STEPS:%.*]]> +; CHECK-NEXT: vp<[[VPTR:%.*]]> = vector-pointer ir<[[LDADDR]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.*]]> = load vp<[[VPTR]]> +; CHECK-NEXT: WIDEN ir<[[SELECTCMP:%.*]]> = icmp slt ir<%a>, ir<[[LD]]> +; CHECK-NEXT: EMIT vp<[[ANYOF:%.*]]> = any-of ir<[[SELECTCMP]]> +; CHECK-NEXT: EMIT vp<[[MASKSELECT]]> = select vp<[[ANYOF]]>, ir<[[SELECTCMP]]>, vp<[[MASKPHI]]> +; CHECK-NEXT: EMIT vp<[[DATASELECT]]> = select vp<[[ANYOF]]>, ir<[[LD]]>, ir<[[DATAPHI]]> +; CHECK-NEXT: EMIT vp<[[INDEXNEXT]]> = add nuw vp<[[CIV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[INDEXNEXT]]>, vp<[[VECTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1> -; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2> -; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: EMIT vp<[[EXTRACTLAST:%.*]]> = extract-last-active vp<[[DATASELECT]]>, vp<[[MASKSELECT]]>, ir<-1> +; CHECK-NEXT: EMIT vp<[[TCCMP:%.*]]> = icmp eq ir<[[ORIGTC]]>, vp<[[VECTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[TCCMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph ; CHECK-EMPTY: ; 
CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: IR [[SELECTLCSSA:%.*]] = phi i32 [ [[SELECTDATA:%.*]], %loop ] (extra operand: vp<[[EXTRACTLAST]]> from middle.block) ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph: -; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<[[RESUMEVAL:%.*]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<[[MERGERDX:%.*]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb ] ; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv -; CHECK-NEXT: IR %ld = load i32, ptr %ld.addr, align 4 -; CHECK-NEXT: IR %select.cmp = icmp slt i32 %a, %ld -; CHECK-NEXT: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi -; CHECK-NEXT: IR %iv.next = add nuw nsw i64 %iv, 1 -; CHECK-NEXT: IR %exit.cmp = icmp eq i64 %iv.next, %N +; CHECK-NEXT: IR [[IV:%.*]] = phi i64 [ 0, %entry ], [ [[IVNEXT:%.*]], %loop ] (extra operand: vp<[[RESUMEVAL]]> from scalar.ph) +; CHECK-NEXT: IR [[DATAPHI]] = phi i32 [ -1, %entry ], [ [[SELECTDATA]], %loop ] (extra operand: vp<[[MERGERDX]]> from scalar.ph) +; CHECK-NEXT: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]] +; CHECK-NEXT: IR [[LD]] = load i32, ptr [[LDADDR]], align 4 +; CHECK-NEXT: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]] +; CHECK-NEXT: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]] +; CHECK-NEXT: IR 
[[IVNEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]] ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK: Cost of 1 for VF vscale x 1: induction instruction %iv.next = add nuw nsw i64 %iv, 1 -; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] -; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction %exit.cmp = icmp eq i64 %iv.next, %N -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<-1>, vp<%9> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-PHI vp<%4> = phi [ ir, vector.ph ], [ vp<%8>, vector.body ] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%5> = SCALAR-STEPS vp<%3>, ir<1>, vp<%0> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<%ld.addr> = getelementptr inbounds ir<%data>, vp<%5> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<%6> = vector-pointer ir<%ld.addr> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%ld> = load vp<%6> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<%select.cmp> = icmp slt ir<%a>, ir<%ld> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%7> = any-of ir<%select.cmp> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%8> = select vp<%7>, ir<%select.cmp>, vp<%4> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%9> = select vp<%7>, ir<%ld>, ir<%data.phi> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%index.next> = add nuw vp<%3>, vp<%1> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK: Cost of 1 for VF vscale x 1: induction instruction [[IVNEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction [[IV]] = phi i64 [ 0, %entry ], [ [[IVNEXT]], %loop ] +; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction [[EXITCMP:%.*]] = icmp eq i64 
[[IVNEXT]], [[ORIGTC]] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[CIV]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEXNEXT]]> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<[[DATAPHI]]> = phi ir<-1>, vp<[[DATASELECT]]> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-PHI vp<[[MASKPHI]]> = phi [ ir, vector.ph ], [ vp<[[MASKSELECT]]>, vector.body ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<[[LDADDR]]> = getelementptr inbounds ir<%data>, vp<[[STEPS:%.*]]> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<[[VPTR]]> = vector-pointer ir<[[LDADDR]]> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<[[LD]]> = load vp<[[VPTR]]> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<[[SELECTCMP]]> = icmp slt ir<%a>, ir<[[LD]]> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[ANYOF]]> = any-of ir<[[SELECTCMP]]> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[MASKSELECT]]> = select vp<[[ANYOF]]>, ir<[[SELECTCMP]]>, vp<[[MASKPHI]]> +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[DATASELECT]]> = select vp<[[ANYOF]]>, ir<[[LD]]>, ir<[[DATAPHI]]> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[INDEXNEXT]]> = add nuw vp<[[CIV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<[[INDEXNEXT]]>, vp<[[VECTC]]> ; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<-1>, ir-bb ] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] (extra operand: 
vp<%bc.merge.rdx> from scalar.ph) -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %ld = load i32, ptr %ld.addr, align 4 -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.cmp = icmp slt i32 %a, %ld -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %iv.next = add nuw nsw i64 %iv, 1 -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %exit.cmp = icmp eq i64 %iv.next, %N -; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<%11> = extract-last-active vp<%9>, vp<%8>, ir<-1> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<%cmp.n> = icmp eq ir<%N>, vp<%2> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<%cmp.n> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR %select.data.lcssa = phi i32 [ %select.data, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<[[RESUMEVAL]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<[[MERGERDX]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[IV]] = phi i64 [ 0, %entry ], [ [[IVNEXT]], %loop ] (extra operand: vp<[[RESUMEVAL]]> from scalar.ph) +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[DATAPHI]] = phi i32 [ -1, %entry ], [ [[SELECTDATA]], %loop ] (extra operand: vp<[[MERGERDX]]> from scalar.ph) +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[LD]] = load i32, ptr [[LDADDR]], align 4 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]] +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]] +; CHECK-NEXT: Cost of 0 for VF 
vscale x 1: IR [[IVNEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]] +; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[EXTRACTLAST]]> = extract-last-active vp<[[DATASELECT]]>, vp<[[MASKSELECT]]>, ir<-1> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[TCCMP]]> = icmp eq ir<[[ORIGTC]]>, vp<[[VECTC]]> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<[[TCCMP]]> +; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTLCSSA]] = phi i32 [ [[SELECTDATA]], %loop ] (extra operand: vp<[[EXTRACTLAST]]> from middle.block) From f1b8bcdfc8615078a128a98ddd89cb03c43a139f Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 12 Nov 2025 15:22:22 +0000 Subject: [PATCH 06/24] Use find_if --- .../Transforms/Vectorize/VPlanTransforms.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 141e138d0a58c..e3f53466a56be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -5160,18 +5160,15 @@ void VPlanTransforms::convertFindLastRecurrences( SR->replaceAllUsesWith(DataSelect); SR->eraseFromParent(); - // Find final reduction and replace it with an + // Find final reduction computation and replace it with an // extract.last.active intrinsic. 
- VPInstruction *RdxResult = nullptr; - for (VPUser *U : DataSelect->users()) { + VPUser **ComputeRdx = find_if(DataSelect->users(), [](VPUser *U) { VPInstruction *I = dyn_cast(U); - if (I && I->getOpcode() == VPInstruction::ComputeReductionResult) { - RdxResult = I; - break; - } - } - - assert(RdxResult); + return I && I->getOpcode() == VPInstruction::ComputeReductionResult; + }); + assert(ComputeRdx != DataSelect->user_end() && + "Unable to find Reduction Result Recipe"); + VPInstruction *RdxResult = cast(*ComputeRdx); Builder.setInsertPoint(RdxResult); auto *ExtractLastActive = Builder.createNaryOp(VPInstruction::ExtractLastActive, From 5e75d92c386a927d68095785c2f4739d5bb5c34e Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 19 Nov 2025 11:54:39 +0000 Subject: [PATCH 07/24] Handle FindLast properly in unrolling, test --- llvm/lib/Transforms/Utils/LoopUnroll.cpp | 3 +- .../LoopUnroll/partial-unroll-reductions.ll | 52 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 0f256398e5b1e..d3285e32c5dee 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -1261,7 +1261,8 @@ llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, // TODO: Handle additional reductions, including min-max reductions. 
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || RecurrenceDescriptor::isFindIVRecurrenceKind(RK) || - RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) + RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || + RecurrenceDescriptor::isFindLastRecurrenceKind(RK)) return std::nullopt; if (RdxDesc.hasExactFPMath()) diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll index e94a368d3ded0..e5e969d638acd 100644 --- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -683,3 +683,55 @@ loop: exit: ret <4 x i32> %rdx.next } + +define i32 @test_findlast_reduction(ptr %data, i32 %a) { +; CHECK-LABEL: define i32 @test_findlast_reduction( +; CHECK-SAME: ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; CHECK-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; CHECK-NEXT: [[SELECT_DATA:%.*]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LD_ADDR_1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[LD_1:%.*]] = load i32, ptr [[LD_ADDR_1]], align 4 +; CHECK-NEXT: [[SELECT_CMP_1:%.*]] = icmp slt i32 [[A]], [[LD_1]] +; CHECK-NEXT: [[SELECT_DATA_1:%.*]] = select i1 [[SELECT_CMP_1]], i32 [[LD_1]], i32 [[SELECT_DATA]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[LD_ADDR_2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: 
[[LD_2:%.*]] = load i32, ptr [[LD_ADDR_2]], align 4 +; CHECK-NEXT: [[SELECT_CMP_2:%.*]] = icmp slt i32 [[A]], [[LD_2]] +; CHECK-NEXT: [[SELECT_DATA_2:%.*]] = select i1 [[SELECT_CMP_2]], i32 [[LD_2]], i32 [[SELECT_DATA_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[LD_ADDR_3:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[LD_3:%.*]] = load i32, ptr [[LD_ADDR_3]], align 4 +; CHECK-NEXT: [[SELECT_CMP_3:%.*]] = icmp slt i32 [[A]], [[LD_3]] +; CHECK-NEXT: [[SELECT_DATA_3]] = select i1 [[SELECT_CMP_3]], i32 [[LD_3]], i32 [[SELECT_DATA_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[EXIT_CMP_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 200 +; CHECK-NEXT: br i1 [[EXIT_CMP_3]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA_3]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, 200 + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} From 2cfbbfb0a69c83947aabb49642d54a35d0951d03 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 19 Nov 2025 12:01:48 +0000 Subject: [PATCH 08/24] Remove instcombine from AArch64 FindLast runlines --- .../AArch64/conditional-scalar-assignment.ll | 94 +++++++++++-------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll index 25c698f3df245..8b80c161c438c 100644 
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 -; RUN: opt -passes=loop-vectorize,instcombine -S < %s 2>&1 | FileCheck %s --check-prefix=NEON -; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE +; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=NEON +; RUN: opt -passes=loop-vectorize -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE ;; The following run line caused an ICE before using a dedicated FindLast PHI recipe. ;; We're not looking at the resulting IR, just confirming it doesn't crash. -; RUN: opt -passes=loop-vectorize,instcombine -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null +; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null target triple = "aarch64-linux-gnu" @@ -16,7 +16,7 @@ define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { ; NEON: [[LOOP]]: ; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] ; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 ; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] ; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] @@ -24,41 +24,41 @@ define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { ; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; NEON: [[EXIT]]: -; 
NEON-NEXT: ret i32 [[SELECT_DATA]] +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]] ; ; SVE-LABEL: define i32 @simple_csa_int_select( ; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { ; SVE-NEXT: [[ENTRY:.*]]: -; SVE-NEXT: [[A_FR:%.*]] = freeze i32 [[A]] ; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] ; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; SVE: [[VECTOR_PH]]: ; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 ; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement zeroinitializer, i32 [[A_FR]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 ; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SVE-NEXT: br label %[[VECTOR_BODY:.*]] ; SVE: [[VECTOR_BODY]]: ; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] -; SVE-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP4:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] ; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] ; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SVE-NEXT: [[WIDE_LOAD_FR:%.*]] = freeze [[WIDE_LOAD]] -; SVE-NEXT: [[TMP7:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD_FR]] +; SVE-NEXT: [[TMP13:%.*]] = icmp 
slt [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; SVE-NEXT: [[TMP7:%.*]] = freeze [[TMP13]] ; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP7]]) -; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], [[TMP7]], [[LAST_ACTIVE_MASK]] -; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], [[WIDE_LOAD_FR]], [[VEC_PHI]] +; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], [[TMP13]], [[TMP4]] +; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], [[WIDE_LOAD]], [[VEC_PHI]] ; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SVE: [[MIDDLE_BLOCK]]: ; SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP10]], [[TMP9]], i32 -1) -; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; SVE: [[SCALAR_PH]]: ; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] @@ -69,7 +69,7 @@ define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { ; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] ; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] ; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 -; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A_FR]], [[LD]] +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] ; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] ; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] @@ -104,7 +104,7 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) { ; NEON: [[LOOP]]: ; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; 
NEON-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]] ; NEON-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 ; NEON-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 ; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] @@ -113,7 +113,8 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) { ; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; NEON: [[EXIT]]: -; NEON-NEXT: ret ptr [[SELECT_DATA]] +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret ptr [[SELECT_DATA_LCSSA]] ; ; SVE-LABEL: define ptr @simple_csa_ptr_select( ; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] { @@ -122,7 +123,7 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) { ; SVE: [[LOOP]]: ; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; SVE-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw ptr, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]] ; SVE-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 ; SVE-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 ; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] @@ -131,7 +132,8 @@ define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) { ; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; SVE: [[EXIT]]: -; SVE-NEXT: ret ptr [[SELECT_DATA]] +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], 
%[[LOOP]] ] +; SVE-NEXT: ret ptr [[SELECT_DATA_LCSSA]] ; entry: br label %loop @@ -160,7 +162,7 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) { ; NEON: [[LOOP]]: ; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; NEON-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]] ; NEON-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 ; NEON-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] ; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] @@ -168,7 +170,8 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) { ; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; NEON: [[EXIT]]: -; NEON-NEXT: ret float [[SELECT_DATA]] +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret float [[SELECT_DATA_LCSSA]] ; ; SVE-LABEL: define float @simple_csa_float_select( ; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] { @@ -177,7 +180,7 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) { ; SVE: [[LOOP]]: ; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; SVE-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw float, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]] ; SVE-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 ; SVE-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] ; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] @@ 
-185,7 +188,8 @@ define float @simple_csa_float_select(i64 %N, ptr %data, float %a) { ; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; SVE: [[EXIT]]: -; SVE-NEXT: ret float [[SELECT_DATA]] +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret float [[SELECT_DATA_LCSSA]] ; entry: br label %loop @@ -213,17 +217,18 @@ define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) { ; NEON: [[LOOP]]: ; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] ; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 ; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] ; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] -; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]] +; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]] ; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 ; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; NEON: [[EXIT]]: -; NEON-NEXT: ret i32 [[SELECT_DATA]] +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]] ; ; SVE-LABEL: define i32 @multi_user_csa_int_select( ; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { @@ -232,17 +237,18 @@ define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) { ; SVE: [[LOOP]]: ; SVE-NEXT: 
[[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] ; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 ; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] ; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] -; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[RESULTS]], i64 [[IV]] +; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]] ; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 ; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; SVE: [[EXIT]]: -; SVE-NEXT: ret i32 [[SELECT_DATA]] +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] ; entry: br label %loop @@ -274,7 +280,7 @@ define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) { ; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] ; NEON-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] -; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] ; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 ; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] ; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] @@ -283,8 +289,10 @@ define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr 
%data, i32 %a) { ; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; NEON: [[EXIT]]: -; NEON-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32 -; NEON-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]] +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ] +; NEON-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32 +; NEON-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]] ; NEON-NEXT: ret i32 [[RES]] ; ; SVE-LABEL: define i32 @multi_use_cmp_for_csa_int_select( @@ -295,7 +303,7 @@ define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) { ; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] ; SVE-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] -; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] ; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 ; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] ; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] @@ -304,8 +312,10 @@ define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) { ; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; SVE: [[EXIT]]: -; SVE-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX]] to i32 -; SVE-NEXT: [[RES:%.*]] = add i32 [[SELECT_DATA]], [[IDX]] +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ] +; SVE-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32 +; 
SVE-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]] ; SVE-NEXT: ret i32 [[RES]] ; entry: @@ -339,11 +349,11 @@ define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i3 ; NEON: [[LOOP]]: ; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; NEON-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]] +; NEON-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]] ; NEON-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 ; NEON-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] ; NEON-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] -; NEON-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]] +; NEON-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]] ; NEON-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 ; NEON-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] ; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] @@ -351,7 +361,8 @@ define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i3 ; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; NEON: [[EXIT]]: -; NEON-NEXT: ret i32 [[SELECT_DATA]] +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]] ; ; SVE-LABEL: define i32 @chained_select_for_csa_int_select( ; SVE-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { @@ -360,11 +371,11 @@ define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i3 ; SVE: [[LOOP]]: ; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; SVE-NEXT: 
[[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] -; SVE-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA1]], i64 [[IV]] +; SVE-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]] ; SVE-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 ; SVE-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] ; SVE-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] -; SVE-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds nuw i32, ptr [[DATA2]], i64 [[IV]] +; SVE-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]] ; SVE-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 ; SVE-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] ; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] @@ -372,7 +383,8 @@ define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i3 ; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] ; SVE: [[EXIT]]: -; SVE-NEXT: ret i32 [[SELECT_DATA]] +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] ; entry: br label %loop From 436fd2ff28b8e9bbdcf08fef45a8968379b9d5a7 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 19 Nov 2025 12:15:07 +0000 Subject: [PATCH 09/24] Switched vplan print test to use fixed VF instead of scalable --- .../conditional-scalar-assignment-vplan.ll | 38 +------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll index 23964f65b7aae..79e5ca2cc7a1d 100644 --- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ 
-1,6 +1,5 @@ ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ -; RUN: -scalable-vectorization=on -force-target-supports-scalable-vectors \ -; RUN: -disable-output 2>&1 < %s | FileCheck %s +; RUN: -force-vector-width=4 -disable-output 2>&1 < %s | FileCheck %s ; This function is derived from the following C program: @@ -31,8 +30,7 @@ exit: ret i32 %select.data } - -; CHECK: VPlan 'Initial VPlan for VF={vscale x 1},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VF:%.*]]> = VF ; CHECK-NEXT: Live-in vp<[[VFxUF:%.*]]> = VF * UF ; CHECK-NEXT: Live-in vp<[[VECTC:%.*]]> = vector-trip-count @@ -89,35 +87,3 @@ exit: ; CHECK-NEXT: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]] ; CHECK-NEXT: No successors ; CHECK-NEXT: } - -; CHECK: Cost of 1 for VF vscale x 1: induction instruction [[IVNEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: Cost of 1 for VF vscale x 1: induction instruction [[IV]] = phi i64 [ 0, %entry ], [ [[IVNEXT]], %loop ] -; CHECK-NEXT: Cost of 1 for VF vscale x 1: exit condition instruction [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[CIV]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEXNEXT]]> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-REDUCTION-PHI ir<[[DATAPHI]]> = phi ir<-1>, vp<[[DATASELECT]]> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN-PHI vp<[[MASKPHI]]> = phi [ ir, vector.ph ], [ vp<[[MASKSELECT]]>, vector.body ] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: CLONE ir<[[LDADDR]]> = getelementptr inbounds ir<%data>, vp<[[STEPS:%.*]]> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: vp<[[VPTR]]> = vector-pointer ir<[[LDADDR]]> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<[[LD]]> = load vp<[[VPTR]]> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: WIDEN ir<[[SELECTCMP]]> = icmp slt ir<%a>, ir<[[LD]]> -; CHECK-NEXT: Cost of 1 
for VF vscale x 1: EMIT vp<[[ANYOF]]> = any-of ir<[[SELECTCMP]]> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[MASKSELECT]]> = select vp<[[ANYOF]]>, ir<[[SELECTCMP]]>, vp<[[MASKPHI]]> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[DATASELECT]]> = select vp<[[ANYOF]]>, ir<[[LD]]>, ir<[[DATAPHI]]> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[INDEXNEXT]]> = add nuw vp<[[CIV]]>, vp<[[VFxUF]]> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-count vp<[[INDEXNEXT]]>, vp<[[VECTC]]> -; CHECK-NEXT: Cost of 1 for VF vscale x 1: vector loop backedge -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<[[RESUMEVAL]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb ] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT-SCALAR vp<[[MERGERDX]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb ] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[IV]] = phi i64 [ 0, %entry ], [ [[IVNEXT]], %loop ] (extra operand: vp<[[RESUMEVAL]]> from scalar.ph) -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[DATAPHI]] = phi i32 [ -1, %entry ], [ [[SELECTDATA]], %loop ] (extra operand: vp<[[MERGERDX]]> from scalar.ph) -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[LD]] = load i32, ptr [[LDADDR]], align 4 -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]] -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[IVNEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]] -; CHECK-NEXT: Cost of 1 for VF vscale x 1: EMIT vp<[[EXTRACTLAST]]> = extract-last-active vp<[[DATASELECT]]>, vp<[[MASKSELECT]]>, ir<-1> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT vp<[[TCCMP]]> = icmp eq ir<[[ORIGTC]]>, vp<[[VECTC]]> -; 
CHECK-NEXT: Cost of 0 for VF vscale x 1: EMIT branch-on-cond vp<[[TCCMP]]> -; CHECK-NEXT: Cost of 0 for VF vscale x 1: IR [[SELECTLCSSA]] = phi i32 [ [[SELECTDATA]], %loop ] (extra operand: vp<[[EXTRACTLAST]]> from middle.block) From 74f035155fb43b9422f3a6e87f975d9cf22616bd Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 19 Nov 2025 14:03:42 +0000 Subject: [PATCH 10/24] Moved vectorized epilogue ICE test to separate file, removed unnecessary check lines --- .../AArch64/conditional-scalar-assignment.ll | 4 - .../conditional-scalar-assignment-vplan.ll | 12 --- .../AArch64/findlast-epilogue-loop.ll | 78 +++++++++++++++++++ 3 files changed, 78 insertions(+), 16 deletions(-) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll index 8b80c161c438c..01e76cae2db7f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -2,10 +2,6 @@ ; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=NEON ; RUN: opt -passes=loop-vectorize -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE -;; The following run line caused an ICE before using a dedicated FindLast PHI recipe. -;; We're not looking at the resulting IR, just confirming it doesn't crash. 
-; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 > /dev/null - target triple = "aarch64-linux-gnu" define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll index 79e5ca2cc7a1d..64dc57f7ec492 100644 --- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ -75,15 +75,3 @@ exit: ; CHECK-NEXT: EMIT-SCALAR vp<[[RESUMEVAL:%.*]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb ] ; CHECK-NEXT: EMIT-SCALAR vp<[[MERGERDX:%.*]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb ] ; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR [[IV:%.*]] = phi i64 [ 0, %entry ], [ [[IVNEXT:%.*]], %loop ] (extra operand: vp<[[RESUMEVAL]]> from scalar.ph) -; CHECK-NEXT: IR [[DATAPHI]] = phi i32 [ -1, %entry ], [ [[SELECTDATA]], %loop ] (extra operand: vp<[[MERGERDX]]> from scalar.ph) -; CHECK-NEXT: IR [[LDADDR]] = getelementptr inbounds i32, ptr %data, i64 [[IV]] -; CHECK-NEXT: IR [[LD]] = load i32, ptr [[LDADDR]], align 4 -; CHECK-NEXT: IR [[SELECTCMP]] = icmp slt i32 %a, [[LD]] -; CHECK-NEXT: IR [[SELECTDATA]] = select i1 [[SELECTCMP]], i32 [[LD]], i32 [[DATAPHI]] -; CHECK-NEXT: IR [[IVNEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: IR [[EXITCMP:%.*]] = icmp eq i64 [[IVNEXT]], [[ORIGTC]] -; CHECK-NEXT: No successors -; CHECK-NEXT: } diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll new file mode 100644 index 0000000000000..8ceff41be73b3 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 | FileCheck %s + +; This test is mainly confirming that we don't crash when vectorizing a findlast +; reduction and trying to use a vectorized epilogue loop. Once support for that +; has been added, this test can be removed. + +target triple = "aarch64-linux-gnu" + +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +; CHECK-LABEL: define i32 @simple_csa_int_select( +; CHECK-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP7:%.*]] = 
freeze [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP7]]) +; CHECK-NEXT: [[TMP9]] = select i1 [[TMP8]], [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP10]] = select i1 [[TMP8]], [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP10]], [[TMP9]], i32 -1) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; CHECK-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; CHECK-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + 
+loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} From efd9d175dfad91ddde1430f46ff8d34ed520c7f0 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 19 Nov 2025 16:43:32 +0000 Subject: [PATCH 11/24] Updated check output after rebase --- .../LoopVectorize/iv-select-cmp-decreasing.ll | 8 +++---- .../LoopVectorize/iv-select-cmp-no-wrap.ll | 2 +- .../iv-select-cmp-non-const-iv-start.ll | 16 ++++++------- .../LoopVectorize/iv-select-cmp-trunc.ll | 24 +++++++++---------- .../Transforms/LoopVectorize/iv-select-cmp.ll | 12 +++++----- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index 503837894a7b4..cdd76957afcf1 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -1135,7 +1135,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 ; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 ; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; IC1VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IC1VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], ; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] ; 
IC1VF4: [[VECTOR_BODY]]: ; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -1160,7 +1160,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 ; IC1VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; IC1VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] ; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) ; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IC1VF4: [[MIDDLE_BLOCK]]: @@ -1204,7 +1204,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 ; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 ; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; IC4VF4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IC4VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], ; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] ; IC4VF4: [[VECTOR_BODY]]: ; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -1229,7 +1229,7 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 ; IC4VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; IC4VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] ; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; 
IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) ; IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IC4VF4: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll index 18f1470aba3a5..377afaea8f7fd 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll @@ -168,7 +168,7 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll index 7a89c32b197d3..115956b977cee 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll @@ -21,7 +21,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 % ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1-NEXT: 
[[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], ; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC1: [[VECTOR_BODY]]: ; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -37,7 +37,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 % ; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -83,7 +83,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 % ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], ; 
CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC4: [[VECTOR_BODY]]: ; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -99,7 +99,7 @@ define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 % ; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -192,7 +192,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT2]], ; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC1: [[VECTOR_BODY]]: ; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -208,7 +208,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, ; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] ; 
CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -257,7 +257,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT2]], ; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC4: [[VECTOR_BODY]]: ; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -273,7 +273,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, ; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index 839ea7ce7e7a4..ad5189c35cfd7 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -709,7 +709,7 @@ define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -763,7 +763,7 @@ define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { ; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -874,7 +874,7 @@ define 
i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { ; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -929,7 +929,7 @@ define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { ; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1037,7 +1037,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { ; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> 
[[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1066,7 +1066,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { ; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 ; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1143,7 +1143,7 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1199,7 +1199,7 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; 
CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1296,7 +1296,7 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1338,7 +1338,7 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 ; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP14:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1433,7 +1433,7 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i16> [[VEC_IND]], splat (i16 4) ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1495,7 +1495,7 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i16> [[VEC_IND]], splat (i16 4) ; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll index 6001ee32ca62a..e3df1d3c39cca 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll @@ -1977,7 +1977,7 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. 
; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -2035,7 +2035,7 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. ; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -2125,7 +2125,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0 ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; 
CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT2]], ; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC1: [[VECTOR_BODY]]: ; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -2143,7 +2143,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -2185,7 +2185,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0 ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT2]], ; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC4: [[VECTOR_BODY]]: ; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -2203,7 +2203,7 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; 
CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: From 72a1b8c33e2daf2ddcd0e86156d2991a6903d906 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 20 Nov 2025 12:03:41 +0000 Subject: [PATCH 12/24] Move epilogue vectorization test back to LV --- .../AArch64/findlast-epilogue-loop.ll | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename llvm/test/Transforms/{PhaseOrdering => LoopVectorize}/AArch64/findlast-epilogue-loop.ll (95%) diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll b/llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll similarity index 95% rename from llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll rename to llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll index 8ceff41be73b3..8e4a8457414bc 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/findlast-epilogue-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 ; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 | FileCheck %s -; This test is mainly confirming that we don't crash when vectorizing a findlast -; reduction and trying to use a vectorized epilogue loop. 
Once support for that -; has been added, this test can be removed. +;; This test is currently ensuring we don't crash when vectorizing loops with +;; conditional scalar assignment when epilogue vectorization is either requested +;; or costed as profitable. target triple = "aarch64-linux-gnu" From 9fe68d25ac9cd1baafea8ca41e1fb962f627c5bf Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 20 Nov 2025 12:28:59 +0000 Subject: [PATCH 13/24] Improve IVDesc comments --- llvm/lib/Analysis/IVDescriptors.cpp | 36 ++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index f4130440a2f96..71890f876ba48 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -723,9 +723,15 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // if (src[i] > 3) // r = i; // } +// or like this: +// int r = 0; +// for (int i = 0; i < n; i++) { +// if (a[i] > 3) +// r = a[i]; +// } // The reduction value (r) is derived from either the values of an induction -// variable (i) sequence, or from the start value (0). The LLVM IR generated for -// such loops would be as follows: +// variable (i) sequence, an arbitrary value (a[i]), or from the start value +// (0). The LLVM IR generated for such loops would be as follows: // for.body: // %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ] // %i = phi i32 [ %inc, %for.body ], [ 0, %entry ] @@ -734,19 +740,23 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // %spec.select = select i1 %cmp, i32 %i, i32 %r // %inc = add nsw i32 %i, 1 // ... -// Since 'i' is an induction variable, the reduction value after the loop will -// be the maximum (increasing induction) or minimum (decreasing induction) value -// of 'i' that the condition (src[i] > 3) is satisfied, or the start value (0 in -// the example above). 
When the start value of the induction variable 'i' is -// greater than the minimum (increasing induction) or maximum (decreasing -// induction) value of the data type, we can use the minimum (increasing -// induction) or maximum (decreasing induction) value of the data type as a -// sentinel value to replace the start value. This allows us to perform a single -// reduction max (increasing induction) or min (decreasing induction) operation -// to obtain the final reduction result. +// When searching for an induction variable (i), the reduction value after the +// loop will be the maximum (increasing induction) or minimum (decreasing +// induction) value of 'i' that the condition (src[i] > 3) is satisfied, or the +// start value (0 in the example above). When the start value of the induction +// variable 'i' is greater than the minimum (increasing induction) or maximum +// (decreasing induction) value of the data type, we can use the minimum +// (increasing induction) or maximum (decreasing induction) value of the data +// type as a sentinel value to replace the start value. This allows us to +// perform a single reduction max (increasing induction) or min (decreasing +// induction) operation to obtain the final reduction result. // TODO: It is possible to solve the case where the start value is the minimum // value of the data type or a non-constant value by using mask and multiple // reduction operations. +// +// When searching for an arbitrary value (such as 'a[i]'), the reduction value +// will either be the initial value (0) if the condition was never met, or the +// value of a[i] in the most recent loop iteration where the condition was met. 
RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, PHINode *OrigPhi, Instruction *I, @@ -761,7 +771,7 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, // We are looking for selects of the form: // select(cmp(), phi, value) or // select(cmp(), value, phi) - // where 'value' is be a loop induction variable + // where 'value' must be a loop induction variable // (for FindFirstIV/FindLastIV) or an arbitrary value (for FindLast). // TODO: Match selects with multi-use cmp conditions. Value *NonRdxPhi = nullptr; From 9cbdf2160d693c3805f8a167e07e7fe696f7b03d Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 20 Nov 2025 12:30:11 +0000 Subject: [PATCH 14/24] Remove traces of dedicated FindLast phi recipe --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 - llvm/lib/Transforms/Vectorize/VPlan.h | 1 - llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 - 3 files changed, 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d6133ea6eca8f..876f3fbef2d6f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4084,7 +4084,6 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, continue; case VPDef::VPReductionSC: case VPDef::VPActiveLaneMaskPHISC: - case VPDef::VPLastActiveMaskPHISC: case VPDef::VPWidenCallSC: case VPDef::VPWidenCanonicalIVSC: case VPDef::VPWidenCastSC: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index b41853a9c972e..7e8d33b448331 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -562,7 +562,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPPredInstPHISC: case VPRecipeBase::VPCanonicalIVPHISC: case VPRecipeBase::VPActiveLaneMaskPHISC: - case VPRecipeBase::VPLastActiveMaskPHISC: case 
VPRecipeBase::VPFirstOrderRecurrencePHISC: case VPRecipeBase::VPWidenPHISC: case VPRecipeBase::VPWidenIntOrFpInductionSC: diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 7a488973010b9..b9f5847ec731c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -373,7 +373,6 @@ class VPDef { // VPHeaderPHIRecipe need to be kept together. VPCanonicalIVPHISC, VPActiveLaneMaskPHISC, - VPLastActiveMaskPHISC, VPEVLBasedIVPHISC, VPFirstOrderRecurrencePHISC, VPWidenIntOrFpInductionSC, From bb8106ffe129b4612f9c7a6c6f9264d411d764b0 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 20 Nov 2025 12:31:04 +0000 Subject: [PATCH 15/24] Move and improve convertFindLastRecurrences --- .../Vectorize/VPlanConstruction.cpp | 63 ++++++++++++++++ .../Transforms/Vectorize/VPlanTransforms.cpp | 72 ------------------- 2 files changed, 63 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 329b62cee4fce..4ceeb9a6558cd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -1148,3 +1148,66 @@ bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan) { } return true; } + +void VPlanTransforms::convertFindLastRecurrences( + VPlan &Plan, VPRecipeBuilder &RecipeBuilder) { + if (Plan.hasScalarVFOnly()) + return; + + // We want to create the following nodes: + // vec.body: + // mask.phi = phi [ all.false, vec.ph ], [ new.mask, vec.body ] + // ...data.phi already exists, but needs updating... + // data.phi = phi [ default.val, vec.ph ], [ new.data, vec.body ] + // + // ...'data' and 'compare' created by existing nodes... 
+ // + // any_active = i1 any_of_reduction(compare) + // new.mask = select any_active, compare, mask.phi + // new.data = select any_active, data, data.phi + // + // middle.block: + // result = extract-last-active new.data, new.mask, default.val + + for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + auto *PhiR = dyn_cast(&Phi); + if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind( + PhiR->getRecurrenceKind())) + continue; + + // Find the condition for the select + auto *SR = cast(&PhiR->getBackedgeRecipe()); + VPValue *Cond = SR->getCond(); + + // Add mask phi + VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); + VPValue *False = Plan.getOrAddLiveIn( + ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext())); + auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc()); + Builder.insert(MaskPHI); + + // Add select for mask + Builder.setInsertPoint(SR); + VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); + VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); + MaskPHI->addOperand(MaskSelect); + + // Replace select for data + VPValue *DataSelect = Builder.createSelect( + AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc()); + SR->replaceAllUsesWith(DataSelect); + SR->eraseFromParent(); + + // Find final reduction computation and replace it with an + // extract.last.active intrinsic. 
+ VPInstruction *RdxResult = findComputeReductionResult(PhiR); + assert(RdxResult && "Unable to find Reduction Result Recipe"); + Builder.setInsertPoint(RdxResult); + auto *ExtractLastActive = + Builder.createNaryOp(VPInstruction::ExtractLastActive, + {DataSelect, MaskSelect, PhiR->getStartValue()}, + RdxResult->getDebugLoc()); + RdxResult->replaceAllUsesWith(ExtractLastActive); + RdxResult->eraseFromParent(); + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e3f53466a56be..38024aa6897fc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -5106,75 +5106,3 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan, } } } - -void VPlanTransforms::convertFindLastRecurrences( - VPlan &Plan, VPRecipeBuilder &RecipeBuilder) { - - // May need to do something better than this? - if (Plan.hasScalarVFOnly()) - return; - - // We want to create the following nodes: - // vec.body: - // mask.phi = phi [ all.false, vec.ph ], [ new.mask, vec.body ] - // ...data.phi already exists, but needs updating... - // data.phi = phi [ default.val, vec.ph ], [ new.data, vec.body ] - // - // ...'data' and 'compare' created by existing nodes... 
- // - // any_active = i1 any_of_reduction(compare) - // new.mask = select any_active, compare, mask.phi - // new.data = select any_active, data, data.phi - // - // middle.block: - // result = extract-last-active new.data, new.mask, default.val - - for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - auto *PhiR = dyn_cast(&Phi); - if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind( - PhiR->getRecurrenceKind())) - continue; - - // Find the condition for the select - auto *SR = dyn_cast(&PhiR->getBackedgeRecipe()); - if (!SR) - continue; - VPValue *Cond = SR->getCond(); - - // Add mask phi - VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); - VPValue *False = Plan.getOrAddLiveIn( - ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext())); - auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc()); - Builder.insert(MaskPHI); - - // Add select for mask - Builder.setInsertPoint(SR); - VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); - VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); - MaskPHI->addOperand(MaskSelect); - - // Replace select for data - VPValue *DataSelect = Builder.createSelect( - AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc()); - SR->replaceAllUsesWith(DataSelect); - SR->eraseFromParent(); - - // Find final reduction computation and replace it with an - // extract.last.active intrinsic. 
- VPUser **ComputeRdx = find_if(DataSelect->users(), [](VPUser *U) { - VPInstruction *I = dyn_cast(U); - return I && I->getOpcode() == VPInstruction::ComputeReductionResult; - }); - assert(ComputeRdx != DataSelect->user_end() && - "Unable to find Reduction Result Recipe"); - VPInstruction *RdxResult = cast(*ComputeRdx); - Builder.setInsertPoint(RdxResult); - auto *ExtractLastActive = - Builder.createNaryOp(VPInstruction::ExtractLastActive, - {DataSelect, MaskSelect, PhiR->getStartValue()}, - RdxResult->getDebugLoc()); - RdxResult->replaceAllUsesWith(ExtractLastActive); - RdxResult->eraseFromParent(); - } -} From 5d1be36d14c74eaadb235aad33962d835ace37c2 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 20 Nov 2025 12:31:38 +0000 Subject: [PATCH 16/24] Add test with extra user for select --- .../AArch64/conditional-scalar-assignment.ll | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll index 01e76cae2db7f..e777659f003c8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -403,3 +403,64 @@ loop: exit: ret i32 %select.data } + +define i32 @csa_with_extra_use_of_select(i64 %N, ptr readonly %data, ptr noalias %out, i32 %a) { +; NEON-LABEL: define i32 @csa_with_extra_use_of_select( +; NEON-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], 
align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]] +; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4 +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; SVE-LABEL: define i32 @csa_with_extra_use_of_select( +; SVE-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]] +; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4 +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = 
getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %st.addr = getelementptr inbounds i32, ptr %out, i64 %iv + store i32 %select.data, ptr %st.addr, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} From 37084864fe3c3cd9af89d28f340732ca806fd707 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 25 Nov 2025 13:54:01 +0000 Subject: [PATCH 17/24] Call xform earlier --- .../Transforms/Vectorize/LoopVectorize.cpp | 9 +- .../Vectorize/VPlanConstruction.cpp | 142 ++++++++++-------- .../Transforms/Vectorize/VPlanTransforms.h | 14 +- 3 files changed, 93 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 876f3fbef2d6f..970270751f5f4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8596,6 +8596,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( *Plan)) return nullptr; + // Create whole-vector selects for find-last recurrences. + if (!VPlanTransforms::runPass(VPlanTransforms::handleFindLastReductions, + *Plan, RecipeBuilder)) + return nullptr; + // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction @@ -8634,10 +8639,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( *Plan, Builder)) return nullptr; - // Create whole-vector selects for find-last recurrences. 
- VPlanTransforms::runPass(VPlanTransforms::convertFindLastRecurrences, *Plan, - RecipeBuilder); - if (useActiveLaneMask(Style)) { // TODO: Move checks to VPlanTransforms::addActiveLaneMask once // TailFoldingStyle is visible there. diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 4ceeb9a6558cd..ff23417f0af49 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "LoopVectorizationPlanner.h" +#include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" @@ -997,6 +998,85 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { return true; } +bool VPlanTransforms::handleFindLastReductions(VPlan &Plan, + VPRecipeBuilder &RecipeBuilder) { + if (Plan.hasScalarVFOnly()) + return false; + + // We want to create the following nodes: + // vec.body: + // mask.phi = phi [ all.false, vec.ph ], [ new.mask, vec.body ] + // ...data.phi already exists, but needs updating... + // data.phi = phi [ default.val, vec.ph ], [ new.data, vec.body ] + // + // ...'data' and 'compare' created by existing nodes... 
+ // + // any_active = i1 any_of_reduction(compare) + // new.mask = select any_active, compare, mask.phi + // new.data = select any_active, data, data.phi + // + // middle.block: + // result = extract-last-active new.data, new.mask, default.val + + for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + auto *PhiR = dyn_cast(&Phi); + if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind( + PhiR->getRecurrenceKind())) + continue; + + // Find the condition for the select + auto *SelectR = cast(&PhiR->getBackedgeRecipe()); + VPValue *Cond = nullptr; + if (auto *WidenR = dyn_cast(SelectR)) + Cond = WidenR->getCond(); + else if (auto *RepR = dyn_cast(SelectR)) { + auto *SI = dyn_cast(RepR->getUnderlyingInstr()); + if (!SI) + return false; + auto *CmpI = dyn_cast(SI->getCondition()); + if (!CmpI) + return false; + Cond = RecipeBuilder.getRecipe(CmpI)->getVPSingleValue(); + } else + return false; + + // Add mask phi + VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); + VPValue *False = Plan.getOrAddLiveIn( + ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext())); + auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc()); + Builder.insert(MaskPHI); + + // Add select for mask + Builder.setInsertPoint(SelectR); + VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); + VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); + MaskPHI->addOperand(MaskSelect); + + // Replace select for data + VPValue *DataSelect = + Builder.createSelect(AnyOf, SelectR->getOperand(1), + SelectR->getOperand(2), SelectR->getDebugLoc()); + SelectR->replaceAllUsesWith(DataSelect); + SelectR->eraseFromParent(); + + // Find final reduction computation and replace it with an + // extract.last.active intrinsic. 
+ auto *RdxResult = findUserOf(PhiR); + if (!RdxResult) + return false; + Builder.setInsertPoint(RdxResult); + auto *ExtractLastActive = + Builder.createNaryOp(VPInstruction::ExtractLastActive, + {DataSelect, MaskSelect, PhiR->getStartValue()}, + RdxResult->getDebugLoc()); + RdxResult->replaceAllUsesWith(ExtractLastActive); + RdxResult->eraseFromParent(); + } + + return true; +} + bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan) { for (auto &PhiR : make_early_inc_range( Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())) { @@ -1149,65 +1229,3 @@ bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan) { return true; } -void VPlanTransforms::convertFindLastRecurrences( - VPlan &Plan, VPRecipeBuilder &RecipeBuilder) { - if (Plan.hasScalarVFOnly()) - return; - - // We want to create the following nodes: - // vec.body: - // mask.phi = phi [ all.false, vec.ph ], [ new.mask, vec.body ] - // ...data.phi already exists, but needs updating... - // data.phi = phi [ default.val, vec.ph ], [ new.data, vec.body ] - // - // ...'data' and 'compare' created by existing nodes... 
- // - // any_active = i1 any_of_reduction(compare) - // new.mask = select any_active, compare, mask.phi - // new.data = select any_active, data, data.phi - // - // middle.block: - // result = extract-last-active new.data, new.mask, default.val - - for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - auto *PhiR = dyn_cast(&Phi); - if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind( - PhiR->getRecurrenceKind())) - continue; - - // Find the condition for the select - auto *SR = cast(&PhiR->getBackedgeRecipe()); - VPValue *Cond = SR->getCond(); - - // Add mask phi - VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); - VPValue *False = Plan.getOrAddLiveIn( - ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext())); - auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc()); - Builder.insert(MaskPHI); - - // Add select for mask - Builder.setInsertPoint(SR); - VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); - VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); - MaskPHI->addOperand(MaskSelect); - - // Replace select for data - VPValue *DataSelect = Builder.createSelect( - AnyOf, SR->getOperand(1), SR->getOperand(2), SR->getDebugLoc()); - SR->replaceAllUsesWith(DataSelect); - SR->eraseFromParent(); - - // Find final reduction computation and replace it with an - // extract.last.active intrinsic. 
- VPInstruction *RdxResult = findComputeReductionResult(PhiR); - assert(RdxResult && "Unable to find Reduction Result Recipe"); - Builder.setInsertPoint(RdxResult); - auto *ExtractLastActive = - Builder.createNaryOp(VPInstruction::ExtractLastActive, - {DataSelect, MaskSelect, PhiR->getStartValue()}, - RdxResult->getDebugLoc()); - RdxResult->replaceAllUsesWith(ExtractLastActive); - RdxResult->eraseFromParent(); - } -} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index d3b9d24eb5689..725e4ebefa8f4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -166,6 +166,14 @@ struct VPlanTransforms { /// this attempt was unsuccessful. static bool handleMaxMinNumReductions(VPlan &Plan); + /// Check if \p Plan contains any FindLast reductions. If it does, try to + /// update the vector loop to save the appropriate state using selects + /// for entire vectors for both the latest mask containing at least one active + /// element and the corresponding data vector. Return false if this attempt + /// was unsuccessful. + static bool handleFindLastReductions(VPlan &Plan, + VPRecipeBuilder &RecipeBuilder); + /// Clear NSW/NUW flags from reduction instructions if necessary. static void clearReductionWrapFlags(VPlan &Plan); @@ -402,12 +410,6 @@ struct VPlanTransforms { /// users in the original exit block using the VPIRInstruction wrapping to the /// LCSSA phi. static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range); - - /// Change FindLast reductions to save the appropriate state using selects - /// for entire vectors for both the latest mask containing at least one active - /// element and the corresponding data vector. 
- static void convertFindLastRecurrences(VPlan &Plan, - VPRecipeBuilder &RecipeBuilder); }; } // namespace llvm From 2b43298d7fbe23993080384a0f998473145b77bb Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 25 Nov 2025 14:00:43 +0000 Subject: [PATCH 18/24] Use Plan.getFalse() --- llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index ff23417f0af49..38fb6be908496 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -1042,9 +1042,7 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan, // Add mask phi VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); - VPValue *False = Plan.getOrAddLiveIn( - ConstantInt::getFalse(PhiR->getUnderlyingValue()->getContext())); - auto *MaskPHI = new VPWidenPHIRecipe(nullptr, False, DebugLoc()); + auto *MaskPHI = new VPWidenPHIRecipe(nullptr, Plan.getFalse()); Builder.insert(MaskPHI); // Add select for mask From 87b837b6850aef5a86fc04b4b27aee5b7a84dc8f Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 25 Nov 2025 14:43:14 +0000 Subject: [PATCH 19/24] Add a test case with extra arithmetic operations --- .../AArch64/conditional-scalar-assignment.ll | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll index e777659f003c8..564ecf1ca2230 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -464,3 +464,156 @@ loop: exit: ret i32 %select.data } + +;; Add more work to the loop besides the CSA to check cost modelling for NEON. 
+define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %a) { +; NEON-LABEL: define i32 @int_select_with_extra_arith_payload( +; NEON-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NEON: [[VECTOR_PH]]: +; NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +; NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; NEON-NEXT: br label %[[VECTOR_BODY:.*]] +; NEON: [[VECTOR_BODY]]: +; NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; NEON-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; NEON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; NEON-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[WIDE_LOAD]], splat (i32 13) +; NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; NEON-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[WIDE_LOAD1]], splat (i32 5) +; NEON-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; NEON-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; NEON-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; 
NEON-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] +; NEON-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; NEON-NEXT: [[TMP10]] = select i1 [[TMP9]], <4 x i1> [[TMP7]], <4 x i1> [[TMP0]] +; NEON-NEXT: [[TMP11]] = select i1 [[TMP9]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]] +; NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NEON-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NEON: [[MIDDLE_BLOCK]]: +; NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP11]], <4 x i1> [[TMP10]], i32 -1) +; NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NEON-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; NEON: [[SCALAR_PH]]: +; NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; NEON-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13 +; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4 +; NEON-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5 +; NEON-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]] +; NEON-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; NEON-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD_A]] +; NEON-NEXT: 
[[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] +; NEON-NEXT: ret i32 [[SELECT_A_LCSSA]] +; +; SVE-LABEL: define i32 @int_select_with_extra_arith_payload( +; SVE-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SVE: [[VECTOR_PH]]: +; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; SVE: [[VECTOR_BODY]]: +; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP4:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 +; SVE-NEXT: [[TMP6:%.*]] = mul [[WIDE_LOAD]], splat (i32 13) +; SVE-NEXT: [[TMP7:%.*]] = getelementptr 
inbounds i32, ptr [[B]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 4 +; SVE-NEXT: [[TMP8:%.*]] = mul [[WIDE_LOAD1]], splat (i32 5) +; SVE-NEXT: [[TMP9:%.*]] = add [[TMP6]], [[TMP8]] +; SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; SVE-NEXT: store [[TMP9]], ptr [[TMP10]], align 4 +; SVE-NEXT: [[TMP11:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; SVE-NEXT: [[TMP12:%.*]] = freeze [[TMP11]] +; SVE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP12]]) +; SVE-NEXT: [[TMP14]] = select i1 [[TMP13]], [[TMP11]], [[TMP4]] +; SVE-NEXT: [[TMP15]] = select i1 [[TMP13]], [[WIDE_LOAD]], [[VEC_PHI]] +; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; SVE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SVE-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SVE: [[MIDDLE_BLOCK]]: +; SVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP15]], [[TMP14]], i32 -1) +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; SVE: [[SCALAR_PH]]: +; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; SVE-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13 +; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr 
[[B_ADDR]], align 4 +; SVE-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5 +; SVE-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]] +; SVE-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; SVE-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD_A]] +; SVE-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ] +; SVE-NEXT: ret i32 [[SELECT_A_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %A.phi = phi i32 [ -1, %entry ], [ %select.A, %loop ] + %A.addr = getelementptr inbounds i32, ptr %A, i64 %iv + %ld.A = load i32, ptr %A.addr, align 4 + %mul.A = mul i32 %ld.A, 13 + %B.addr = getelementptr inbounds i32, ptr %B, i64 %iv + %ld.B = load i32, ptr %B.addr, align 4 + %mul.B = mul i32 %ld.B, 5 + %add = add i32 %mul.A, %mul.B + %C.addr = getelementptr inbounds i32, ptr %C, i64 %iv + store i32 %add, ptr %C.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld.A + %select.A = select i1 %select.cmp, i32 %ld.A, i32 %A.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.A +} From 570f0e326d3ac6bef66c0b45c4a68946c3425c13 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 26 Nov 2025 11:50:44 +0000 Subject: [PATCH 20/24] Adjust tests post-rebase --- .../AArch64/conditional-scalar-assignment.ll | 16 +++++----- .../conditional-scalar-assignment-vplan.ll | 2 +- .../LoopVectorize/iv-select-cmp-decreasing.ll | 32 +++++++++---------- .../iv-select-cmp-non-const-iv-start.ll | 8 ++--- 
.../LoopVectorize/iv-select-cmp-trunc.ll | 24 +++++++------- 5 files changed, 41 insertions(+), 41 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll index 564ecf1ca2230..73ba412c10249 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -466,16 +466,16 @@ exit: } ;; Add more work to the loop besides the CSA to check cost modelling for NEON. -define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %a) { +define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %threshold) { ; NEON-LABEL: define i32 @int_select_with_extra_arith_payload( -; NEON-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[A:%.*]]) { +; NEON-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) { ; NEON-NEXT: [[ENTRY:.*]]: ; NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 ; NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NEON: [[VECTOR_PH]]: ; NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[THRESHOLD]], i64 0 ; NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; NEON-NEXT: br label %[[VECTOR_BODY:.*]] ; NEON: [[VECTOR_BODY]]: @@ -519,7 +519,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea ; NEON-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], 
[[MUL_B]] ; NEON-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] ; NEON-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4 -; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD_A]] +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]] ; NEON-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]] ; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] @@ -529,7 +529,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea ; NEON-NEXT: ret i32 [[SELECT_A_LCSSA]] ; ; SVE-LABEL: define i32 @int_select_with_extra_arith_payload( -; SVE-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] { ; SVE-NEXT: [[ENTRY:.*]]: ; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 @@ -540,7 +540,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea ; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 ; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[THRESHOLD]], i64 0 ; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SVE-NEXT: br label %[[VECTOR_BODY:.*]] ; SVE: [[VECTOR_BODY]]: @@ -584,7 +584,7 @@ define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr rea ; SVE-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]] ; SVE-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] ; SVE-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4 -; SVE-NEXT: 
[[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD_A]] +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]] ; SVE-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]] ; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] @@ -608,7 +608,7 @@ loop: %add = add i32 %mul.A, %mul.B %C.addr = getelementptr inbounds i32, ptr %C, i64 %iv store i32 %add, ptr %C.addr, align 4 - %select.cmp = icmp slt i32 %a, %ld.A + %select.cmp = icmp slt i32 %threshold, %ld.A %select.A = select i1 %select.cmp, i32 %ld.A, i32 %A.phi %iv.next = add nuw nsw i64 %iv, 1 %exit.cmp = icmp eq i64 %iv.next, %N diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll index 64dc57f7ec492..788b35e88734e 100644 --- a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ -49,7 +49,7 @@ exit: ; CHECK-NEXT: WIDEN-PHI vp<[[MASKPHI:%.*]]> = phi [ ir, vector.ph ], [ vp<[[MASKSELECT:%.*]]>, vector.body ] ; CHECK-NEXT: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: CLONE ir<[[LDADDR:%.*]]> = getelementptr inbounds ir<%data>, vp<[[STEPS:%.*]]> -; CHECK-NEXT: vp<[[VPTR:%.*]]> = vector-pointer ir<[[LDADDR]]> +; CHECK-NEXT: vp<[[VPTR:%.*]]> = vector-pointer inbounds ir<[[LDADDR]]> ; CHECK-NEXT: WIDEN ir<[[LD:%.*]]> = load vp<[[VPTR]]> ; CHECK-NEXT: WIDEN ir<[[SELECTCMP:%.*]]> = icmp slt ir<%a>, ir<[[LD]]> ; CHECK-NEXT: EMIT vp<[[ANYOF:%.*]]> = any-of ir<[[SELECTCMP]]> diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index cdd76957afcf1..890b6ccba0796 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ 
-982,13 +982,13 @@ define i64 @select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 ; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) ; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 ; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] -; IC1VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3 +; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3 ; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> ; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3 +; IC1VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 -3 ; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 ; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> ; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] @@ -1038,13 +1038,13 @@ define i64 @select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 ; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) ; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 ; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] -; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 -3 +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr 
[[TMP2]], i64 0 +; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3 ; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 ; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> ; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -3 +; IC4VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 -3 ; IC4VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 ; IC4VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> ; IC4VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] @@ -1145,13 +1145,13 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 ; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) ; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 ; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3 +; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3 ; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 ; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> ; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3 +; IC1VF4-NEXT: 
[[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0 +; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 -3 ; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 ; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> ; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] @@ -1214,13 +1214,13 @@ define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 ; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) ; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 ; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] -; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 -; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 -3 +; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3 ; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 ; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> ; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] -; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 -3 +; IC4VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0 +; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 -3 ; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 ; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> ; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll 
b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll index 115956b977cee..88bb91efa0410 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll @@ -192,7 +192,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], ; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC1: [[VECTOR_BODY]]: ; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -208,7 +208,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, ; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -257,7 +257,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, ; CHECK-VF4IC4-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i32> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], ; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC4: [[VECTOR_BODY]]: ; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -273,7 +273,7 @@ define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, ; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index ad5189c35cfd7..839ea7ce7e7a4 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -709,7 +709,7 @@ define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] ; 
CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -763,7 +763,7 @@ define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { ; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -874,7 +874,7 @@ define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { ; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -929,7 +929,7 @@ define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { ; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1037,7 +1037,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { ; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1066,7 +1066,7 @@ define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { ; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-VF4IC4-NEXT: 
[[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 ; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1143,7 +1143,7 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1199,7 +1199,7 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1296,7 +1296,7 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; 
CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 ; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1338,7 +1338,7 @@ define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 ; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: @@ -1433,7 +1433,7 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] ; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: @@ -1495,7 +1495,7 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] ; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] ; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: From 5df3db688bad12fd0da4559ffdb63066ca17bf9e Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Wed, 26 Nov 2025 15:47:38 +0000 Subject: [PATCH 21/24] Unify Find recurrence detection. --- llvm/include/llvm/Analysis/IVDescriptors.h | 9 ++- llvm/lib/Analysis/IVDescriptors.cpp | 61 ++++++------------- llvm/lib/Transforms/Utils/LoopUnroll.cpp | 8 +-- llvm/lib/Transforms/Utils/LoopUtils.cpp | 2 +- .../Transforms/Vectorize/LoopVectorize.cpp | 10 +-- 5 files changed, 35 insertions(+), 55 deletions(-) diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index f9376c1c2a06b..05c17632e0e49 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -186,9 +186,8 @@ class RecurrenceDescriptor { /// where one of (X, Y) is an increasing (FindLastIV) or decreasing /// (FindFirstIV) loop induction variable, or an arbitrary integer value /// (FindLast), and the other is a PHI value. 
- LLVM_ABI static InstDesc isFindPattern(RecurKind Kind, Loop *TheLoop, - PHINode *OrigPhi, Instruction *I, - ScalarEvolution &SE); + LLVM_ABI static InstDesc isFindPattern(Loop *TheLoop, PHINode *OrigPhi, + Instruction *I, ScalarEvolution &SE); /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. @@ -319,6 +318,10 @@ class RecurrenceDescriptor { return Kind == RecurKind::FindLast; } + static bool isFindRecurrenceKind(RecurKind Kind) { + return isFindLastRecurrenceKind(Kind) || isFindIVRecurrenceKind(Kind); + } + /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. Type *getRecurrenceType() const { return RecurrenceType; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 71890f876ba48..77ae3382a8ea4 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -758,9 +758,8 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // will either be the initial value (0) if the condition was never met, or the // value of a[i] in the most recent loop iteration where the condition was met. RecurrenceDescriptor::InstDesc -RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, - PHINode *OrigPhi, Instruction *I, - ScalarEvolution &SE) { +RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi, + Instruction *I, ScalarEvolution &SE) { // TODO: Support the vectorization of FindLastIV when the reduction phi is // used by more than one select instruction. This vectorization is only // performed when the SCEV of each increasing induction variable used by the @@ -781,17 +780,6 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, m_Value(NonRdxPhi))))) return InstDesc(false, I); - if (isFindLastRecurrenceKind(Kind)) { - // Must be an integer scalar. 
- Type *Type = OrigPhi->getType(); - if (!Type->isIntegerTy()) - return InstDesc(false, I); - - // FIXME: Support more complex patterns, including multiple selects. - // The Select must be used only outside the loop and by the PHI. - return InstDesc(I, RecurKind::FindLast); - } - // Returns either FindFirstIV/FindLastIV, if such a pattern is found, or // std::nullopt. auto GetRecurKind = [&](Value *V) -> std::optional { @@ -805,8 +793,9 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, m_SpecificLoop(TheLoop)))) return std::nullopt; - if ((isFindFirstIVRecurrenceKind(Kind) && !SE.isKnownNegative(Step)) || - (isFindLastIVRecurrenceKind(Kind) && !SE.isKnownPositive(Step))) + // We must have a known positive or negative step for FindIV + const bool PositiveStep = SE.isKnownPositive(Step); + if (!PositiveStep && !SE.isKnownNegative(Step)) return std::nullopt; // Check if the minimum (FindLast) or maximum (FindFirst) value of the @@ -822,7 +811,7 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, IsSigned ? SE.getSignedRange(AR) : SE.getUnsignedRange(AR); unsigned NumBits = Ty->getIntegerBitWidth(); ConstantRange ValidRange = ConstantRange::getEmpty(NumBits); - if (isFindLastIVRecurrenceKind(Kind)) { + if (PositiveStep) { APInt Sentinel = IsSigned ? APInt::getSignedMinValue(NumBits) : APInt::getMinValue(NumBits); ValidRange = ConstantRange::getNonEmpty(Sentinel + 1, Sentinel); @@ -836,26 +825,22 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, APInt::getMinValue(NumBits), APInt::getMaxValue(NumBits) - 1); } - LLVM_DEBUG(dbgs() << "LV: " - << (isFindLastIVRecurrenceKind(Kind) ? "FindLastIV" - : "FindFirstIV") - << " valid range is " << ValidRange - << ", and the range of " << *AR << " is " << IVRange - << "\n"); + LLVM_DEBUG( + dbgs() << "LV: " << (PositiveStep ? 
"FindLastIV" : "FindFirstIV") + << " valid range is " << ValidRange << ", and the range of " + << *AR << " is " << IVRange << "\n"); // Ensure the induction variable does not wrap around by verifying that // its range is fully contained within the valid range. return ValidRange.contains(IVRange); }; - if (isFindLastIVRecurrenceKind(Kind)) { + if (PositiveStep) { if (CheckRange(true)) return RecurKind::FindLastIVSMax; if (CheckRange(false)) return RecurKind::FindLastIVUMax; return std::nullopt; } - assert(isFindFirstIVRecurrenceKind(Kind) && - "Kind must either be a FindLastIV or FindFirstIV"); if (CheckRange(true)) return RecurKind::FindFirstIVSMin; @@ -867,7 +852,8 @@ RecurrenceDescriptor::isFindPattern(RecurKind Kind, Loop *TheLoop, if (auto RK = GetRecurKind(NonRdxPhi)) return InstDesc(I, *RK); - return InstDesc(false, I); + // If the recurrence is not specific to an IV, return a generic FindLast. + return InstDesc(I, RecurKind::FindLast); } RecurrenceDescriptor::InstDesc @@ -1001,8 +987,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( Kind == RecurKind::Add || Kind == RecurKind::Mul || Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs) return isConditionalRdxPattern(I); - if ((isFindIVRecurrenceKind(Kind) || isFindLastRecurrenceKind(Kind)) && SE) - return isFindPattern(Kind, L, OrigPhi, I, *SE); + if (isFindRecurrenceKind(Kind) && SE) + return isFindPattern(L, OrigPhi, I, *SE); [[fallthrough]]; case Instruction::FCmp: case Instruction::ICmp: @@ -1142,14 +1128,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FindLastIVSMax, TheLoop, FMF, RedDes, DB, - AC, DT, SE)) { - LLVM_DEBUG(dbgs() << "Found a FindLastIV reduction PHI." << *Phi << "\n"); - return true; - } - if (AddReductionVar(Phi, RecurKind::FindFirstIVSMin, TheLoop, FMF, RedDes, DB, - AC, DT, SE)) { - LLVM_DEBUG(dbgs() << "Found a FindFirstIV reduction PHI." 
<< *Phi << "\n"); + if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, + DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a Find reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT, @@ -1199,11 +1180,6 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, - DT, SE)) { - LLVM_DEBUG(dbgs() << "Found a FindLast reduction PHI." << *Phi << "\n"); - return true; - } // Not a reduction of known type. return false; } @@ -1329,7 +1305,6 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::FMinimumNum: return Instruction::FCmp; case RecurKind::FindLast: - return Instruction::Select; case RecurKind::AnyOf: case RecurKind::FindFirstIVSMin: case RecurKind::FindFirstIVUMin: diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index d3285e32c5dee..f74271ac2d3c7 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -1258,11 +1258,11 @@ llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, return std::nullopt; RecurKind RK = RdxDesc.getRecurrenceKind(); // Skip unsupported reductions. - // TODO: Handle additional reductions, including min-max reductions. + // TODO: Handle additional reductions, including FP and min-max + // reductions. 
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || - RecurrenceDescriptor::isFindIVRecurrenceKind(RK) || - RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) || - RecurrenceDescriptor::isFindLastRecurrenceKind(RK)) + RecurrenceDescriptor::isFindRecurrenceKind(RK) || + RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) return std::nullopt; if (RdxDesc.hasExactFPMath()) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8e2a4f80fce16..50c78c5d22d3c 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1491,7 +1491,7 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, RecurKind Kind, Value *Mask, Value *EVL) { assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && + !RecurrenceDescriptor::isFindRecurrenceKind(Kind) && "AnyOf and FindIV reductions are not supported."); Intrinsic::ID Id = getReductionIntrinsicID(Kind); auto VPID = VPIntrinsic::getForIntrinsic(Id); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 970270751f5f4..9c6b67a7bcadc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4617,10 +4617,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, IsaPred); // FIXME: implement interleaving for FindLast transform correctly. 
- for (auto &[_, RdxDesc] : Legal->getReductionVars()) - if (RecurrenceDescriptor::isFindLastRecurrenceKind( - RdxDesc.getRecurrenceKind())) - return 1; + if (any_of(make_second_range(Legal->getReductionVars()), + [](const RecurrenceDescriptor &RdxDesc) { + return RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind()); + })) + return 1; // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. From 962a23c5361bff51db0ba48fc6cf45b8577fb7c4 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 2 Dec 2025 17:03:27 +0000 Subject: [PATCH 22/24] Clean up any_of with values() iterator --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9c6b67a7bcadc..f8c470c890ab9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -10168,7 +10168,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { IC = UserIC > 0 ? UserIC : IC; // FIXME: Enable interleaving for last_active reductions. 
- if (any_of(make_second_range(LVL.getReductionVars()), [&](auto &RdxDesc) { + if (any_of(LVL.getReductionVars().values(), [](auto &RdxDesc) { return RecurrenceDescriptor::isFindLastRecurrenceKind( RdxDesc.getRecurrenceKind()); })) { From 12a7cd880176cd21a9368494a6b1430a5dd9e0c4 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Tue, 2 Dec 2025 17:30:05 +0000 Subject: [PATCH 23/24] Formatting --- llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 38fb6be908496..56b1874f7f00a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -1226,4 +1226,3 @@ bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan) { } return true; } - From c7b50ac8adc2fce4b09abd34fd4728dcbaf136d2 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 4 Dec 2025 12:02:18 +0000 Subject: [PATCH 24/24] Cleanups --- llvm/lib/Analysis/IVDescriptors.cpp | 10 +++++----- llvm/lib/Transforms/Vectorize/VPlan.h | 7 +++++-- .../Vectorize/VPlanConstruction.cpp | 20 +++++-------------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +++----- 4 files changed, 18 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 77ae3382a8ea4..77f049cb96653 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -726,8 +726,8 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // or like this: // int r = 0; // for (int i = 0; i < n; i++) { -// if (a[i] > 3) -// r = a[i]; +// if (src[i] > 3) +// r = src[i]; // } // The reduction value (r) is derived from either the values of an induction // variable (i) sequence, an arbitrary value (a[i]), or from the start value @@ -782,7 +782,7 @@ RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi, // Returns 
either FindFirstIV/FindLastIV, if such a pattern is found, or // std::nullopt. - auto GetRecurKind = [&](Value *V) -> std::optional { + auto GetFindFirstLastIVRecurKind = [&](Value *V) -> std::optional { Type *Ty = V->getType(); if (!SE.isSCEVable(Ty)) return std::nullopt; @@ -795,7 +795,7 @@ RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi, // We must have a known positive or negative step for FindIV const bool PositiveStep = SE.isKnownPositive(Step); - if (!PositiveStep && !SE.isKnownNegative(Step)) + if (!SE.isKnownNonZero(Step)) return std::nullopt; // Check if the minimum (FindLast) or maximum (FindFirst) value of the @@ -849,7 +849,7 @@ RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi, return std::nullopt; }; - if (auto RK = GetRecurKind(NonRdxPhi)) + if (auto RK = GetFindFirstLastIVRecurKind(NonRdxPhi)) return InstDesc(I, *RK); // If the recurrence is not specific to an IV, return a generic FindLast. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 7e8d33b448331..2a793af0d4887 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1125,11 +1125,14 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, /// Explicit user for the resume phi of the canonical induction in the main /// VPlan, used by the epilogue vector loop. ResumeForEpilogue, + + /// Extracts the last active lane based on a predicate vector operand, or + /// returns the default if no lanes were active. + ExtractLastActive, + /// Returns the value for vscale. VScale, OpsEnd = VScale, - /// Extracts the last active lane based on a predicate vector operand. - ExtractLastActive, }; /// Returns true if this VPInstruction generates scalar values for all lanes. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 56b1874f7f00a..336b144e30fe6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -1026,23 +1026,14 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan, // Find the condition for the select auto *SelectR = cast(&PhiR->getBackedgeRecipe()); - VPValue *Cond = nullptr; - if (auto *WidenR = dyn_cast(SelectR)) - Cond = WidenR->getCond(); - else if (auto *RepR = dyn_cast(SelectR)) { - auto *SI = dyn_cast(RepR->getUnderlyingInstr()); - if (!SI) - return false; - auto *CmpI = dyn_cast(SI->getCondition()); - if (!CmpI) - return false; - Cond = RecipeBuilder.getRecipe(CmpI)->getVPSingleValue(); - } else + VPValue *Cond = nullptr, *Op1 = nullptr, *Op2 = nullptr; + if (!match(SelectR, + m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2)))) return false; // Add mask phi VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); - auto *MaskPHI = new VPWidenPHIRecipe(nullptr, Plan.getFalse()); + auto *MaskPHI = new VPWidenPHIRecipe(nullptr, /*Start=*/Plan.getFalse()); Builder.insert(MaskPHI); // Add select for mask @@ -1053,8 +1044,7 @@ bool VPlanTransforms::handleFindLastReductions(VPlan &Plan, // Replace select for data VPValue *DataSelect = - Builder.createSelect(AnyOf, SelectR->getOperand(1), - SelectR->getOperand(2), SelectR->getDebugLoc()); + Builder.createSelect(AnyOf, Op1, Op2, SelectR->getDebugLoc()); SelectR->replaceAllUsesWith(DataSelect); SelectR->eraseFromParent(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 03c0951f338d5..1865e55117a1f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -921,11 +921,9 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Mask = State.get(getOperand(1)); Value *Default = 
State.get(getOperand(2), /*IsScalar=*/true); Type *VTy = Data->getType(); - - Module *M = State.Builder.GetInsertBlock()->getModule(); - Function *ExtractLast = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::experimental_vector_extract_last_active, {VTy}); - return Builder.CreateCall(ExtractLast, {Data, Mask, Default}); + return Builder.CreateIntrinsic( + Intrinsic::experimental_vector_extract_last_active, {VTy}, + {Data, Mask, Default}); } default: llvm_unreachable("Unsupported opcode for instruction");