diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index fc141ed6d96fe..05c17632e0e49 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -70,6 +70,9 @@ enum class RecurKind { FindLastIVUMax, ///< FindLast reduction with select(cmp(),x,y) where one of ///< (x,y) is increasing loop induction, and both x and y ///< are integer type, producing a UMax reduction. + FindLast, ///< FindLast reduction with select(cmp(),x,y) where x and y + ///< are an integer type, one is the current recurrence value, + ///< and the other is an arbitrary value. // clang-format on // TODO: Any_of and FindLast reduction need not be restricted to integer type // only. @@ -180,13 +183,11 @@ class RecurrenceDescriptor { /// Returns a struct describing whether the instruction is either a /// Select(ICmp(A, B), X, Y), or /// Select(FCmp(A, B), X, Y) - /// where one of (X, Y) is an increasing (FindLast) or decreasing (FindFirst) - /// loop induction variable, and the other is a PHI value. - // TODO: Support non-monotonic variable. FindLast does not need be restricted - // to increasing loop induction variables. - LLVM_ABI static InstDesc isFindIVPattern(RecurKind Kind, Loop *TheLoop, - PHINode *OrigPhi, Instruction *I, - ScalarEvolution &SE); + /// where one of (X, Y) is an increasing (FindLastIV) or decreasing + /// (FindFirstIV) loop induction variable, or an arbitrary integer value + /// (FindLast), and the other is a PHI value. + LLVM_ABI static InstDesc isFindPattern(Loop *TheLoop, PHINode *OrigPhi, + Instruction *I, ScalarEvolution &SE); /// Returns a struct describing if the instruction is a /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. @@ -310,6 +311,17 @@ class RecurrenceDescriptor { isFindLastIVRecurrenceKind(Kind); } + /// Returns true if the recurrence kind is of the form + /// select(cmp(),x,y) where one of (x,y) is an arbitrary value and the + /// other is a recurrence. + static bool isFindLastRecurrenceKind(RecurKind Kind) { + return Kind == RecurKind::FindLast; + } + + static bool isFindRecurrenceKind(RecurKind Kind) { + return isFindLastRecurrenceKind(Kind) || isFindIVRecurrenceKind(Kind); + } + /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. Type *getRecurrenceType() const { return RecurrenceType; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 7624e0ed6f2b0..7c2154cf268c1 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -58,6 +58,8 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + // TODO: Make type-agnostic. + case RecurKind::FindLast: return true; } return false; @@ -721,9 +723,15 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // if (src[i] > 3) // r = i; // } +// or like this: +// int r = 0; +// for (int i = 0; i < n; i++) { +// if (src[i] > 3) +// r = ; +// } // The reduction value (r) is derived from either the values of an induction -// variable (i) sequence, or from the start value (0). The LLVM IR generated for -// such loops would be as follows: +// variable (i) sequence, an arbitrary loop-varying value, or from the start +// value (0). The LLVM IR generated for such loops would be as follows: // for.body: // %r = phi i32 [ %spec.select, %for.body ], [ 0, %entry ] // %i = phi i32 [ %inc, %for.body ], [ 0, %entry ] @@ -732,23 +740,27 @@ RecurrenceDescriptor::isAnyOfPattern(Loop *Loop, PHINode *OrigPhi, // %spec.select = select i1 %cmp, i32 %i, i32 %r // %inc = add nsw i32 %i, 1 // ... -// Since 'i' is an induction variable, the reduction value after the loop will -// be the maximum (increasing induction) or minimum (decreasing induction) value -// of 'i' that the condition (src[i] > 3) is satisfied, or the start value (0 in -// the example above). When the start value of the induction variable 'i' is -// greater than the minimum (increasing induction) or maximum (decreasing -// induction) value of the data type, we can use the minimum (increasing -// induction) or maximum (decreasing induction) value of the data type as a -// sentinel value to replace the start value. This allows us to perform a single -// reduction max (increasing induction) or min (decreasing induction) operation -// to obtain the final reduction result. +// When searching for an induction variable (i), the reduction value after the +// loop will be the maximum (increasing induction) or minimum (decreasing +// induction) value of 'i' that the condition (src[i] > 3) is satisfied, or the +// start value (0 in the example above). When the start value of the induction +// variable 'i' is greater than the minimum (increasing induction) or maximum +// (decreasing induction) value of the data type, we can use the minimum +// (increasing induction) or maximum (decreasing induction) value of the data +// type as a sentinel value to replace the start value. This allows us to +// perform a single reduction max (increasing induction) or min (decreasing +// induction) operation to obtain the final reduction result. // TODO: It is possible to solve the case where the start value is the minimum // value of the data type or a non-constant value by using mask and multiple // reduction operations. +// +// When searching for an arbitrary loop-varying value, the reduction value will +// either be the initial value (0) if the condition was never met, or the value +// of the loop-varying value in the most recent loop iteration where the +// condition was met. RecurrenceDescriptor::InstDesc -RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, - PHINode *OrigPhi, Instruction *I, - ScalarEvolution &SE) { +RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi, + Instruction *I, ScalarEvolution &SE) { // TODO: Support the vectorization of FindLastIV when the reduction phi is // used by more than one select instruction. This vectorization is only // performed when the SCEV of each increasing induction variable used by the @@ -757,8 +769,10 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, return InstDesc(false, I); // We are looking for selects of the form: - // select(cmp(), phi, loop_induction) or - // select(cmp(), loop_induction, phi) + // select(cmp(), phi, value) or + // select(cmp(), value, phi) + // where 'value' must be a loop induction variable + // (for FindFirstIV/FindLastIV) or an arbitrary value (for FindLast). // TODO: Match selects with multi-use cmp conditions. Value *NonRdxPhi = nullptr; if (!match(I, m_CombineOr(m_Select(m_OneUse(m_Cmp()), m_Value(NonRdxPhi), @@ -769,7 +783,7 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, // Returns either FindFirstIV/FindLastIV, if such a pattern is found, or // std::nullopt. - auto GetRecurKind = [&](Value *V) -> std::optional { + auto GetFindFirstLastIVRecurKind = [&](Value *V) -> std::optional { Type *Ty = V->getType(); if (!SE.isSCEVable(Ty)) return std::nullopt; @@ -780,8 +794,9 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, m_SpecificLoop(TheLoop)))) return std::nullopt; - if ((isFindFirstIVRecurrenceKind(Kind) && !SE.isKnownNegative(Step)) || - (isFindLastIVRecurrenceKind(Kind) && !SE.isKnownPositive(Step))) + // We must have a known positive or negative step for FindIV + const bool PositiveStep = SE.isKnownPositive(Step); + if (!SE.isKnownNonZero(Step)) return std::nullopt; // Check if the minimum (FindLast) or maximum (FindFirst) value of the @@ -797,7 +812,7 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, IsSigned ? SE.getSignedRange(AR) : SE.getUnsignedRange(AR); unsigned NumBits = Ty->getIntegerBitWidth(); ConstantRange ValidRange = ConstantRange::getEmpty(NumBits); - if (isFindLastIVRecurrenceKind(Kind)) { + if (PositiveStep) { APInt Sentinel = IsSigned ? APInt::getSignedMinValue(NumBits) : APInt::getMinValue(NumBits); ValidRange = ConstantRange::getNonEmpty(Sentinel + 1, Sentinel); @@ -811,26 +826,22 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, APInt::getMinValue(NumBits), APInt::getMaxValue(NumBits) - 1); } - LLVM_DEBUG(dbgs() << "LV: " - << (isFindLastIVRecurrenceKind(Kind) ? "FindLastIV" - : "FindFirstIV") - << " valid range is " << ValidRange - << ", and the range of " << *AR << " is " << IVRange - << "\n"); + LLVM_DEBUG( + dbgs() << "LV: " << (PositiveStep ? "FindLastIV" : "FindFirstIV") + << " valid range is " << ValidRange << ", and the range of " + << *AR << " is " << IVRange << "\n"); // Ensure the induction variable does not wrap around by verifying that // its range is fully contained within the valid range. return ValidRange.contains(IVRange); }; - if (isFindLastIVRecurrenceKind(Kind)) { + if (PositiveStep) { if (CheckRange(true)) return RecurKind::FindLastIVSMax; if (CheckRange(false)) return RecurKind::FindLastIVUMax; return std::nullopt; } - assert(isFindFirstIVRecurrenceKind(Kind) && - "Kind must either be a FindLastIV or FindFirstIV"); if (CheckRange(true)) return RecurKind::FindFirstIVSMin; @@ -839,10 +850,11 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop, return std::nullopt; }; - if (auto RK = GetRecurKind(NonRdxPhi)) + if (auto RK = GetFindFirstLastIVRecurKind(NonRdxPhi)) return InstDesc(I, *RK); - return InstDesc(false, I); + // If the recurrence is not specific to an IV, return a generic FindLast. + return InstDesc(I, RecurKind::FindLast); } RecurrenceDescriptor::InstDesc @@ -976,8 +988,8 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( Kind == RecurKind::Add || Kind == RecurKind::Mul || Kind == RecurKind::Sub || Kind == RecurKind::AddChainWithSubs) return isConditionalRdxPattern(I); - if (isFindIVRecurrenceKind(Kind) && SE) - return isFindIVPattern(Kind, L, OrigPhi, I, *SE); + if (isFindRecurrenceKind(Kind) && SE) + return isFindPattern(L, OrigPhi, I, *SE); [[fallthrough]]; case Instruction::FCmp: case Instruction::ICmp: @@ -1117,14 +1129,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << "\n"); return true; } - if (AddReductionVar(Phi, RecurKind::FindLastIVSMax, TheLoop, FMF, RedDes, DB, - AC, DT, SE)) { - LLVM_DEBUG(dbgs() << "Found a FindLastIV reduction PHI." << *Phi << "\n"); - return true; - } - if (AddReductionVar(Phi, RecurKind::FindFirstIVSMin, TheLoop, FMF, RedDes, DB, - AC, DT, SE)) { - LLVM_DEBUG(dbgs() << "Found a FindFirstIV reduction PHI." << *Phi << "\n"); + if (AddReductionVar(Phi, RecurKind::FindLast, TheLoop, FMF, RedDes, DB, AC, + DT, SE)) { + LLVM_DEBUG(dbgs() << "Found a Find reduction PHI." << *Phi << "\n"); return true; } if (AddReductionVar(Phi, RecurKind::FMul, TheLoop, FMF, RedDes, DB, AC, DT, @@ -1174,7 +1181,6 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, << "\n"); return true; } - // Not a reduction of known type. return false; } @@ -1299,6 +1305,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) { case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: return Instruction::FCmp; + case RecurKind::FindLast: case RecurKind::AnyOf: case RecurKind::FindFirstIVSMin: case RecurKind::FindFirstIVUMin: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 3a5f1499f9d2d..c704c434176be 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5488,6 +5488,7 @@ bool AArch64TTIImpl::isLegalToVectorizeReduction( case RecurKind::FMax: case RecurKind::FMulAdd: case RecurKind::AnyOf: + case RecurKind::FindLast: return true; default: return false; diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 0f256398e5b1e..f74271ac2d3c7 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -1258,9 +1258,10 @@ llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, return std::nullopt; RecurKind RK = RdxDesc.getRecurrenceKind(); // Skip unsupported reductions. - // TODO: Handle additional reductions, including min-max reductions. + // TODO: Handle additional reductions, including FP and min-max + // reductions. if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || - RecurrenceDescriptor::isFindIVRecurrenceKind(RK) || + RecurrenceDescriptor::isFindRecurrenceKind(RK) || RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) return std::nullopt; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8e2a4f80fce16..50c78c5d22d3c 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1491,7 +1491,7 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, RecurKind Kind, Value *Mask, Value *EVL) { assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && + !RecurrenceDescriptor::isFindRecurrenceKind(Kind) && "AnyOf and FindIV reductions are not supported."); Intrinsic::ID Id = getReductionIntrinsicID(Kind); auto VPID = VPIntrinsic::getForIntrinsic(Id); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9b727a7998392..d425b61301cf7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4302,11 +4302,15 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum // reductions need special handling and are currently unsupported. + // FindLast reductions also require special handling for the synthesized + // mask PHI. if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { if (!Legal->isReductionVariable(&Phi)) return Legal->isFixedOrderRecurrence(&Phi); - return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind( - Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind()); + RecurKind Kind = + Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); + return RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) || + RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind); })) return false; @@ -4612,6 +4616,14 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), IsaPred); + // FIXME: implement interleaving for FindLast transform correctly. + if (any_of(make_second_range(Legal->getReductionVars()), + [](const RecurrenceDescriptor &RdxDesc) { + return RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind()); + })) + return 1; + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { @@ -8586,6 +8598,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( *Plan)) return nullptr; + // Create whole-vector selects for find-last recurrences. + if (!VPlanTransforms::runPass(VPlanTransforms::handleFindLastReductions, + *Plan, RecipeBuilder)) + return nullptr; + // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction @@ -8707,10 +8724,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( continue; RecurKind Kind = PhiR->getRecurrenceKind(); - assert( - !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && - !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && - "AnyOf and FindIV reductions are not allowed for in-loop reductions"); + assert(!RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) && + !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && + !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && + "AnyOf, FindIV, and FindLast reductions are not allowed for in-loop " + "reductions"); bool IsFPRecurrence = RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind); @@ -9017,7 +9035,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( RecurKind RK = RdxDesc.getRecurrenceKind(); if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) && !RecurrenceDescriptor::isFindIVRecurrenceKind(RK) && - !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) { + !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) && + !RecurrenceDescriptor::isFindLastRecurrenceKind(RK))) { VPBuilder PHBuilder(Plan->getVectorPreheader()); VPValue *Iden = Plan->getOrAddLiveIn( getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags())); @@ -10148,6 +10167,20 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Override IC if user provided an interleave count. IC = UserIC > 0 ? UserIC : IC; + // FIXME: Enable interleaving for FindLast reductions. + if (any_of(LVL.getReductionVars().values(), [](auto &RdxDesc) { + return RecurrenceDescriptor::isFindLastRecurrenceKind( + RdxDesc.getRecurrenceKind()); + })) { + LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " + << "to FindLast reduction.\n"); + IntDiagMsg = {"FindLastPreventsScalarInterleaving", + "Unable to interleave without vectorization due to FindLast " + "reduction."}; + InterleaveLoop = false; + IC = 1; + } + // Emit diagnostic messages, if any. const char *VAPassName = Hints.vectorizeAnalysisPassName(); if (!VectorizeLoop && !InterleaveLoop) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0eb8ad8d3c93d..ccdb0be05f6d4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -25401,6 +25401,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: @@ -25542,6 +25543,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: @@ -25648,6 +25650,7 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FindLast: case RecurKind::FMaxNum: case RecurKind::FMinNum: case RecurKind::FMaximumNum: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6ca750fc53279..0a354bfe79996 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1125,6 +1125,12 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, /// Explicit user for the resume phi of the canonical induction in the main /// VPlan, used by the epilogue vector loop. ResumeForEpilogue, + + /// Extracts the lane from the first operand corresponding to the last + /// active (non-zero) lane in the mask (second operand), or if no lanes + /// were active in the mask, returns the default value (third operand). + ExtractLastActive, + /// Returns the value for vscale. VScale, OpsEnd = VScale, @@ -2357,6 +2363,10 @@ class LLVM_ABI_FOR_TEST VPWidenPHIRecipe : public VPSingleDefRecipe, /// Generate the phi/select nodes. void execute(VPTransformState &State) override; + /// Return the cost of this VPWidenPHIRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + protected: #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index ea38a8b16ebc7..a135f827e4abf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -121,7 +121,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return Type::getIntNTy(Ctx, 64); case VPInstruction::ExtractLastElement: case VPInstruction::ExtractLastLanePerPart: - case VPInstruction::ExtractPenultimateElement: { + case VPInstruction::ExtractPenultimateElement: + case VPInstruction::ExtractLastActive: { Type *BaseTy = inferScalarType(R->getOperand(0)); if (auto *VecTy = dyn_cast(BaseTy)) return VecTy->getElementType(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 329b62cee4fce..336b144e30fe6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "LoopVectorizationPlanner.h" +#include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" @@ -997,6 +998,73 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) { return true; } +bool VPlanTransforms::handleFindLastReductions(VPlan &Plan, + VPRecipeBuilder &RecipeBuilder) { + if (Plan.hasScalarVFOnly()) + return false; + + // We want to create the following nodes: + // vec.body: + // mask.phi = phi [ all.false, vec.ph ], [ new.mask, vec.body ] + // ...data.phi already exists, but needs updating... + // data.phi = phi [ default.val, vec.ph ], [ new.data, vec.body ] + // + // ...'data' and 'compare' created by existing nodes... + // + // any_active = i1 any_of_reduction(compare) + // new.mask = select any_active, compare, mask.phi + // new.data = select any_active, data, data.phi + // + // middle.block: + // result = extract-last-active new.data, new.mask, default.val + + for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + auto *PhiR = dyn_cast(&Phi); + if (!PhiR || !RecurrenceDescriptor::isFindLastRecurrenceKind( + PhiR->getRecurrenceKind())) + continue; + + // Find the condition for the select + auto *SelectR = cast(&PhiR->getBackedgeRecipe()); + VPValue *Cond = nullptr, *Op1 = nullptr, *Op2 = nullptr; + if (!match(SelectR, + m_Select(m_VPValue(Cond), m_VPValue(Op1), m_VPValue(Op2)))) + return false; + + // Add mask phi + VPBuilder Builder = VPBuilder::getToInsertAfter(PhiR); + auto *MaskPHI = new VPWidenPHIRecipe(nullptr, /*Start=*/Plan.getFalse()); + Builder.insert(MaskPHI); + + // Add select for mask + Builder.setInsertPoint(SelectR); + VPValue *AnyOf = Builder.createNaryOp(VPInstruction::AnyOf, {Cond}); + VPValue *MaskSelect = Builder.createSelect(AnyOf, Cond, MaskPHI); + MaskPHI->addOperand(MaskSelect); + + // Replace select for data + VPValue *DataSelect = + Builder.createSelect(AnyOf, Op1, Op2, SelectR->getDebugLoc()); + SelectR->replaceAllUsesWith(DataSelect); + SelectR->eraseFromParent(); + + // Find final reduction computation and replace it with an + // extract.last.active intrinsic. + auto *RdxResult = findUserOf(PhiR); + if (!RdxResult) + return false; + Builder.setInsertPoint(RdxResult); + auto *ExtractLastActive = + Builder.createNaryOp(VPInstruction::ExtractLastActive, + {DataSelect, MaskSelect, PhiR->getStartValue()}, + RdxResult->getDebugLoc()); + RdxResult->replaceAllUsesWith(ExtractLastActive); + RdxResult->eraseFromParent(); + } + + return true; +} + bool VPlanTransforms::handleMultiUseReductions(VPlan &Plan) { for (auto &PhiR : make_early_inc_range( Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis())) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6491a2ce6813b..1865e55117a1f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -461,6 +461,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) { case VPInstruction::ActiveLaneMask: case VPInstruction::ComputeAnyOfResult: case VPInstruction::ReductionStartVector: + case VPInstruction::ExtractLastActive: return 3; case VPInstruction::ComputeFindIVResult: return 4; @@ -915,6 +916,15 @@ Value *VPInstruction::generate(VPTransformState &State) { } case VPInstruction::ResumeForEpilogue: return State.get(getOperand(0), true); + case VPInstruction::ExtractLastActive: { + Value *Data = State.get(getOperand(0)); + Value *Mask = State.get(getOperand(1)); + Value *Default = State.get(getOperand(2), /*IsScalar=*/true); + Type *VTy = Data->getType(); + return Builder.CreateIntrinsic( + Intrinsic::experimental_vector_extract_last_active, {VTy}, + {Data, Mask, Default}); + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -1074,6 +1084,15 @@ InstructionCost VPInstruction::computeCost(ElementCount VF, Instruction::Sub, Type::getInt64Ty(Ctx.LLVMCtx), Ctx.CostKind); return Cost; } + case VPInstruction::ExtractLastActive: { + Type *ScalarTy = Ctx.Types.inferScalarType(this); + Type *VecTy = toVectorTy(ScalarTy, VF); + Type *MaskTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF); + IntrinsicCostAttributes ICA( + Intrinsic::experimental_vector_extract_last_active, ScalarTy, + {VecTy, MaskTy, ScalarTy}); + return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind); + } case VPInstruction::FirstOrderRecurrenceSplice: { assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?"); SmallVector Mask(VF.getKnownMinValue()); @@ -1131,6 +1150,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::LastActiveLane || getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || + getOpcode() == VPInstruction::ExtractLastActive || getOpcode() == VPInstruction::ComputeReductionResult || getOpcode() == VPInstruction::AnyOf; } @@ -1197,6 +1217,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case VPInstruction::ExplicitVectorLength: case VPInstruction::FirstActiveLane: case VPInstruction::LastActiveLane: + case VPInstruction::ExtractLastActive: case VPInstruction::FirstOrderRecurrenceSplice: case VPInstruction::LogicalAnd: case VPInstruction::Not: @@ -1385,6 +1406,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent, case VPInstruction::Unpack: O << "unpack"; break; + case VPInstruction::ExtractLastActive: + O << "extract-last-active"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -4379,6 +4403,11 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { State.set(this, VecPhi); } +InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF, + VPCostContext &Ctx) const { + return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index afdf1655b4622..725e4ebefa8f4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -166,6 +166,14 @@ struct VPlanTransforms { /// this attempt was unsuccessful. static bool handleMaxMinNumReductions(VPlan &Plan); + /// Check if \p Plan contains any FindLast reductions. If it does, try to + /// update the vector loop to save the appropriate state using selects + /// for entire vectors for both the latest mask containing at least one active + /// element and the corresponding data vector. Return false if this attempt + /// was unsuccessful. + static bool handleFindLastReductions(VPlan &Plan, + VPRecipeBuilder &RecipeBuilder); + /// Clear NSW/NUW flags from reduction instructions if necessary. static void clearReductionWrapFlags(VPlan &Plan); diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll index e94a368d3ded0..e5e969d638acd 100644 --- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -683,3 +683,55 @@ loop: exit: ret <4 x i32> %rdx.next } + +define i32 @test_findlast_reduction(ptr %data, i32 %a) { +; CHECK-LABEL: define i32 @test_findlast_reduction( +; CHECK-SAME: ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; CHECK-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; CHECK-NEXT: [[SELECT_DATA:%.*]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[LD_ADDR_1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[LD_1:%.*]] = load i32, ptr [[LD_ADDR_1]], align 4 +; CHECK-NEXT: [[SELECT_CMP_1:%.*]] = icmp slt i32 [[A]], [[LD_1]] +; CHECK-NEXT: [[SELECT_DATA_1:%.*]] = select i1 [[SELECT_CMP_1]], i32 [[LD_1]], i32 [[SELECT_DATA]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[LD_ADDR_2:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[LD_2:%.*]] = load i32, ptr [[LD_ADDR_2]], align 4 +; CHECK-NEXT: [[SELECT_CMP_2:%.*]] = icmp slt i32 [[A]], [[LD_2]] +; CHECK-NEXT: [[SELECT_DATA_2:%.*]] = select i1 [[SELECT_CMP_2]], i32 [[LD_2]], i32 [[SELECT_DATA_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[LD_ADDR_3:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[LD_3:%.*]] = load i32, ptr [[LD_ADDR_3]], align 4 +; CHECK-NEXT: [[SELECT_CMP_3:%.*]] = icmp slt i32 [[A]], [[LD_3]] +; CHECK-NEXT: [[SELECT_DATA_3]] = select i1 [[SELECT_CMP_3]], i32 [[LD_3]], i32 [[SELECT_DATA_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[EXIT_CMP_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 200 +; CHECK-NEXT: br i1 [[EXIT_CMP_3]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA_3]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, 200 + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll new file mode 100644 index 0000000000000..1c8241130a875 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -0,0 +1,707 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=NEON +; RUN: opt -passes=loop-vectorize -mattr=+sve -S < %s 2>&1 | FileCheck %s --check-prefix=SVE + +target triple = "aarch64-linux-gnu" + +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +; NEON-LABEL: define i32 @simple_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; SVE-LABEL: define i32 @simple_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SVE: [[VECTOR_PH]]: +; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; SVE: [[VECTOR_BODY]]: +; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP4:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 +; SVE-NEXT: [[TMP13:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; SVE-NEXT: [[TMP7:%.*]] = freeze [[TMP13]] +; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP7]]) +; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], [[TMP13]], [[TMP4]] +; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], [[WIDE_LOAD]], [[VEC_PHI]] +; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SVE: [[MIDDLE_BLOCK]]: +; SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP10]], [[TMP9]], i32 -1) +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; SVE: [[SCALAR_PH]]: +; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) { +; NEON-LABEL: define ptr @simple_csa_ptr_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret ptr [[SELECT_DATA_LCSSA]] +; +; SVE-LABEL: define ptr @simple_csa_ptr_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret ptr [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi ptr [ %init, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds ptr, ptr %data, i64 %iv + %ld = load ptr, ptr %ld.addr, align 4 + %ld.i64 = ptrtoint ptr %ld to i64 + %select.cmp = icmp slt i64 %a, %ld.i64 + %select.data = select i1 %select.cmp, ptr %ld, ptr %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret ptr %select.data +} + +define float @simple_csa_float_select(i64 %N, ptr %data, float %a) { +; NEON-LABEL: define float @simple_csa_float_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret float [[SELECT_DATA_LCSSA]] +; +; SVE-LABEL: define float @simple_csa_float_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret float [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi float [ -1.0, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds float, ptr %data, i64 %iv + %ld = load float, ptr %ld.addr, align 4 + %select.cmp = fcmp olt float %a, %ld + %select.data = select i1 %select.cmp, float %ld, float %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret float %select.data +} + +define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) { +; NEON-LABEL: define i32 @multi_user_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]] +; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; SVE-LABEL: define i32 @multi_user_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]] +; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %res.addr = getelementptr inbounds i32, ptr %results, i64 %iv + store i32 %select.data, ptr %res.addr, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + + +define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) { +; NEON-LABEL: define i32 @multi_use_cmp_for_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ] +; NEON-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32 +; NEON-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]] +; NEON-NEXT: ret i32 [[RES]] +; +; SVE-LABEL: define i32 @multi_use_cmp_for_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ] +; SVE-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32 +; SVE-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]] +; SVE-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %idx.phi = phi i64 [ -1, %entry ], [ %select.idx, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %select.idx = select i1 %select.cmp, i64 %iv, i64 %idx.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + %idx = trunc i64 %select.idx to i32 + %res = add i32 %idx, %select.data + ret i32 %res +} + + +define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i32 %a, i32 %b) { +; NEON-LABEL: define i32 @chained_select_for_csa_int_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]] +; NEON-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] +; NEON-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] +; NEON-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]] +; NEON-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; SVE-LABEL: define i32 @chained_select_for_csa_int_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]] +; SVE-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] +; SVE-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] +; SVE-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]] +; SVE-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld1.addr = getelementptr inbounds i32, ptr %data1, i64 %iv + %ld1 = load i32, ptr %ld1.addr, align 4 + %select.cmp1 = icmp slt i32 %a, %ld1 + %select.ld1 = select i1 %select.cmp1, i32 %ld1, i32 %data.phi + %ld2.addr = getelementptr inbounds i32, ptr %data2, i64 %iv + %ld2 = load i32, ptr %ld2.addr, align 4 + %select.cmp2 = icmp sgt i32 %b, %ld2 + %select.data = select i1 %select.cmp2, i32 %ld2, i32 %select.ld1 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +define i32 @csa_with_extra_use_of_select(i64 %N, ptr readonly %data, ptr noalias %out, i32 %a) { +; NEON-LABEL: define i32 @csa_with_extra_use_of_select( +; NEON-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; NEON-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]] +; NEON-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4 +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; SVE-LABEL: define i32 @csa_with_extra_use_of_select( +; SVE-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; SVE-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]] +; SVE-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4 +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; SVE-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %st.addr = getelementptr inbounds i32, ptr %out, i64 %iv + store i32 %select.data, ptr %st.addr, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +;; Add more work to the loop besides the CSA to check cost modelling for NEON. +define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %threshold) { +; NEON-LABEL: define i32 @int_select_with_extra_arith_payload( +; NEON-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NEON: [[VECTOR_PH]]: +; NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[THRESHOLD]], i64 0 +; NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; NEON-NEXT: br label %[[VECTOR_BODY:.*]] +; NEON: [[VECTOR_BODY]]: +; NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; NEON-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; NEON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; NEON-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[WIDE_LOAD]], splat (i32 13) +; NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; NEON-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[WIDE_LOAD1]], splat (i32 5) +; NEON-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; NEON-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; NEON-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; NEON-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; NEON-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] +; NEON-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; NEON-NEXT: [[TMP10]] = select i1 [[TMP9]], <4 x i1> [[TMP7]], <4 x i1> [[TMP0]] +; NEON-NEXT: [[TMP11]] = select i1 [[TMP9]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]] +; NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NEON-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NEON: [[MIDDLE_BLOCK]]: +; NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP11]], <4 x i1> [[TMP10]], i32 -1) +; NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NEON-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; NEON: [[SCALAR_PH]]: +; NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NEON-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; NEON-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13 +; NEON-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; NEON-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4 +; NEON-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5 +; NEON-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]] +; NEON-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; NEON-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]] +; NEON-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] +; NEON-NEXT: ret i32 [[SELECT_A_LCSSA]] +; +; SVE-LABEL: define i32 @int_select_with_extra_arith_payload( +; SVE-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SVE: [[VECTOR_PH]]: +; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[THRESHOLD]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; SVE: [[VECTOR_BODY]]: +; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP4:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 +; SVE-NEXT: [[TMP6:%.*]] = mul [[WIDE_LOAD]], splat (i32 13) +; SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 4 +; SVE-NEXT: [[TMP8:%.*]] = mul [[WIDE_LOAD1]], splat (i32 5) +; SVE-NEXT: [[TMP9:%.*]] = add [[TMP6]], [[TMP8]] +; SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; SVE-NEXT: store [[TMP9]], ptr [[TMP10]], align 4 +; SVE-NEXT: [[TMP11:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; SVE-NEXT: [[TMP12:%.*]] = freeze [[TMP11]] +; SVE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP12]]) +; SVE-NEXT: [[TMP14]] = select i1 [[TMP13]], [[TMP11]], [[TMP4]] +; SVE-NEXT: [[TMP15]] = select i1 [[TMP13]], [[WIDE_LOAD]], [[VEC_PHI]] +; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; SVE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SVE-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SVE: [[MIDDLE_BLOCK]]: +; SVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP15]], [[TMP14]], i32 -1) +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; SVE: [[SCALAR_PH]]: +; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; SVE-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; SVE-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13 +; SVE-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; SVE-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4 +; SVE-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5 +; SVE-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]] +; SVE-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; SVE-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]] +; SVE-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ] +; SVE-NEXT: ret i32 [[SELECT_A_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %A.phi = phi i32 [ -1, %entry ], [ %select.A, %loop ] + %A.addr = getelementptr inbounds i32, ptr %A, i64 %iv + %ld.A = load i32, ptr %A.addr, align 4 + %mul.A = mul i32 %ld.A, 13 + %B.addr = getelementptr inbounds i32, ptr %B, i64 %iv + %ld.B = load i32, ptr %B.addr, align 4 + %mul.B = mul i32 %ld.B, 5 + %add = add i32 %mul.A, %mul.B + %C.addr = getelementptr inbounds i32, ptr %C, i64 %iv + store i32 %add, ptr %C.addr, align 4 + %select.cmp = icmp slt i32 %threshold, %ld.A + %select.A = select i1 %select.cmp, i32 %ld.A, i32 %A.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.A +} + +define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) { +; NEON-LABEL: define i8 @simple_csa_byte_select( +; NEON-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i8 [[A:%.*]]) { +; NEON-NEXT: [[ENTRY:.*]]: +; NEON-NEXT: br label %[[LOOP:.*]] +; NEON: [[LOOP]]: +; NEON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[DATA_PHI:%.*]] = phi i8 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; NEON-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[IV]] +; NEON-NEXT: [[LD:%.*]] = load i8, ptr [[LD_ADDR]], align 4 +; NEON-NEXT: [[SELECT_CMP:%.*]] = icmp slt i8 [[A]], [[LD]] +; NEON-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]] +; NEON-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NEON-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NEON-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; NEON: [[EXIT]]: +; NEON-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP]] ] +; NEON-NEXT: ret i8 [[SELECT_DATA_LCSSA]] +; +; SVE-LABEL: define i8 @simple_csa_byte_select( +; SVE-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i8 [[A:%.*]]) #[[ATTR0]] { +; SVE-NEXT: [[ENTRY:.*]]: +; SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 +; SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; SVE: [[VECTOR_PH]]: +; SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[A]], i64 0 +; SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; SVE: [[VECTOR_BODY]]: +; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i8 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP4:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[INDEX]] +; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 +; SVE-NEXT: [[TMP6:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; SVE-NEXT: [[TMP7:%.*]] = freeze [[TMP6]] +; SVE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP7]]) +; SVE-NEXT: [[TMP9]] = select i1 [[TMP8]], [[TMP6]], [[TMP4]] +; SVE-NEXT: [[TMP10]] = select i1 [[TMP8]], [[WIDE_LOAD]], [[VEC_PHI]] +; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; SVE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SVE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SVE: [[MIDDLE_BLOCK]]: +; SVE-NEXT: [[TMP12:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8( [[TMP10]], [[TMP9]], i8 -1) +; SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; SVE: [[SCALAR_PH]]: +; SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; SVE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; SVE-NEXT: br label %[[LOOP:.*]] +; SVE: [[LOOP]]: +; SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[DATA_PHI:%.*]] = phi i8 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; SVE-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[IV]] +; SVE-NEXT: [[LD:%.*]] = load i8, ptr [[LD_ADDR]], align 4 +; SVE-NEXT: [[SELECT_CMP:%.*]] = icmp slt i8 [[A]], [[LD]] +; SVE-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]] +; SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; SVE-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; SVE-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SVE: [[EXIT]]: +; SVE-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] +; SVE-NEXT: ret i8 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i8 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i8, ptr %data, i64 %iv + %ld = load i8, ptr %ld.addr, align 4 + %select.cmp = icmp slt i8 %a, %ld + %select.data = select i1 %select.cmp, i8 %ld, i8 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i8 %select.data +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll b/llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll new file mode 100644 index 0000000000000..8e4a8457414bc --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/findlast-epilogue-loop.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -mattr=+sve -epilogue-vectorization-force-VF=4 -S < %s 2>&1 | FileCheck %s + +;; This test is currently ensuring we don't crash when vectorizing loops with +;; conditional scalar assignment when epilogue vectorization is either requested +;; or costed as profitable. + +target triple = "aarch64-linux-gnu" + +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +; CHECK-LABEL: define i32 @simple_csa_int_select( +; CHECK-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP7:%.*]] = freeze [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP7]]) +; CHECK-NEXT: [[TMP9]] = select i1 [[TMP8]], [[TMP6]], [[TMP4]] +; CHECK-NEXT: [[TMP10]] = select i1 [[TMP8]], [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32( [[TMP10]], [[TMP9]], i32 -1) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; CHECK-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; CHECK-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; CHECK-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/X86/conditional-scalar-assignment.ll new file mode 100644 index 0000000000000..83eb068bd79a9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/conditional-scalar-assignment.ll @@ -0,0 +1,754 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=X86 +; RUN: opt -passes=loop-vectorize -mattr=+avx512f -S < %s 2>&1 | FileCheck %s --check-prefix=AVX512 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +; X86-LABEL: define i32 @simple_csa_int_select( +; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; X86-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; X86: [[VECTOR_PH]]: +; X86-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; X86-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; X86-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +; X86-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; X86-NEXT: br label %[[VECTOR_BODY:.*]] +; X86: [[VECTOR_BODY]]: +; X86-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; X86-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; X86-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; X86-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; X86-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; X86-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[TMP0]] +; X86-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]] +; X86-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; X86-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; X86-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; X86: [[MIDDLE_BLOCK]]: +; X86-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 -1) +; X86-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; X86-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; X86: [[SCALAR_PH]]: +; X86-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; X86-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; AVX512-LABEL: define i32 @simple_csa_int_select( +; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; AVX512: [[VECTOR_PH]]: +; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[A]], i64 0 +; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; AVX512-NEXT: br label %[[VECTOR_BODY:.*]] +; AVX512: [[VECTOR_BODY]]: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = phi <16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[INDEX]] +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 4 +; AVX512-NEXT: [[TMP2:%.*]] = icmp slt <16 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]] +; AVX512-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) +; AVX512-NEXT: [[TMP5]] = select i1 [[TMP4]], <16 x i1> [[TMP2]], <16 x i1> [[TMP0]] +; AVX512-NEXT: [[TMP6]] = select i1 [[TMP4]], <16 x i32> [[WIDE_LOAD]], <16 x i32> [[VEC_PHI]] +; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX512: [[MIDDLE_BLOCK]]: +; AVX512-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> [[TMP6]], <16 x i1> [[TMP5]], i32 -1) +; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; AVX512-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; AVX512: [[SCALAR_PH]]: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; AVX512-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +define ptr @simple_csa_ptr_select(i64 %N, ptr %data, i64 %a, ptr %init) { +; X86-LABEL: define ptr @simple_csa_ptr_select( +; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]] +; X86-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 +; X86-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 +; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] +; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]] +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ] +; X86-NEXT: ret ptr [[SELECT_DATA_LCSSA]] +; +; AVX512-LABEL: define ptr @simple_csa_ptr_select( +; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i64 [[A:%.*]], ptr [[INIT:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[DATA_PHI:%.*]] = phi ptr [ [[INIT]], %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds ptr, ptr [[DATA]], i64 [[IV]] +; AVX512-NEXT: [[LD:%.*]] = load ptr, ptr [[LD_ADDR]], align 4 +; AVX512-NEXT: [[LD_I64:%.*]] = ptrtoint ptr [[LD]] to i64 +; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i64 [[A]], [[LD_I64]] +; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], ptr [[LD]], ptr [[DATA_PHI]] +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi ptr [ [[SELECT_DATA]], %[[LOOP]] ] +; AVX512-NEXT: ret ptr [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi ptr [ %init, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds ptr, ptr %data, i64 %iv + %ld = load ptr, ptr %ld.addr, align 4 + %ld.i64 = ptrtoint ptr %ld to i64 + %select.cmp = icmp slt i64 %a, %ld.i64 + %select.data = select i1 %select.cmp, ptr %ld, ptr %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret ptr %select.data +} + +define float @simple_csa_float_select(i64 %N, ptr %data, float %a) { +; X86-LABEL: define float @simple_csa_float_select( +; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]] +; X86-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] +; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ] +; X86-NEXT: ret float [[SELECT_DATA_LCSSA]] +; +; AVX512-LABEL: define float @simple_csa_float_select( +; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], float [[A:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[DATA_PHI:%.*]] = phi float [ -1.000000e+00, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds float, ptr [[DATA]], i64 [[IV]] +; AVX512-NEXT: [[LD:%.*]] = load float, ptr [[LD_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP:%.*]] = fcmp olt float [[A]], [[LD]] +; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], float [[LD]], float [[DATA_PHI]] +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi float [ [[SELECT_DATA]], %[[LOOP]] ] +; AVX512-NEXT: ret float [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi float [ -1.0, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds float, ptr %data, i64 %iv + %ld = load float, ptr %ld.addr, align 4 + %select.cmp = fcmp olt float %a, %ld + %select.data = select i1 %select.cmp, float %ld, float %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret float %select.data +} + +define i32 @multi_user_csa_int_select(i64 %N, ptr %data, ptr %results, i32 %a) { +; X86-LABEL: define i32 @multi_user_csa_int_select( +; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; X86-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]] +; X86-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; AVX512-LABEL: define i32 @multi_user_csa_int_select( +; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], ptr [[RESULTS:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; AVX512-NEXT: [[RES_ADDR:%.*]] = getelementptr inbounds i32, ptr [[RESULTS]], i64 [[IV]] +; AVX512-NEXT: store i32 [[SELECT_DATA]], ptr [[RES_ADDR]], align 4 +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %res.addr = getelementptr inbounds i32, ptr %results, i64 %iv + store i32 %select.data, ptr %res.addr, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + + +define i32 @multi_use_cmp_for_csa_int_select(i64 %N, ptr %data, i32 %a) { +; X86-LABEL: define i32 @multi_use_cmp_for_csa_int_select( +; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; X86-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] +; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; X86-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]] +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; X86-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ] +; X86-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32 +; X86-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]] +; X86-NEXT: ret i32 [[RES]] +; +; AVX512-LABEL: define i32 @multi_use_cmp_for_csa_int_select( +; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[IDX_PHI:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[SELECT_IDX:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; AVX512-NEXT: [[SELECT_IDX]] = select i1 [[SELECT_CMP]], i64 [[IV]], i64 [[IDX_PHI]] +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; AVX512-NEXT: [[SELECT_IDX_LCSSA:%.*]] = phi i64 [ [[SELECT_IDX]], %[[LOOP]] ] +; AVX512-NEXT: [[IDX:%.*]] = trunc i64 [[SELECT_IDX_LCSSA]] to i32 +; AVX512-NEXT: [[RES:%.*]] = add i32 [[IDX]], [[SELECT_DATA_LCSSA]] +; AVX512-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %idx.phi = phi i64 [ -1, %entry ], [ %select.idx, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %select.idx = select i1 %select.cmp, i64 %iv, i64 %idx.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + %idx = trunc i64 %select.idx to i32 + %res = add i32 %idx, %select.data + ret i32 %res +} + + +define i32 @chained_select_for_csa_int_select(i64 %N, ptr %data1, ptr %data2, i32 %a, i32 %b) { +; X86-LABEL: define i32 @chained_select_for_csa_int_select( +; X86-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; X86-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]] +; X86-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] +; X86-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] +; X86-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]] +; X86-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] +; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; AVX512-LABEL: define i32 @chained_select_for_csa_int_select( +; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA1:%.*]], ptr [[DATA2:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[LD1_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA1]], i64 [[IV]] +; AVX512-NEXT: [[LD1:%.*]] = load i32, ptr [[LD1_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP1:%.*]] = icmp slt i32 [[A]], [[LD1]] +; AVX512-NEXT: [[SELECT_LD1:%.*]] = select i1 [[SELECT_CMP1]], i32 [[LD1]], i32 [[DATA_PHI]] +; AVX512-NEXT: [[LD2_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA2]], i64 [[IV]] +; AVX512-NEXT: [[LD2:%.*]] = load i32, ptr [[LD2_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP2:%.*]] = icmp sgt i32 [[B]], [[LD2]] +; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP2]], i32 [[LD2]], i32 [[SELECT_LD1]] +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld1.addr = getelementptr inbounds i32, ptr %data1, i64 %iv + %ld1 = load i32, ptr %ld1.addr, align 4 + %select.cmp1 = icmp slt i32 %a, %ld1 + %select.ld1 = select i1 %select.cmp1, i32 %ld1, i32 %data.phi + %ld2.addr = getelementptr inbounds i32, ptr %data2, i64 %iv + %ld2 = load i32, ptr %ld2.addr, align 4 + %select.cmp2 = icmp sgt i32 %b, %ld2 + %select.data = select i1 %select.cmp2, i32 %ld2, i32 %select.ld1 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +define i32 @csa_with_extra_use_of_select(i64 %N, ptr readonly %data, ptr noalias %out, i32 %a) { +; X86-LABEL: define i32 @csa_with_extra_use_of_select( +; X86-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; X86-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; X86-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]] +; X86-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4 +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; X86-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +; AVX512-LABEL: define i32 @csa_with_extra_use_of_select( +; AVX512-SAME: i64 [[N:%.*]], ptr readonly [[DATA:%.*]], ptr noalias [[OUT:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i64 [[IV]] +; AVX512-NEXT: [[LD:%.*]] = load i32, ptr [[LD_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[A]], [[LD]] +; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i32 [[LD]], i32 [[DATA_PHI]] +; AVX512-NEXT: [[ST_ADDR:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IV]] +; AVX512-NEXT: store i32 [[SELECT_DATA]], ptr [[ST_ADDR]], align 4 +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT:.*]], label %[[LOOP]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i32 [ [[SELECT_DATA]], %[[LOOP]] ] +; AVX512-NEXT: ret i32 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %st.addr = getelementptr inbounds i32, ptr %out, i64 %iv + store i32 %select.data, ptr %st.addr, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +;; Add more work to the loop besides the CSA to check cost modelling. +define i32 @int_select_with_extra_arith_payload(i64 %N, ptr readonly %A, ptr readonly %B, ptr noalias %C, i32 %threshold) { +; X86-LABEL: define i32 @int_select_with_extra_arith_payload( +; X86-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; X86-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; X86: [[VECTOR_PH]]: +; X86-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; X86-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; X86-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[THRESHOLD]], i64 0 +; X86-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; X86-NEXT: br label %[[VECTOR_BODY:.*]] +; X86: [[VECTOR_BODY]]: +; X86-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[TMP0:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; X86-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; X86-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[WIDE_LOAD]], splat (i32 13) +; X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; X86-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; X86-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[WIDE_LOAD1]], splat (i32 5) +; X86-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +; X86-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; X86-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; X86-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; X86-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] +; X86-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; X86-NEXT: [[TMP10]] = select i1 [[TMP9]], <4 x i1> [[TMP7]], <4 x i1> [[TMP0]] +; X86-NEXT: [[TMP11]] = select i1 [[TMP9]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[VEC_PHI]] +; X86-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; X86-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; X86-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; X86: [[MIDDLE_BLOCK]]: +; X86-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP11]], <4 x i1> [[TMP10]], i32 -1) +; X86-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; X86-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; X86: [[SCALAR_PH]]: +; X86-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; X86-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ] +; X86-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; X86-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; X86-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13 +; X86-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; X86-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4 +; X86-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5 +; X86-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]] +; X86-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; X86-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]] +; X86-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]] +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] +; X86-NEXT: ret i32 [[SELECT_A_LCSSA]] +; +; AVX512-LABEL: define i32 @int_select_with_extra_arith_payload( +; AVX512-SAME: i64 [[N:%.*]], ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]], i32 [[THRESHOLD:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; AVX512: [[VECTOR_PH]]: +; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[THRESHOLD]], i64 0 +; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; AVX512-NEXT: br label %[[VECTOR_BODY:.*]] +; AVX512: [[VECTOR_BODY]]: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = phi <16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 4 +; AVX512-NEXT: [[TMP2:%.*]] = mul <16 x i32> [[WIDE_LOAD]], splat (i32 13) +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i32>, ptr [[TMP3]], align 4 +; AVX512-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[WIDE_LOAD1]], splat (i32 5) +; AVX512-NEXT: [[TMP5:%.*]] = add <16 x i32> [[TMP2]], [[TMP4]] +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; AVX512-NEXT: store <16 x i32> [[TMP5]], ptr [[TMP6]], align 4 +; AVX512-NEXT: [[TMP7:%.*]] = icmp slt <16 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP8:%.*]] = freeze <16 x i1> [[TMP7]] +; AVX512-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP8]]) +; AVX512-NEXT: [[TMP10]] = select i1 [[TMP9]], <16 x i1> [[TMP7]], <16 x i1> [[TMP0]] +; AVX512-NEXT: [[TMP11]] = select i1 [[TMP9]], <16 x i32> [[WIDE_LOAD]], <16 x i32> [[VEC_PHI]] +; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; AVX512: [[MIDDLE_BLOCK]]: +; AVX512-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> [[TMP11]], <16 x i1> [[TMP10]], i32 -1) +; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; AVX512-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; AVX512: [[SCALAR_PH]]: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; AVX512-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[A_PHI:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_A:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[A_ADDR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; AVX512-NEXT: [[LD_A:%.*]] = load i32, ptr [[A_ADDR]], align 4 +; AVX512-NEXT: [[MUL_A:%.*]] = mul i32 [[LD_A]], 13 +; AVX512-NEXT: [[B_ADDR:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; AVX512-NEXT: [[LD_B:%.*]] = load i32, ptr [[B_ADDR]], align 4 +; AVX512-NEXT: [[MUL_B:%.*]] = mul i32 [[LD_B]], 5 +; AVX512-NEXT: [[ADD:%.*]] = add i32 [[MUL_A]], [[MUL_B]] +; AVX512-NEXT: [[C_ADDR:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; AVX512-NEXT: store i32 [[ADD]], ptr [[C_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i32 [[THRESHOLD]], [[LD_A]] +; AVX512-NEXT: [[SELECT_A]] = select i1 [[SELECT_CMP]], i32 [[LD_A]], i32 [[A_PHI]] +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_A_LCSSA:%.*]] = phi i32 [ [[SELECT_A]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] +; AVX512-NEXT: ret i32 [[SELECT_A_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %A.phi = phi i32 [ -1, %entry ], [ %select.A, %loop ] + %A.addr = getelementptr inbounds i32, ptr %A, i64 %iv + %ld.A = load i32, ptr %A.addr, align 4 + %mul.A = mul i32 %ld.A, 13 + %B.addr = getelementptr inbounds i32, ptr %B, i64 %iv + %ld.B = load i32, ptr %B.addr, align 4 + %mul.B = mul i32 %ld.B, 5 + %add = add i32 %mul.A, %mul.B + %C.addr = getelementptr inbounds i32, ptr %C, i64 %iv + store i32 %add, ptr %C.addr, align 4 + %select.cmp = icmp slt i32 %threshold, %ld.A + %select.A = select i1 %select.cmp, i32 %ld.A, i32 %A.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.A +} + +define i8 @simple_csa_byte_select(i64 %N, ptr %data, i8 %a) { +; X86-LABEL: define i8 @simple_csa_byte_select( +; X86-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i8 [[A:%.*]]) { +; X86-NEXT: [[ENTRY:.*]]: +; X86-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; X86-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; X86: [[VECTOR_PH]]: +; X86-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; X86-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; X86-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 +; X86-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; X86-NEXT: br label %[[VECTOR_BODY:.*]] +; X86: [[VECTOR_BODY]]: +; X86-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ splat (i8 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[TMP0:%.*]] = phi <16 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[INDEX]] +; X86-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 4 +; X86-NEXT: [[TMP2:%.*]] = icmp slt <16 x i8> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; X86-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]] +; X86-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) +; X86-NEXT: [[TMP5]] = select i1 [[TMP4]], <16 x i1> [[TMP2]], <16 x i1> [[TMP0]] +; X86-NEXT: [[TMP6]] = select i1 [[TMP4]], <16 x i8> [[WIDE_LOAD]], <16 x i8> [[VEC_PHI]] +; X86-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; X86-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; X86-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; X86: [[MIDDLE_BLOCK]]: +; X86-NEXT: [[TMP8:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> [[TMP6]], <16 x i1> [[TMP5]], i8 -1) +; X86-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; X86-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; X86: [[SCALAR_PH]]: +; X86-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; X86-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; X86-NEXT: br label %[[LOOP:.*]] +; X86: [[LOOP]]: +; X86-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; X86-NEXT: [[DATA_PHI:%.*]] = phi i8 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; X86-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[IV]] +; X86-NEXT: [[LD:%.*]] = load i8, ptr [[LD_ADDR]], align 4 +; X86-NEXT: [[SELECT_CMP:%.*]] = icmp slt i8 [[A]], [[LD]] +; X86-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]] +; X86-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; X86-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; X86-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; X86: [[EXIT]]: +; X86-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; X86-NEXT: ret i8 [[SELECT_DATA_LCSSA]] +; +; AVX512-LABEL: define i8 @simple_csa_byte_select( +; AVX512-SAME: i64 [[N:%.*]], ptr [[DATA:%.*]], i8 [[A:%.*]]) #[[ATTR0]] { +; AVX512-NEXT: [[ENTRY:.*]]: +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 64 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; AVX512: [[VECTOR_PH]]: +; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 64 +; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i8> poison, i8 [[A]], i64 0 +; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i8> [[BROADCAST_SPLATINSERT]], <64 x i8> poison, <64 x i32> zeroinitializer +; AVX512-NEXT: br label %[[VECTOR_BODY:.*]] +; AVX512: [[VECTOR_BODY]]: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ splat (i8 -1), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = phi <64 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[INDEX]] +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, ptr [[TMP1]], align 4 +; AVX512-NEXT: [[TMP2:%.*]] = icmp slt <64 x i8> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP3:%.*]] = freeze <64 x i1> [[TMP2]] +; AVX512-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> [[TMP3]]) +; AVX512-NEXT: [[TMP5]] = select i1 [[TMP4]], <64 x i1> [[TMP2]], <64 x i1> [[TMP0]] +; AVX512-NEXT: [[TMP6]] = select i1 [[TMP4]], <64 x i8> [[WIDE_LOAD]], <64 x i8> [[VEC_PHI]] +; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 +; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; AVX512: [[MIDDLE_BLOCK]]: +; AVX512-NEXT: [[TMP8:%.*]] = call i8 @llvm.experimental.vector.extract.last.active.v64i8(<64 x i8> [[TMP6]], <64 x i1> [[TMP5]], i8 -1) +; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; AVX512-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; AVX512: [[SCALAR_PH]]: +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; AVX512-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] +; AVX512-NEXT: br label %[[LOOP:.*]] +; AVX512: [[LOOP]]: +; AVX512-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[DATA_PHI:%.*]] = phi i8 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SELECT_DATA:%.*]], %[[LOOP]] ] +; AVX512-NEXT: [[LD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 [[IV]] +; AVX512-NEXT: [[LD:%.*]] = load i8, ptr [[LD_ADDR]], align 4 +; AVX512-NEXT: [[SELECT_CMP:%.*]] = icmp slt i8 [[A]], [[LD]] +; AVX512-NEXT: [[SELECT_DATA]] = select i1 [[SELECT_CMP]], i8 [[LD]], i8 [[DATA_PHI]] +; AVX512-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; AVX512-NEXT: [[EXIT_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; AVX512-NEXT: br i1 [[EXIT_CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; AVX512: [[EXIT]]: +; AVX512-NEXT: [[SELECT_DATA_LCSSA:%.*]] = phi i8 [ [[SELECT_DATA]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; AVX512-NEXT: ret i8 [[SELECT_DATA_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i8 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i8, ptr %data, i64 %iv + %ld = load i8, ptr %ld.addr, align 4 + %select.cmp = icmp slt i8 %a, %ld + %select.data = select i1 %select.cmp, i8 %ld, i8 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i8 %select.data +} diff --git a/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll new file mode 100644 index 0000000000000..788b35e88734e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/conditional-scalar-assignment-vplan.ll @@ -0,0 +1,77 @@ +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -force-vector-width=4 -disable-output 2>&1 < %s | FileCheck %s + + +; This function is derived from the following C program: +; int simple_csa_int_select(int N, int *data, int a) { +; int t = -1; +; for (int i = 0; i < N; i++) { +; if (a < data[i]) +; t = data[i]; +; } +; return t; +; } +define i32 @simple_csa_int_select(i64 %N, ptr %data, i32 %a) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %data.phi = phi i32 [ -1, %entry ], [ %select.data, %loop ] + %ld.addr = getelementptr inbounds i32, ptr %data, i64 %iv + %ld = load i32, ptr %ld.addr, align 4 + %select.cmp = icmp slt i32 %a, %ld + %select.data = select i1 %select.cmp, i32 %ld, i32 %data.phi + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cmp = icmp eq i64 %iv.next, %N + br i1 %exit.cmp, label %exit, label %loop + +exit: + ret i32 %select.data +} + +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.*]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.*]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VECTC:%.*]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<[[ORIGTC:%.*]]> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CIV:%.*]]> = CANONICAL-INDUCTION ir<0>, vp<[[INDEXNEXT:%.*]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[DATAPHI:%.*]]> = phi ir<-1>, vp<[[DATASELECT:%.*]]> +; CHECK-NEXT: WIDEN-PHI vp<[[MASKPHI:%.*]]> = phi [ ir, vector.ph ], [ vp<[[MASKSELECT:%.*]]>, vector.body ] +; CHECK-NEXT: vp<[[STEPS:%.*]]> = SCALAR-STEPS vp<[[CIV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: CLONE ir<[[LDADDR:%.*]]> = getelementptr inbounds ir<%data>, vp<[[STEPS:%.*]]> +; CHECK-NEXT: vp<[[VPTR:%.*]]> = vector-pointer inbounds ir<[[LDADDR]]> +; CHECK-NEXT: WIDEN ir<[[LD:%.*]]> = load vp<[[VPTR]]> +; CHECK-NEXT: WIDEN ir<[[SELECTCMP:%.*]]> = icmp slt ir<%a>, ir<[[LD]]> +; CHECK-NEXT: EMIT vp<[[ANYOF:%.*]]> = any-of ir<[[SELECTCMP]]> +; CHECK-NEXT: EMIT vp<[[MASKSELECT]]> = select vp<[[ANYOF]]>, ir<[[SELECTCMP]]>, vp<[[MASKPHI]]> +; CHECK-NEXT: EMIT vp<[[DATASELECT]]> = select vp<[[ANYOF]]>, ir<[[LD]]>, ir<[[DATAPHI]]> +; CHECK-NEXT: EMIT vp<[[INDEXNEXT]]> = add nuw vp<[[CIV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[INDEXNEXT]]>, vp<[[VECTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[EXTRACTLAST:%.*]]> = extract-last-active vp<[[DATASELECT]]>, vp<[[MASKSELECT]]>, ir<-1> +; CHECK-NEXT: EMIT vp<[[TCCMP:%.*]]> = icmp eq ir<[[ORIGTC]]>, vp<[[VECTC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[TCCMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR [[SELECTLCSSA:%.*]] = phi i32 [ [[SELECTDATA:%.*]], %loop ] (extra operand: vp<[[EXTRACTLAST]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<[[RESUMEVAL:%.*]]> = phi [ vp<[[VECTC]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<[[MERGERDX:%.*]]> = phi [ vp<[[EXTRACTLAST]]>, middle.block ], [ ir<-1>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index 2200a7d0431d2..890b6ccba0796 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC1VF4 %s -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF4 %s -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefixes=CHECK,IC4VF1 %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck --check-prefix=IC1VF4 %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck --check-prefix=IC4VF4 %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck --check-prefix=IC4VF1 %s define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_const_start( @@ -961,29 +961,142 @@ exit: ; preds = %loop } ; The unsigned sentinel value for decreasing-IV vectorization is ULONG_MAX, -; and since the IV hits this value, it is impossible to vectorize this case. +; and since the IV hits this value, it cannot be vectorized as a FindLastIV +; reduction. Instead, it is recognized and vectorized as a generic FindLast. ; In this test, %iv's range will include both signed and unsigned ; maximum (sentinel) values. -define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_iv_out_of_bound( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 -; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 -; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +define i64 @select_decreasing_induction_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start) { +; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*:]] +; IC1VF4-NEXT: br label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC1VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; IC1VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +; IC1VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; IC1VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 -3 +; IC1VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; IC1VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] +; IC1VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]] +; IC1VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) +; IC1VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC1VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4 +; IC1VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC1VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]]) +; IC1VF4-NEXT: br label %[[SCALAR_PH:.*]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC1VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*:]] +; IC4VF4-NEXT: br label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[TMP0:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC4VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 +; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] +; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 0 +; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 +; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; IC4VF4-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 -3 +; IC4VF4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 +; IC4VF4-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD1]], <4 x i8> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i8> [[REVERSE]], [[REVERSE2]] +; IC4VF4-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP8]] +; IC4VF4-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP9]]) +; IC4VF4-NEXT: [[TMP11]] = select i1 [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC4VF4-NEXT: [[TMP12]] = select i1 [[TMP10]], <4 x i64> [[TMP0]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC4VF4-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], -4 +; IC4VF4-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC4VF4-NEXT: [[TMP15:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP12]], <4 x i1> [[TMP11]], i64 [[TMP14]]) +; IC4VF4-NEXT: br label %[[SCALAR_PH:.*]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[TMP15]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_out_of_bound( +; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ -1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1 +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_A:%.*]] = load i8, ptr [[GEP_A_IV]], align 1 +; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_B:%.*]] = load i8, ptr [[GEP_B_IV]], align 1 +; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i8 [[LD_A]], [[LD_B]] +; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 0 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF1-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %loop @@ -1005,26 +1118,164 @@ exit: ret i64 %cond } -define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { -; CHECK-LABEL: define i64 @not_vectorized_select_decreasing_induction_icmp_non_const_start( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; CHECK-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 -; CHECK-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[COND_LCSSA]] +define i64 @select_decreasing_induction_icmp_non_const_start(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { +; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start( +; IC1VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; IC1VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1) +; IC1VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]] +; IC1VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; IC1VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1VF4: [[VECTOR_PH]]: +; IC1VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; IC1VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC1VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]] +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; IC1VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC1VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], +; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1VF4: [[VECTOR_BODY]]: +; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC1VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC1VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; IC1VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; IC1VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 0 +; IC1VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3 +; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; IC1VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0 +; IC1VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 -3 +; IC1VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; IC1VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> +; IC1VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] +; IC1VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]] +; IC1VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) +; IC1VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC1VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] +; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC1VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC1VF4: [[MIDDLE_BLOCK]]: +; IC1VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC1VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]]) +; IC1VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC1VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1VF4: [[SCALAR_PH]]: +; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ] +; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC1VF4-NEXT: br label %[[LOOP:.*]] +; IC1VF4: [[LOOP]]: +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC1VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC1VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC1VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC1VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; IC1VF4: [[EXIT]]: +; IC1VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; IC1VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start( +; IC4VF4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC4VF4-NEXT: [[ENTRY:.*]]: +; IC4VF4-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; IC4VF4-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 1) +; IC4VF4-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[UMIN]] +; IC4VF4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; IC4VF4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC4VF4: [[VECTOR_PH]]: +; IC4VF4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; IC4VF4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; IC4VF4-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[N_VEC]] +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[N]], i64 0 +; IC4VF4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IC4VF4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], +; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; IC4VF4: [[VECTOR_BODY]]: +; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; IC4VF4-NEXT: [[TMP3:%.*]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -1) +; IC4VF4-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] +; IC4VF4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i64 0 +; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 -3 +; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP4]] +; IC4VF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 0 +; IC4VF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 -3 +; IC4VF4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; IC4VF4-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD3]], <4 x i64> poison, <4 x i32> +; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE]], [[REVERSE4]] +; IC4VF4-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP11]] +; IC4VF4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP12]]) +; IC4VF4-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; IC4VF4-NEXT: [[TMP15]] = select i1 [[TMP13]], <4 x i64> [[TMP3]], <4 x i64> [[VEC_PHI]] +; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 -4) +; IC4VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC4VF4-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IC4VF4: [[MIDDLE_BLOCK]]: +; IC4VF4-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP15]], <4 x i1> [[TMP14]], i64 [[TMP17]]) +; IC4VF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; IC4VF4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC4VF4: [[SCALAR_PH]]: +; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[N]], %[[ENTRY]] ] +; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF4-NEXT: br label %[[LOOP:.*]] +; IC4VF4: [[LOOP]]: +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF4-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF4-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC4VF4-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC4VF4-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] +; IC4VF4: [[EXIT]]: +; IC4VF4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ] +; IC4VF4-NEXT: ret i64 [[COND_LCSSA]] +; +; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_non_const_start( +; IC4VF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { +; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: br label %[[LOOP:.*]] +; IC4VF1: [[LOOP]]: +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[LOOP]] ], [ [[RDX_START]], %[[ENTRY]] ] +; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 +; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 +; IC4VF1-NEXT: [[GEP_B_IV:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_NEXT]] +; IC4VF1-NEXT: [[LD_B:%.*]] = load i64, ptr [[GEP_B_IV]], align 8 +; IC4VF1-NEXT: [[CMP_A_B:%.*]] = icmp sgt i64 [[LD_A]], [[LD_B]] +; IC4VF1-NEXT: [[COND]] = select i1 [[CMP_A_B]], i64 [[IV_NEXT]], i64 [[RDX]] +; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp ugt i64 [[IV]], 1 +; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; IC4VF1: [[EXIT]]: +; IC4VF1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[LOOP]] ] +; IC4VF1-NEXT: ret i64 [[COND_LCSSA]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll index 21ef1885b75b9..377afaea8f7fd 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll @@ -145,10 +145,44 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_nuw( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -157,9 +191,9 @@ define i64 @select_icmp_nuw(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -186,10 +220,44 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-LABEL: define i64 @select_icmp_noflag( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[II:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[II]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP6]], <4 x i1> [[TMP5]], i64 [[TMP8]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[II]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[II]], %[[ENTRY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -198,9 +266,9 @@ define i64 @select_icmp_noflag(ptr %a, ptr %b, i64 %ii, i64 %n) { ; CHECK-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[INC]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[COND_LCSSA]] ; entry: @@ -229,4 +297,8 @@ exit: ; preds = %for.body ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll index 72ed6537ef640..88bb91efa0410 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-non-const-iv-start.ll @@ -1,32 +1,156 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC4 define i64 @select_non_const_iv_start_signed_guard(ptr %a, i64 %rdx_start, i64 %iv_start ,i64 %n) { -; CHECK-LABEL: define i64 @select_non_const_iv_start_signed_guard( -; CHECK-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] -; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i64 [[IDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i64 [[IDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_START]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = add i64 [[IV_START]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IV_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IV_START]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], splat (i64 3) +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP11]], 3 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i64 [[IDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i64 @select_non_const_iv_start_signed_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[RDX_START:%.*]], i64 [[IV_START:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i64 [[IV_START]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[IV_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i64 [[TMP0]], 3 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i64 [[IV]], i64 [[RDX_07]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i64 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i64 [[IDX_0_LCSSA]] ; entry: %guard = icmp slt i64 %iv_start, %n @@ -49,32 +173,162 @@ exit: } define i32 @select_trunc_non_const_iv_start_signed_guard(ptr %a, i32 %rdx_start, i32 %iv_start ,i32 %n) { -; CHECK-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( -; CHECK-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] -; CHECK-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] -; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] -; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: ret i32 [[IDX_0_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC1: [[FOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3 +; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: br label %[[EXIT]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[IDX_0_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[IV_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP8]], <4 x i1> [[TMP7]], i32 [[TMP10]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF4IC4: [[FOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 3 +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP13]], i32 [[RDX_07]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: br label %[[EXIT]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[IDX_0_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_trunc_non_const_iv_start_signed_guard( +; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[RDX_START:%.*]], i32 [[IV_START:%.*]], i32 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[GUARD:%.*]] = icmp slt i32 [[IV_START]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[GUARD]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-VF1IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF1IC4-NEXT: [[TMP0:%.*]] = sext i32 [[IV_START]] to i64 +; CHECK-VF1IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64 +; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-VF1IC4: [[FOR_BODY]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[TMP0]], %[[FOR_BODY_PREHEADER]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX_07:%.*]] = phi i32 [ [[RDX_START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 3 +; CHECK-VF1IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF1IC4-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[RDX_07]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF1IC4: [[EXIT_LOOPEXIT]]: +; CHECK-VF1IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: br label %[[EXIT]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[IDX_0_LCSSA:%.*]] = phi i32 [ [[RDX_START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[IDX_0_LCSSA]] ; entry: %guard = icmp slt i32 %iv_start, %n @@ -101,3 +355,18 @@ exit: %idx.0.lcssa = phi i32 [ %rdx_start, %entry ], [ %cond, %for.body ] ret i32 %idx.0.lcssa } +;. +; CHECK-VF4IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. +; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index 45c2abd43c36a..839ea7ce7e7a4 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -674,65 +674,125 @@ exit: ; preds = %for.body ; Negative tests -; This test can theoretically be vectorized, but only with a runtime-check. -; The construct that are introduced by IndVarSimplify is: +; This test can theoretically be vectorized as a FindLastIV reduction, but only +; with a runtime-check. It will vectorize as a generic FindLast reduction. +; +; For FindLastIV, the construct that are introduced by IndVarSimplify is: ; %1 = trunc i64 %iv to i32 ; However, the loop guard is an i64: ; %cmp.sgt = icmp sgt i64 %n, 0 ; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the ; sentinel value), and need a runtime-check to vectorize this case. -define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { -; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +define i32 @select_icmp_const_truncated_iv_unwidened_exit(ptr %a, i64 %n) { +; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 ; CHECK-VF4IC1-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 ; CHECK-VF4IC4-NEXT: br i1 [[CMP_SGT]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP8]], 3 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP9]], i32 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unwidened_exit( +; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unwidened_exit( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { ; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF1IC4-NEXT: [[CMP_SGT:%.*]] = icmp sgt i64 [[N]], 0 @@ -778,67 +838,127 @@ exit: ; preds = %for.body, %entry ret i32 %rdx.lcssa } -; This test can theoretically be vectorized, but only with a runtime-check. -; The construct that are introduced by IndVarSimplify is: +; This test can theoretically be vectorized as a FindLastIV reduction, but only +; with a runtime-check. It will vectorize as a generic FindLast reduction. +; +; For FindLastIV, the construct that are introduced by IndVarSimplify is: ; %1 = trunc i64 %iv to i32 ; However, the loop guard is unsigned: ; %cmp.not = icmp eq i32 %n, 0 ; We cannot guarantee that %iv won't overflow an i32 value (and hence hit the ; sentinel value), and need a runtime-check to vectorize this case. -define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { -; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard(ptr %a, i32 %n) { +; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 ; CHECK-VF4IC1-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 ; CHECK-VF4IC4-NEXT: br i1 [[CMP_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 331, %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[FOR_BODY_PREHEADER]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP1]], i32 [[RDX]] +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP8]], 3 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP1]], i32 [[TMP9]], i32 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT_LCSSA]], %[[EXIT_LOOPEXIT]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[RDX_LCSSA]] ; -; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_const_truncated_iv_unsigned_loop_guard( +; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_unsigned_loop_guard( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]], i32 [[N:%.*]]) { ; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF1IC4-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[N]], 0 @@ -899,41 +1019,61 @@ exit: ; preds = %for.body, %entry define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: -; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-NEXT: ret i32 [[TMP7]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: -; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 4294967294, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 331), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = add i64 4294967294, [[INDEX]] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 9223372036854775806 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP1]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9223372032559808512 +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 331) +; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC4-NEXT: ret i32 [[TMP7]] ; ; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_icmp_truncated_iv_out_of_bound( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { @@ -980,44 +1120,112 @@ define i32 @not_vectorized_select_iv_icmp_no_guard(ptr %a, ptr %b, i32 %start, i ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[START:%.*]], i32 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 [[TMP8]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP]], i32 [[TMP2]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i32 @not_vectorized_select_iv_icmp_no_guard( @@ -1071,38 +1279,84 @@ exit: ; preds = %for.body define i32 @not_vectorized_select_fcmp_invalid_const_ub(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC1-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1) +; CHECK-VF4IC1-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK-VF4IC1: [[SCALAR_PH]]: ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] ; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i32 @not_vectorized_select_fcmp_invalid_const_ub( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -1), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], zeroinitializer +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-VF4IC4-NEXT: [[TMP4]] = select i1 [[TMP3]], <4 x i1> [[TMP8]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP3]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2147483648 +; CHECK-VF4IC4-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP5]], <4 x i1> [[TMP4]], i32 -1) +; CHECK-VF4IC4-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK-VF4IC4: [[SCALAR_PH]]: ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ 2147483648, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[TMP7]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP0]], 0.000000e+00 -; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = trunc i64 [[IV1]] to i32 ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP1]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483649 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] @@ -1156,22 +1410,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC1-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC1: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16 ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF4IC1: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: br label %[[EXIT]] ; CHECK-VF4IC1: [[EXIT]]: ; CHECK-VF4IC1-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] @@ -1184,22 +1472,56 @@ define i16 @not_vectorized_select_iv_icmp_overflow_unwidened_tripcount(ptr %a, p ; CHECK-VF4IC4-NEXT: br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK-VF4IC4: [[FOR_BODY_PREHEADER]]: ; CHECK-VF4IC4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i16> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC4-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i16> [[VEC_IND]], <4 x i16> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> [[TMP6]], <4 x i1> [[TMP5]], i16 [[TMP8]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[FOR_BODY_PREHEADER]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[START]], %[[FOR_BODY_PREHEADER]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV1]] ; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]] -; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV]] to i16 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = trunc i64 [[IV1]] to i16 ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP3]], i16 [[TMP2]], i16 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[WIDE_TRIP_COUNT]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF4IC4: [[EXIT_LOOPEXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i16 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: br label %[[EXIT]] ; CHECK-VF4IC4: [[EXIT]]: ; CHECK-VF4IC4-NEXT: [[RDX_0_LCSSA:%.*]] = phi i16 [ [[START]], %[[ENTRY]] ], [ [[COND_LCSSA]], %[[EXIT_LOOPEXIT]] ] diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll index a071949f82062..e3df1d3c39cca 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp.ll @@ -1948,16 +1948,52 @@ exit: ; preds = %for.body } ; The sentinel value for increasing-IV vectorization is -LONG_MAX, and since -; the IV hits this value, it is impossible to vectorize this case. +; the IV hits this value, it is vectorized as a generic last-active reduction. define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx.start, i64 %n) { ; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] @@ -1967,19 +2003,55 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. ; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 ; CHECK-VF4IC1-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = add i64 -9223372036854775808, [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP12]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) +; CHECK-VF4IC4-NEXT: [[TMP6]] = select i1 [[TMP5]], <4 x i1> [[TMP3]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP5]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP7]], <4 x i1> [[TMP6]], i64 [[TMP9]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ -9223372036854775808, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ -9223372036854775808, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV_J:%.*]] = phi i64 [ [[INC3:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[IV_I:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_I]] ; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV_I]] @@ -1989,9 +2061,9 @@ define i64 @not_vectorized_select_icmp_iv_out_of_bound(ptr %a, ptr %b, i64 %rdx. ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV_I]], 1 ; CHECK-VF4IC4-NEXT: [[INC3]] = add nsw i64 [[IV_J]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_iv_out_of_bound( @@ -2042,10 +2114,50 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]] +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0 +; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]] +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC1-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]]) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -2054,18 +2166,58 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC1-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC1-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF4IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[IVSTART:%.*]], i64 [[RDX_START:%.*]], i64 [[N:%.*]]) { ; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = sub i64 [[N]], [[IVSTART]] +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP12]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP12]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP12]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = add i64 [[IVSTART]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[RDX_START]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[IVSTART]], i64 0 +; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC4-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i64> [[BROADCAST_SPLAT2]], +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT]], %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[IVSTART]], [[INDEX]] +; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[OFFSET_IDX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-VF4IC4-NEXT: [[TMP7]] = select i1 [[TMP6]], <4 x i1> [[TMP4]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP8]] = select i1 [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BROADCAST_SPLAT]], i32 0 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> [[TMP8]], <4 x i1> [[TMP7]], i64 [[TMP10]]) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP12]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ [[IVSTART]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[RDX_START]], %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[IVSTART]], %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[RDX_START]], %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i64 [ [[COND:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] @@ -2074,9 +2226,9 @@ define i64 @not_vectorized_select_icmp_non_const_iv_start_value(ptr %a, ptr %b, ; CHECK-VF4IC4-NEXT: [[COND]] = select i1 [[CMP2]], i64 [[IV]], i64 [[RDX]] ; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 ; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[COND_LCSSA:%.*]] = phi i64 [ [[COND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-VF4IC4-NEXT: ret i64 [[COND_LCSSA]] ; ; CHECK-VF1IC4-LABEL: define i64 @not_vectorized_select_icmp_non_const_iv_start_value( diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll index 2b352abe9f7a1..e19ebb9a3251c 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll @@ -1128,27 +1128,124 @@ exit: ; preds = %loop ret float %sel } -; We don't support selecting loop-variant values. define i32 @select_variant_i32_from_icmp(ptr %v1, ptr %v2, i64 %n) { -; CHECK-LABEL: define i32 @select_variant_i32_from_icmp( -; CHECK-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 -; CHECK-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 -; CHECK-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 -; CHECK-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] -; CHECK-NEXT: ret i32 [[SEL_LCSSA]] +; CHECK-VF4IC1-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF4IC1-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC1: [[VECTOR_PH]]: +; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC1: [[VECTOR_BODY]]: +; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]] +; CHECK-VF4IC1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = freeze <4 x i1> [[TMP2]] +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) +; CHECK-VF4IC1-NEXT: [[TMP5]] = select i1 [[TMP4]], <4 x i1> [[TMP2]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC1-NEXT: [[TMP6]] = select i1 [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD1]] +; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC1: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP6]], <4 x i1> [[TMP5]], i32 3) +; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC1: [[SCALAR_PH]]: +; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC1-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC1: [[LOOP]]: +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC1-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF4IC1-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF4IC1-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF4IC1-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF4IC1-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF4IC1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC1: [[EXIT]]: +; CHECK-VF4IC1-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC1-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF4IC4-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF4IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-VF4IC4: [[VECTOR_PH]]: +; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-VF4IC4: [[VECTOR_BODY]]: +; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 3), %[[VECTOR_PH]] ], [ [[TMP27:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[LAST_ACTIVE_MASK:%.*]] = phi <4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[INDEX]] +; CHECK-VF4IC4-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD9]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP11]] +; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) +; CHECK-VF4IC4-NEXT: [[TMP23]] = select i1 [[TMP19]], <4 x i1> [[TMP11]], <4 x i1> [[LAST_ACTIVE_MASK]] +; CHECK-VF4IC4-NEXT: [[TMP27]] = select i1 [[TMP19]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD13]] +; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF4IC4: [[MIDDLE_BLOCK]]: +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> [[TMP27]], <4 x i1> [[TMP23]], i32 3) +; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-VF4IC4: [[SCALAR_PH]]: +; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 3, %[[ENTRY]] ] +; CHECK-VF4IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF4IC4: [[LOOP]]: +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF4IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF4IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF4IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF4IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF4IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF4IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF4IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF4IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-VF4IC4: [[EXIT]]: +; CHECK-VF4IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-VF4IC4-NEXT: ret i32 [[SEL_LCSSA]] +; +; CHECK-VF1IC4-LABEL: define i32 @select_variant_i32_from_icmp( +; CHECK-VF1IC4-SAME: ptr [[V1:%.*]], ptr [[V2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: br label %[[LOOP:.*]] +; CHECK-VF1IC4: [[LOOP]]: +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 3, %[[ENTRY]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: [[GEP_V1_IV:%.*]] = getelementptr inbounds i32, ptr [[V1]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[LOAD_V1_IV:%.*]] = load i32, ptr [[GEP_V1_IV]], align 4 +; CHECK-VF1IC4-NEXT: [[GEP_V2_IV:%.*]] = getelementptr inbounds i32, ptr [[V2]], i64 [[IV]] +; CHECK-VF1IC4-NEXT: [[LOAD_V2_IV:%.*]] = load i32, ptr [[GEP_V2_IV]], align 4 +; CHECK-VF1IC4-NEXT: [[CMP_V1_IV_3:%.*]] = icmp eq i32 [[LOAD_V1_IV]], 3 +; CHECK-VF1IC4-NEXT: [[SEL]] = select i1 [[CMP_V1_IV_3]], i32 [[RDX]], i32 [[LOAD_V2_IV]] +; CHECK-VF1IC4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-VF1IC4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-VF1IC4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-VF1IC4: [[EXIT]]: +; CHECK-VF1IC4-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ] +; CHECK-VF1IC4-NEXT: ret i32 [[SEL_LCSSA]] ; entry: br label %loop @@ -1220,6 +1317,8 @@ exit: ; preds = %loop ; CHECK-VF4IC1: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK-VF4IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK-VF4IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. ; CHECK-VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -1235,6 +1334,8 @@ exit: ; preds = %loop ; CHECK-VF4IC4: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} ; CHECK-VF4IC4: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} ; CHECK-VF4IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; CHECK-VF4IC4: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; CHECK-VF4IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} ;. ; CHECK-VF1IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK-VF1IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}